1 /* $Id: zvrank.c,v 1.12 2004-10-28 10:37:15 heikki Exp $
2 Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
24 Zvrank: an experimental ranking algorithm. See doc/zvrank.txt and
25 source in index/zvrank.c. Enable this by using rank: zvrank in zebra.cfg.
26 Contributed by Johannes Leveling <Johannes.Leveling at
30 /* Zebra Vector Space Model RANKing
32 ** six (seven) letter identifier for weighting scheme
33 ** best document weighting:
34 ** tfc nfc (tpc npc) [original naming]
35 ** ntc atc npc apc [SMART naming, used here]
36 ** best query weighting:
37 ** nfx tfx bfx (npx tpx bpx) [original naming]
38 ** atn ntn btn apn npn bpn [SMART naming]
39 ** -> should set zvrank.weighting-scheme to one of
40 ** "ntc-atn", "atc-atn", etc.
44 #include <math.h> /* for log */
56 static double blog(double x) {
57 /* log_2, log_e or log_10 is used, best to change it here if necessary */
60 return log(x); /* / log(base) */
65 struct rank_class_info {
66 char rscheme[8]; /* name of weighting scheme */
70 struct rs_info { /* for result set */
71 int db_docs; /* number of documents in database (collection) */
72 int db_terms; /* number of distinct terms in database (debugging?) */
73 int db_f_max; /* maximum of f_t in database (debugging?) */
74 char *db_f_max_str; /* string (most frequent term) - for debugging */
76 char rscheme[8]; /* name of weighting scheme */
80 void (*d_tf_fct)(void *, void *); /* doc term frequency function */
81 void (*d_idf_fct)(void *, void *); /* doc idf function */
82 void (*d_norm_fct)(void *, void *); /* doc normalization function */
84 void (*q_tf_fct)(void *, void *); /* query term frequency function */
85 void (*q_idf_fct)(void *, void *); /* query idf function */
86 void (*q_norm_fct)(void *, void *); /* query normalization function */
88 double (*sim_fct)(void *, void *); /* similarity function (scoring function) */
92 typedef struct rs_info *RS;
94 static void prn_rs(RS rs) { /* for debugging */
95 yaz_log(LOG_DEBUG, "* RS:");
96 yaz_log(LOG_DEBUG, " db_docs: %d", rs->db_docs);
97 yaz_log(LOG_DEBUG, " db_terms: %d", rs->db_terms);
98 yaz_log(LOG_DEBUG, " f_max: %d", rs->db_f_max);
99 yaz_log(LOG_DEBUG, " f_max_str: %s", rs->db_f_max_str);
100 yaz_log(LOG_DEBUG, " veclen: %d", rs->veclen);
101 /* rscheme implies functions */
102 yaz_log(LOG_DEBUG, " rscheme: %s", rs->rscheme);
106 struct ds_info { /* document info */
107 char *docid; /* unique doc identifier */
108 int docno; /* doc number */
109 int doclen; /* document length */
110 int d_f_max; /* maximum number of any term in doc (needed) */
111 char *d_f_max_str; /* most frequent term in d - for debugging */
112 int veclen; /* vector length */
113 struct ts_info *terms;
114 double docsim; /* similarity in [0, ..., 1] (= score/1000) */
116 typedef struct ds_info* DS;
119 static void prn_ds(DS ds) { /* for debugging */
120 yaz_log(LOG_DEBUG, " * DS:");
121 yaz_log(LOG_DEBUG, " docid: %s", ds->docid);
122 yaz_log(LOG_DEBUG, " docno: %d", ds->docno);
123 yaz_log(LOG_DEBUG, " doclen: %d", ds->doclen);
124 yaz_log(LOG_DEBUG, " d_f_max: %d", ds->d_f_max);
125 yaz_log(LOG_DEBUG, " d_f_max_str:%s", ds->d_f_max_str);
126 yaz_log(LOG_DEBUG, " veclen: %d", ds->veclen);
131 struct ts_info { /* term info */
141 typedef struct ts_info *TS;
144 static void prn_ts(TS ts) { /* for debugging */
145 yaz_log(LOG_DEBUG, " * TERM:%s gocc:%d locc:%d tf:%f idf:%f wt:%f",
146 ts->name, ts->gocc, ts->locc, ts->tf, ts->idf, ts->wt);
156 ** weighting functions
157 ** check: RS is not needed anymore
160 /* calculate and store new term frequency vector */
161 static void tf_none(void *rsi, void *dsi) {
164 /* no conversion. 1 <= tf */
166 for (i=0; i < veclen; i++) {
167 freq=ds->terms[i].locc;
168 ds->terms[i].tf=freq;
173 static void tf_binary(void *rsi, void *dsi) {
178 for (i=0; i < veclen; i++) {
179 freq=ds->terms[i].locc;
188 static void tf_max_norm(void *rsi, void *dsi) {
192 /* divide each term by max, so 0 <= tf <= 1 */
193 tf_max=ds->d_f_max; /* largest frequency of t in document */
195 for (i=0; i < veclen; i++) {
196 freq=ds->terms[i].locc;
199 ds->terms[i].tf=freq/tf_max;
206 static void tf_aug_norm(void *rsi, void *dsi) {
211 /* augmented normalized tf. 0.5 <= tf <= 1 for K = 0.5 */
212 tf_max=ds->d_f_max; /* largest frequency of t in document */
214 K=0.5; /* zvrank.const-K */
215 for (i=0; i < veclen; i++) {
216 freq=ds->terms[i].locc;
219 ds->terms[i].tf=K+(1.0-K)*(freq/tf_max);
226 static void tf_square(void *rsi, void *dsi) {
231 for (i=0; i < veclen; i++) {
232 freq=ds->terms[i].locc;
234 ds->terms[i].tf=freq*freq;
241 static void tf_log(void *rsi, void *dsi) {
246 for (i=0; i < veclen; i++) {
247 freq=ds->terms[i].locc;
249 ds->terms[i].tf=1.0+blog(freq);
256 /* calculate and store inverse document frequency vector */
257 static void idf_none(void *rsi, void *dsi) {
262 for (i=0; i < veclen; i++) {
263 ds->terms[i].idf=1.0;
268 static void idf_tfidf(void *rsi, void *dsi) {
274 /* normal tfidf weight */
276 num_docs=rs->db_docs;
277 for (i=0; i < veclen; i++) {
278 gocc=ds->terms[i].gocc;
282 idf=blog((double) (num_docs/gocc));
283 ds->terms[i].idf=idf;
288 static void idf_prob(void *rsi, void *dsi) {
294 /* probabilistic formulation */
296 num_docs=rs->db_docs;
297 for (i=0; i < veclen; i++) {
298 gocc=ds->terms[i].gocc;
302 idf=blog((double) ((num_docs-gocc)/gocc));
303 ds->terms[i].idf=idf;
308 static void idf_freq(void *rsi, void *dsi) {
314 /* frequency formulation */
316 num_docs=rs->db_docs;
321 for (i=0; i < veclen; i++) {
322 ds->terms[i].idf=idf;
327 static void idf_squared(void *rsi, void *dsi) {
335 num_docs=rs->db_docs;
336 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
337 for (i=0; i < veclen; i++) {
338 gocc=ds->terms[i].gocc;
342 idf=blog(num_docs/gocc);
344 ds->terms[i].idf=idf;
349 /* calculate and store normalized weight (tf-idf) vector */
350 static void norm_none(void *rsi, void *dsi) {
353 /* no normalization */
355 for (i=0; i < veclen; i++) {
356 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
361 static void norm_sum(void *rsi, void *dsi) {
367 for (i=0; i < veclen; i++) {
368 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
369 tfs+=ds->terms[i].wt;
372 for (i=0; i < veclen; i++) {
373 ds->terms[i].wt=ds->terms[i].wt/tfs;
375 /* else: tfs==0 && ds->terms[i].wt==0 */
379 static void norm_cosine(void *rsi, void *dsi) {
385 for (i=0; i < veclen; i++) {
386 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
387 tfs+=(ds->terms[i].wt*ds->terms[i].wt);
391 for (i=0; i < veclen; i++) {
392 ds->terms[i].wt=ds->terms[i].wt/tfs;
394 /* else: tfs==0 && ds->terms[i].wt==0 */
398 static void norm_fourth(void *rsi, void *dsi) {
404 for (i=0; i < veclen; i++) {
405 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
406 fr=(ds->terms[i].wt*ds->terms[i].wt);
411 for (i=0; i < veclen; i++) {
412 ds->terms[i].wt=ds->terms[i].wt/tfs;
414 /* else: tfs==0 && ds->terms[i].wt==0 */
418 static void norm_max(void *rsi, void *dsi) {
424 for (i=0; i < veclen; i++) {
425 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
426 if (ds->terms[i].wt > tfm)
430 for (i=0; i < veclen; i++) {
431 ds->terms[i].wt=ds->terms[i].wt/tfm;
433 /* else: tfs==0 && ds->terms[i].wt==0 */
437 /* add: norm_pivot, ... */
439 static double sim_cosine(void *dsi1, void *dsi2) {
443 double smul=0.0, sdiv=0.0, sqr11=0.0, sqr22=0.0;
446 veclen=ds1->veclen; /* and ds2->veclen */
447 for (i=0; i < veclen; i++) {
454 sdiv=sqrt(sqr11*sqr22);
460 /* add: norm_jaccard, norm_dice, ... */
462 /* end weighting functions */
466 static void zv_init_scheme(RS rs, const char *sname) {
468 char c0, c1, c2, c3, c4, c5, c6;
469 const char *def_rscheme="ntc-atn"; /* a good default */
471 yaz_log(LOG_DEBUG, "zv_init_scheme");
474 yaz_log(LOG_LOG, "zvrank: invalid weighting-scheme \"%s\"", sname);
475 if (slen > 0) c0=sname[0]; else c0=def_rscheme[0];
476 if (slen > 1) c1=sname[1]; else c1=def_rscheme[1];
477 if (slen > 2) c2=sname[2]; else c2=def_rscheme[2];
479 if (slen > 4) c4=sname[4]; else c4=def_rscheme[4];
480 if (slen > 5) c5=sname[5]; else c5=def_rscheme[5];
481 if (slen > 6) c6=sname[6]; else c6=def_rscheme[6];
483 /* assign doc functions */
486 rs->d_tf_fct=tf_binary;
490 rs->d_tf_fct=tf_max_norm;
492 yaz_log(LOG_DEBUG, "tf_max_norm: d_f_max required");
495 rs->d_tf_fct=tf_aug_norm;
497 yaz_log(LOG_DEBUG, "tf_aug_norm: d_f_max required");
500 rs->d_tf_fct=tf_square;
508 rs->d_tf_fct=tf_none;
513 rs->d_idf_fct=idf_tfidf;
515 yaz_log(LOG_DEBUG, "idf_tfidf: db_docs required");
518 rs->d_idf_fct=idf_prob;
520 yaz_log(LOG_DEBUG, "idf_prob: db_docs required");
523 rs->d_idf_fct=idf_freq;
525 yaz_log(LOG_DEBUG, "idf_freq: db_docs required");
528 rs->d_idf_fct=idf_squared;
530 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
533 rs->d_idf_fct=idf_none;
538 rs->d_norm_fct=norm_sum;
542 rs->d_norm_fct=norm_cosine;
546 rs->d_norm_fct=norm_fourth;
550 rs->d_norm_fct=norm_max;
554 rs->d_norm_fct=norm_none;
559 /* assign query functions */
562 rs->q_tf_fct=tf_binary;
566 rs->q_tf_fct=tf_max_norm;
567 yaz_log(LOG_DEBUG, "tf_max_norm: d_f_max required");
571 rs->q_tf_fct=tf_aug_norm;
573 yaz_log(LOG_DEBUG, "tf_aug_norm: d_f_max required");
576 rs->q_tf_fct=tf_square;
584 rs->q_tf_fct=tf_none;
589 rs->q_idf_fct=idf_tfidf;
591 yaz_log(LOG_DEBUG, "idf_tfidf: db_docs required");
594 rs->q_idf_fct=idf_prob;
596 yaz_log(LOG_DEBUG, "idf_prob: db_docs required");
599 rs->q_idf_fct=idf_freq;
601 yaz_log(LOG_DEBUG, "idf_freq: db_docs required");
604 rs->q_idf_fct=idf_squared;
606 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
609 rs->q_idf_fct=idf_none;
614 rs->q_norm_fct=norm_sum;
618 rs->q_norm_fct=norm_cosine;
622 rs->q_norm_fct=norm_fourth;
626 rs->q_norm_fct=norm_max;
630 rs->q_norm_fct=norm_none;
635 rs->sim_fct=sim_cosine;
636 yaz_log(LOG_DEBUG, "zv_scheme %s", rs->rscheme);
640 static void zv_init(RS rs, const char *rscheme) {
641 yaz_log(LOG_DEBUG, "zv_init");
643 rs->db_docs=100000; /* assign correct value here */
644 rs->db_terms=500000; /* assign correct value here (for debugging) */
645 rs->db_f_max=50; /* assign correct value here */
646 rs->db_f_max_str="a"; /* assign correct value here (for debugging) */
647 /* FIXME - get those values from somewhere */
648 zv_init_scheme(rs, rscheme);
655 * zv_create: Creates/Initialises this rank handler. This routine is
656 * called exactly once. The routine returns the class_handle.
658 static void *zv_create (ZebraHandle zh) {
662 struct rank_class_info *ci = (struct rank_class_info *)
663 xmalloc (sizeof(*ci));
664 yaz_log(LOG_DEBUG, "zv_create");
665 wscheme=res_get_def(res, "zvrank.weighting-scheme", "");
666 for (i=0; wscheme[i] && i < 8; i++)
667 ci->rscheme[i]=wscheme[i];
668 ci->rscheme[i] = '\0';
673 * zv_destroy: Destroys this rank handler. This routine is called
674 * when the handler is no longer needed - i.e. when the server
675 * dies. The class_handle was previously returned by create.
677 static void zv_destroy (struct zebra_register *reg, void *class_handle) {
678 struct rank_class_info *ci = (struct rank_class_info *) class_handle;
679 yaz_log(LOG_DEBUG, "zv_destroy");
685 * zv_begin: Prepares beginning of "real" ranking. Called once for
686 * each result set. The returned handle is a "set handle" and
687 * will be used in each of the handlers below.
689 static void *zv_begin(struct zebra_register *reg, void *class_handle,
690 RSET rset, NMEM nmem, TERMID *terms, int numterms)
692 struct rs_info *rs=(struct rs_info *)nmem_malloc(nmem,sizeof(*rs));
693 struct rank_class_info *ci=(struct rank_class_info *)class_handle;
699 yaz_log(LOG_DEBUG, "zv_begin");
701 zv_init(rs, ci->rscheme);
706 rs->qdoc=(struct ds_info *)nmem_malloc(nmem,sizeof(*rs->qdoc));
707 rs->qdoc->terms=(struct ts_info *)nmem_malloc(nmem,
708 sizeof(*rs->qdoc->terms)*rs->veclen);
709 rs->qdoc->veclen=veclen;
710 rs->qdoc->d_f_max=1; /* no duplicates */
711 rs->qdoc->d_f_max_str="";
713 rs->rdoc=(struct ds_info *)nmem_malloc(nmem,sizeof(*rs->rdoc));
714 rs->rdoc->terms=(struct ts_info *)nmem_malloc(nmem,
715 sizeof(*rs->rdoc->terms)*rs->veclen);
716 rs->rdoc->veclen=veclen;
717 rs->rdoc->d_f_max=10; /* just a guess */
718 rs->rdoc->d_f_max_str="";
719 /* yaz_log(LOG_DEBUG, "zv_begin_init"); */
720 for (i = 0; i < rs->veclen; i++)
722 gocc= rset_count(terms[i]->rset);
723 terms[i]->rankpriv=ip=nmem_malloc(nmem, sizeof(int));
724 *ip=i; /* save the index for add() */
725 /* yaz_log(LOG_DEBUG, "zv_begin_init i=%d gocc=%d", i, gocc); */
726 rs->qdoc->terms[i].gocc=gocc;
727 rs->qdoc->terms[i].locc=1; /* assume query has no duplicate terms */
728 rs->rdoc->terms[i].gocc=gocc;
729 rs->rdoc->terms[i].locc=0;
731 (*rs->q_tf_fct)(rs, rs->qdoc); /* we do this once only */
732 (*rs->q_idf_fct)(rs, rs->qdoc);
733 (*rs->q_norm_fct)(rs, rs->qdoc);
738 * zv_end: Terminates ranking process. Called after a result set
741 static void zv_end (struct zebra_register *reg, void *rsi)
743 yaz_log(LOG_DEBUG, "zv_end");
744 /* they all are nmem'd */
749 * zv_add: Called for each word occurence in a result set. This routine
750 * should be as fast as possible. This routine should "incrementally"
753 static void zv_add (void *rsi, int seqno, TERMID term) {
755 int *ip = term->rankpriv;
757 rs->rdoc->terms[i].locc++;
758 yaz_log(LOG_DEBUG, "zvrank zv_add seqno=%d '%s' term_index=%d cnt=%d",
759 seqno, term->name, i, rs->rdoc->terms[i].locc );
763 * zv_calc: Called for each document in a result. This handler should
764 * produce a score based on previous call(s) to the add handler. The
765 * score should be between 0 and 1000. If score cannot be obtained
766 * -1 should be returned.
768 static int zv_calc (void *rsi, zint sysno)
774 /* yaz_log(LOG_DEBUG, "zv_calc"); */
779 for (i = 0; i < veclen; i++) {
780 /* qdoc weight has already been calculated */
781 (*rs->d_tf_fct)(rs, rs->rdoc);
782 (*rs->d_idf_fct)(rs, rs->rdoc);
783 (*rs->d_norm_fct)(rs, rs->rdoc);
784 dscore=rs->sim_fct(rs->qdoc, rs->rdoc);
786 score = (int) (dscore * 1000 +.5);
787 yaz_log (LOG_DEBUG, "zv_calc: sysno=" ZINT_FORMAT " score=%d",
789 if (score > 1000) /* should not happen */
795 * Pseudo-meta code with sequence of calls as they occur in a
796 * server. Handlers are prefixed by --:
812 static struct rank_control rank_control_vsm = {
822 struct rank_control *rankzv_class = &rank_control_vsm;