1 /* $Id: zvrank.c,v 1.7 2004-06-13 18:44:57 adam Exp $
2 Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
24 Zvrank: an experimental ranking algorithm. See doc/zvrank.txt and
25 source in index/zvrank.c. Enable this by using rank: zvrank in zebra.cfg.
26 Contributed by Johannes Leveling <Johannes.Leveling at
30 /* Zebra Vector Space Model RANKing
32 ** six (seven) letter identifier for weighting scheme
33 ** best document weighting:
34 ** tfc nfc (tpc npc) [original naming]
35 ** ntc atc npc apc [SMART naming, used here]
36 ** best query weighting:
37 ** nfx tfx bfx (npx tpx bpx) [original naming]
38 ** atn ntn btn apn npn bpn [SMART naming]
39 ** -> should set zvrank.weighting-scheme to one of
40 ** "ntc-atn", "atc-atn", etc.
43 #include <math.h> /* for log */
55 static double blog(double x) {
56 /* log_2, log_e or log_10 is used, best to change it here if necessary */
59 return log(x); /* / log(base) */
64 struct rank_class_info { /* now we need this */
66 char rscheme[8]; /* name of weighting scheme */
70 struct rs_info { /* for result set */
71 int db_docs; /* number of documents in database (collection) */
72 int db_terms; /* number of distinct terms in database (debugging?) */
73 int db_f_max; /* maximum of f_t in database (debugging?) */
74 char *db_f_max_str; /* string (most frequent term) - for debugging */
76 char rscheme[8]; /* name of weighting scheme */
79 void (*d_tf_fct)(void *, void *); /* doc term frequency function */
80 void (*d_idf_fct)(void *, void *); /* doc idf function */
81 void (*d_norm_fct)(void *, void *); /* doc normalization function */
83 void (*q_tf_fct)(void *, void *); /* query term frequency function */
84 void (*q_idf_fct)(void *, void *); /* query idf function */
85 void (*q_norm_fct)(void *, void *); /* query normalization function */
87 double (*sim_fct)(void *, void *); /* similarity function (scoring function) */
91 typedef struct rs_info *RS;
93 static void prn_rs(RS rs) { /* for debugging */
94 yaz_log(LOG_DEBUG, "* RS:");
95 yaz_log(LOG_DEBUG, " db_docs: %d", rs->db_docs);
96 yaz_log(LOG_DEBUG, " db_terms: %d", rs->db_terms);
97 yaz_log(LOG_DEBUG, " f_max: %d", rs->db_f_max);
98 yaz_log(LOG_DEBUG, " f_max_str: %s", rs->db_f_max_str);
99 yaz_log(LOG_DEBUG, " veclen: %d", rs->veclen);
100 /* rscheme implies functions */
101 yaz_log(LOG_DEBUG, " rscheme: %s", rs->rscheme);
105 struct ds_info { /* document info */
106 char *docid; /* unique doc identifier */
107 int docno; /* doc number */
108 int doclen; /* document length */
109 int d_f_max; /* maximum number of any term in doc (needed) */
110 char *d_f_max_str; /* most frequent term in d - for debugging */
111 int veclen; /* vector length */
112 struct ts_info *terms;
113 double docsim; /* similarity in [0, ..., 1] (= score/1000) */
115 typedef struct ds_info* DS;
118 static void prn_ds(DS ds) { /* for debugging */
119 yaz_log(LOG_DEBUG, " * DS:");
120 yaz_log(LOG_DEBUG, " docid: %s", ds->docid);
121 yaz_log(LOG_DEBUG, " docno: %d", ds->docno);
122 yaz_log(LOG_DEBUG, " doclen: %d", ds->doclen);
123 yaz_log(LOG_DEBUG, " d_f_max: %d", ds->d_f_max);
124 yaz_log(LOG_DEBUG, " d_f_max_str:%s", ds->d_f_max_str);
125 yaz_log(LOG_DEBUG, " veclen: %d", ds->veclen);
130 struct ts_info { /* term info */
140 typedef struct ts_info *TS;
143 static void prn_ts(TS ts) { /* for debugging */
144 yaz_log(LOG_DEBUG, " * TERM:%s gocc:%d locc:%d tf:%f idf:%f wt:%f",
145 ts->name, ts->gocc, ts->locc, ts->tf, ts->idf, ts->wt);
155 ** weighting functions
156 ** check: RS is not needed anymore
159 /* calculate and store new term frequency vector */
160 static void tf_none(void *rsi, void *dsi) {
163 /* no conversion. 1 <= tf */
165 for (i=0; i < veclen; i++) {
166 freq=ds->terms[i].locc;
167 ds->terms[i].tf=freq;
172 static void tf_binary(void *rsi, void *dsi) {
177 for (i=0; i < veclen; i++) {
178 freq=ds->terms[i].locc;
187 static void tf_max_norm(void *rsi, void *dsi) {
191 /* divide each term by max, so 0 <= tf <= 1 */
192 tf_max=ds->d_f_max; /* largest frequency of t in document */
194 for (i=0; i < veclen; i++) {
195 freq=ds->terms[i].locc;
198 ds->terms[i].tf=freq/tf_max;
205 static void tf_aug_norm(void *rsi, void *dsi) {
210 /* augmented normalized tf. 0.5 <= tf <= 1 for K = 0.5 */
211 tf_max=ds->d_f_max; /* largest frequency of t in document */
213 K=0.5; /* zvrank.const-K */
214 for (i=0; i < veclen; i++) {
215 freq=ds->terms[i].locc;
218 ds->terms[i].tf=K+(1.0-K)*(freq/tf_max);
225 static void tf_square(void *rsi, void *dsi) {
230 for (i=0; i < veclen; i++) {
231 freq=ds->terms[i].locc;
233 ds->terms[i].tf=freq*freq;
240 static void tf_log(void *rsi, void *dsi) {
245 for (i=0; i < veclen; i++) {
246 freq=ds->terms[i].locc;
248 ds->terms[i].tf=1.0+blog(freq);
255 /* calculate and store inverse document frequency vector */
256 static void idf_none(void *rsi, void *dsi) {
261 for (i=0; i < veclen; i++) {
262 ds->terms[i].idf=1.0;
267 static void idf_tfidf(void *rsi, void *dsi) {
273 /* normal tfidf weight */
275 num_docs=rs->db_docs;
276 for (i=0; i < veclen; i++) {
277 gocc=ds->terms[i].gocc;
281 idf=blog(num_docs/gocc);
282 ds->terms[i].idf=idf;
287 static void idf_prob(void *rsi, void *dsi) {
293 /* probabilistic formulation */
295 num_docs=rs->db_docs;
296 for (i=0; i < veclen; i++) {
297 gocc=ds->terms[i].gocc;
301 idf=blog((num_docs-gocc)/gocc);
302 ds->terms[i].idf=idf;
307 static void idf_freq(void *rsi, void *dsi) {
313 /* frequency formulation */
315 num_docs=rs->db_docs;
320 for (i=0; i < veclen; i++) {
321 ds->terms[i].idf=idf;
326 static void idf_squared(void *rsi, void *dsi) {
334 num_docs=rs->db_docs;
335 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
336 for (i=0; i < veclen; i++) {
337 gocc=ds->terms[i].gocc;
341 idf=blog(num_docs/gocc);
343 ds->terms[i].idf=idf;
348 /* calculate and store normalized weight (tf-idf) vector */
349 static void norm_none(void *rsi, void *dsi) {
352 /* no normalization */
354 for (i=0; i < veclen; i++) {
355 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
360 static void norm_sum(void *rsi, void *dsi) {
366 for (i=0; i < veclen; i++) {
367 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
368 tfs+=ds->terms[i].wt;
371 for (i=0; i < veclen; i++) {
372 ds->terms[i].wt=ds->terms[i].wt/tfs;
374 /* else: tfs==0 && ds->terms[i].wt==0 */
378 static void norm_cosine(void *rsi, void *dsi) {
384 for (i=0; i < veclen; i++) {
385 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
386 tfs+=(ds->terms[i].wt*ds->terms[i].wt);
390 for (i=0; i < veclen; i++) {
391 ds->terms[i].wt=ds->terms[i].wt/tfs;
393 /* else: tfs==0 && ds->terms[i].wt==0 */
397 static void norm_fourth(void *rsi, void *dsi) {
403 for (i=0; i < veclen; i++) {
404 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
405 fr=(ds->terms[i].wt*ds->terms[i].wt);
410 for (i=0; i < veclen; i++) {
411 ds->terms[i].wt=ds->terms[i].wt/tfs;
413 /* else: tfs==0 && ds->terms[i].wt==0 */
417 static void norm_max(void *rsi, void *dsi) {
423 for (i=0; i < veclen; i++) {
424 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
425 if (ds->terms[i].wt > tfm)
429 for (i=0; i < veclen; i++) {
430 ds->terms[i].wt=ds->terms[i].wt/tfm;
432 /* else: tfs==0 && ds->terms[i].wt==0 */
436 /* add: norm_pivot, ... */
438 static double sim_cosine(void *dsi1, void *dsi2) {
442 double smul=0.0, sdiv=0.0, sqr11=0.0, sqr22=0.0;
445 veclen=ds1->veclen; /* and ds2->veclen */
446 for (i=0; i < veclen; i++) {
453 sdiv=sqrt(sqr11*sqr22);
459 /* add: norm_jaccard, norm_dice, ... */
461 /* end weighting functions */
465 static void zv_init_scheme(RS rs, const char *sname) {
467 char c0, c1, c2, c3, c4, c5, c6;
468 const char *def_rscheme="ntc-atn"; /* a good default */
470 yaz_log(LOG_DEBUG, "zv_init_scheme");
473 yaz_log(LOG_LOG, "zvrank: invalid weighting-scheme \"%s\"", sname);
474 if (slen > 0) c0=sname[0]; else c0=def_rscheme[0];
475 if (slen > 1) c1=sname[1]; else c1=def_rscheme[1];
476 if (slen > 2) c2=sname[2]; else c2=def_rscheme[2];
478 if (slen > 4) c4=sname[4]; else c4=def_rscheme[4];
479 if (slen > 5) c5=sname[5]; else c5=def_rscheme[5];
480 if (slen > 6) c6=sname[6]; else c6=def_rscheme[6];
482 /* assign doc functions */
485 rs->d_tf_fct=tf_binary;
489 rs->d_tf_fct=tf_max_norm;
491 yaz_log(LOG_DEBUG, "tf_max_norm: d_f_max required");
494 rs->d_tf_fct=tf_aug_norm;
496 yaz_log(LOG_DEBUG, "tf_aug_norm: d_f_max required");
499 rs->d_tf_fct=tf_square;
507 rs->d_tf_fct=tf_none;
512 rs->d_idf_fct=idf_tfidf;
514 yaz_log(LOG_DEBUG, "idf_tfidf: db_docs required");
517 rs->d_idf_fct=idf_prob;
519 yaz_log(LOG_DEBUG, "idf_prob: db_docs required");
522 rs->d_idf_fct=idf_freq;
524 yaz_log(LOG_DEBUG, "idf_freq: db_docs required");
527 rs->d_idf_fct=idf_squared;
529 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
532 rs->d_idf_fct=idf_none;
537 rs->d_norm_fct=norm_sum;
541 rs->d_norm_fct=norm_cosine;
545 rs->d_norm_fct=norm_fourth;
549 rs->d_norm_fct=norm_max;
553 rs->d_norm_fct=norm_none;
558 /* assign query functions */
561 rs->q_tf_fct=tf_binary;
565 rs->q_tf_fct=tf_max_norm;
566 yaz_log(LOG_DEBUG, "tf_max_norm: d_f_max required");
570 rs->q_tf_fct=tf_aug_norm;
572 yaz_log(LOG_DEBUG, "tf_aug_norm: d_f_max required");
575 rs->q_tf_fct=tf_square;
583 rs->q_tf_fct=tf_none;
588 rs->q_idf_fct=idf_tfidf;
590 yaz_log(LOG_DEBUG, "idf_tfidf: db_docs required");
593 rs->q_idf_fct=idf_prob;
595 yaz_log(LOG_DEBUG, "idf_prob: db_docs required");
598 rs->q_idf_fct=idf_freq;
600 yaz_log(LOG_DEBUG, "idf_freq: db_docs required");
603 rs->q_idf_fct=idf_squared;
605 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
608 rs->q_idf_fct=idf_none;
613 rs->q_norm_fct=norm_sum;
617 rs->q_norm_fct=norm_cosine;
621 rs->q_norm_fct=norm_fourth;
625 rs->q_norm_fct=norm_max;
629 rs->q_norm_fct=norm_none;
634 rs->sim_fct=sim_cosine;
635 yaz_log(LOG_DEBUG, "zv_scheme %s", rs->rscheme);
639 static void zv_init(RS rs, const char *rscheme) {
640 yaz_log(LOG_DEBUG, "zv_init");
642 rs->db_docs=100000; /* assign correct value here */
643 rs->db_terms=500000; /* assign correct value here (for debugging) */
644 rs->db_f_max=50; /* assign correct value here */
645 rs->db_f_max_str="a"; /* assign correct value here (for debugging) */
646 zv_init_scheme(rs, rscheme);
653 * zv_create: Creates/Initialises this rank handler. This routine is
654 * called exactly once. The routine returns the class_handle.
656 static void *zv_create (ZebraHandle zh) {
660 struct rank_class_info *ci = (struct rank_class_info *)
661 xmalloc (sizeof(*ci));
662 yaz_log(LOG_DEBUG, "zv_create");
663 wscheme=res_get_def(res, "zvrank.weighting-scheme", "");
664 for (i=0; wscheme[i] && i < 8; i++)
665 ci->rscheme[i]=wscheme[i];
666 ci->rscheme[i] = '\0';
671 * zv_destroy: Destroys this rank handler. This routine is called
672 * when the handler is no longer needed - i.e. when the server
673 * dies. The class_handle was previously returned by create.
675 static void zv_destroy (struct zebra_register *reg, void *class_handle) {
676 struct rank_class_info *ci = (struct rank_class_info *) class_handle;
677 yaz_log(LOG_DEBUG, "zv_destroy");
683 * zv_begin: Prepares beginning of "real" ranking. Called once for
684 * each result set. The returned handle is a "set handle" and
685 * will be used in each of the handlers below.
687 static void *zv_begin(struct zebra_register *reg, void *class_handle, RSET rset)
689 struct rs_info *rs=(struct rs_info *)xmalloc(sizeof(*rs));
690 struct rank_class_info *ci=(struct rank_class_info *)class_handle;
694 yaz_log(LOG_DEBUG, "zv_begin");
695 veclen=rset->no_rset_terms; /* smaller vector here */
696 zv_init(rs, ci->rscheme);
700 rs->qdoc=(struct ds_info *)xmalloc(sizeof(*rs->qdoc));
701 rs->qdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->qdoc->terms)*rs->veclen);
702 rs->qdoc->veclen=veclen;
703 rs->qdoc->d_f_max=1; /* no duplicates */
704 rs->qdoc->d_f_max_str="";
706 rs->rdoc=(struct ds_info *)xmalloc(sizeof(*rs->rdoc));
707 rs->rdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->rdoc->terms)*rs->veclen);
708 rs->rdoc->veclen=veclen;
709 rs->rdoc->d_f_max=10; /* just a guess */
710 rs->rdoc->d_f_max_str="";
711 /* yaz_log(LOG_DEBUG, "zv_begin_init"); */
712 for (i = 0; i < rs->veclen; i++)
714 gocc=rset->rset_terms[i]->nn;
715 /* yaz_log(LOG_DEBUG, "zv_begin_init i=%d gocc=%d", i, gocc); */
716 rs->qdoc->terms[i].gocc=gocc;
717 rs->qdoc->terms[i].locc=1; /* assume query has no duplicate terms */
718 rs->rdoc->terms[i].gocc=gocc;
719 rs->rdoc->terms[i].locc=0;
721 (*rs->q_tf_fct)(rs, rs->qdoc); /* we do this once only */
722 (*rs->q_idf_fct)(rs, rs->qdoc);
723 (*rs->q_norm_fct)(rs, rs->qdoc);
728 * zv_end: Terminates ranking process. Called after a result set
731 static void zv_end (struct zebra_register *reg, void *rsi)
734 yaz_log(LOG_DEBUG, "zv_end");
735 xfree(rs->qdoc->terms);
736 xfree(rs->rdoc->terms);
744 * zv_add: Called for each word occurence in a result set. This routine
745 * should be as fast as possible. This routine should "incrementally"
748 static void zv_add (void *rsi, int seqno, int i) {
750 /* yaz_log(LOG_DEBUG, "zvrank zv_add seqno=%d term_index=%d", seqno, term_index);*/
751 rs->rdoc->terms[i].locc++;
755 * zv_calc: Called for each document in a result. This handler should
756 * produce a score based on previous call(s) to the add handler. The
757 * score should be between 0 and 1000. If score cannot be obtained
758 * -1 should be returned.
760 static int zv_calc (void *rsi, int sysno)
766 /* yaz_log(LOG_DEBUG, "zv_calc"); */
771 for (i = 0; i < veclen; i++) {
772 /* qdoc weight has already been calculated */
773 (*rs->d_tf_fct)(rs, rs->rdoc);
774 (*rs->d_idf_fct)(rs, rs->rdoc);
775 (*rs->d_norm_fct)(rs, rs->rdoc);
776 dscore=rs->sim_fct(rs->qdoc, rs->rdoc);
778 score = dscore * 1000;
779 yaz_log (LOG_LOG, "sysno=%d score=%d", sysno, score);
780 if (score > 1000) /* should not happen */
786 * Pseudo-meta code with sequence of calls as they occur in a
787 * server. Handlers are prefixed by --:
803 static struct rank_control rank_control_vsm = {
813 struct rank_control *rankzv_class = &rank_control_vsm;