1 /* $Id: zvrank.c,v 1.5 2003-05-20 09:43:46 adam Exp $
2 Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
23 /* Zebra Vector Space Model RANKing
25 ** six (seven) letter identifier for weighting scheme
26 ** best document weighting:
27 ** tfc nfc (tpc npc) [original naming]
28 ** ntc atc npc apc [SMART naming, used here]
29 ** best query weighting:
30 ** nfx tfx bfx (npx tpx bpx) [original naming]
31 ** atn ntn btn apn npn bpn [SMART naming]
32 ** -> should set zvrank.weighting-scheme to one of
33 ** "ntc-atn", "atc-atn", etc.
36 #include <math.h> /* for log */
/* blog: logarithm used by the tf/idf weighting functions in this file
 * (e.g. tf_log, idf_tfidf).  The visible code returns the natural
 * logarithm of x. */
48 static double blog(double x) {
49 /* log_2, log_e or log_10 is used, best to change it here if necessary */
/* NOTE(review): the lines between the comment above and the return are
 * not visible in this view, so a guard for x <= 0 cannot be confirmed. */
52 return log(x); /* / log(base) */
/* Per-class ranking state: carries the configured weighting scheme name
 * from zv_create to zv_begin. */
57 struct rank_class_info { /* now we need this */
/* 8 bytes: room for a seven-letter identifier such as "ntc-atn" plus a
 * terminator. */
59 char rscheme[8]; /* name of weighting scheme */
/* Per-result-set ranking state: collection statistics, the weighting
 * scheme name, and the function pointers chosen by zv_init_scheme.
 * Other members used elsewhere in this file (veclen, qdoc, rdoc) are
 * declared on lines not visible in this view. */
63 struct rs_info { /* for result set */
64 int db_docs; /* number of documents in database (collection) */
65 int db_terms; /* number of distinct terms in database (debugging?) */
66 int db_f_max; /* maximum of f_t in database (debugging?) */
67 char *db_f_max_str; /* string (most frequent term) - for debugging */
69 char rscheme[8]; /* name of weighting scheme */
/* Document-side weighting functions; each is called as
 * fct(rs, document) and updates the document's term array in place. */
72 void (*d_tf_fct)(void *, void *); /* doc term frequency function */
73 void (*d_idf_fct)(void *, void *); /* doc idf function */
74 void (*d_norm_fct)(void *, void *); /* doc normalization function */
/* Query-side weighting functions; same calling convention, applied to
 * the query pseudo-document. */
76 void (*q_tf_fct)(void *, void *); /* query term frequency function */
77 void (*q_idf_fct)(void *, void *); /* query idf function */
78 void (*q_norm_fct)(void *, void *); /* query normalization function */
/* Called as sim_fct(query document, result document). */
80 double (*sim_fct)(void *, void *); /* similarity function (scoring function) */
/* RS is a pointer to struct rs_info. */
84 typedef struct rs_info *RS;
/* prn_rs: write the main fields of a result-set info record to the log
 * at LOG_DEBUG level. */
86 static void prn_rs(RS rs) { /* for debugging */
87 yaz_log(LOG_DEBUG, "* RS:");
88 yaz_log(LOG_DEBUG, " db_docs: %d", rs->db_docs);
89 yaz_log(LOG_DEBUG, " db_terms: %d", rs->db_terms);
90 yaz_log(LOG_DEBUG, " f_max: %d", rs->db_f_max);
/* db_f_max_str is passed to %s, so it must point to a valid string. */
91 yaz_log(LOG_DEBUG, " f_max_str: %s", rs->db_f_max_str);
92 yaz_log(LOG_DEBUG, " veclen: %d", rs->veclen);
93 /* rscheme implies functions */
94 yaz_log(LOG_DEBUG, " rscheme: %s", rs->rscheme);
/* Document record used for both the query pseudo-document and the
 * result document being scored: identification, frequency statistics
 * and the term vector. */
98 struct ds_info { /* document info */
99 char *docid; /* unique doc identifier */
100 int docno; /* doc number */
101 int doclen; /* document length */
102 int d_f_max; /* maximum number of any term in doc (needed) */
103 char *d_f_max_str; /* most frequent term in d - for debugging */
104 int veclen; /* vector length */
/* Array of veclen term records; the weighting functions read and write
 * its locc/gocc/tf/idf/wt members. */
105 struct ts_info *terms;
106 double docsim; /* similarity in [0, ..., 1] (= score/1000) */
/* DS is a pointer to struct ds_info. */
108 typedef struct ds_info* DS;
/* prn_ds: write the main fields of a document record to the log at
 * LOG_DEBUG level. */
111 static void prn_ds(DS ds) { /* for debugging */
112 yaz_log(LOG_DEBUG, " * DS:");
/* docid and d_f_max_str are passed to %s, so both must point to valid
 * strings when this is called. */
113 yaz_log(LOG_DEBUG, " docid: %s", ds->docid);
114 yaz_log(LOG_DEBUG, " docno: %d", ds->docno);
115 yaz_log(LOG_DEBUG, " doclen: %d", ds->doclen);
116 yaz_log(LOG_DEBUG, " d_f_max: %d", ds->d_f_max);
117 yaz_log(LOG_DEBUG, " d_f_max_str:%s", ds->d_f_max_str);
118 yaz_log(LOG_DEBUG, " veclen: %d", ds->veclen);
/* Per-term record.  The member declarations are not visible in this
 * view; code elsewhere in this file uses name, gocc, locc, tf, idf and
 * wt. */
123 struct ts_info { /* term info */
/* TS is a pointer to struct ts_info. */
133 typedef struct ts_info *TS;
/* prn_ts: log one term record (name, global and local occurrence
 * counts, tf, idf and final weight) at LOG_DEBUG level. */
136 static void prn_ts(TS ts) { /* for debugging */
137 yaz_log(LOG_DEBUG, " * TERM:%s gocc:%d locc:%d tf:%f idf:%f wt:%f",
138 ts->name, ts->gocc, ts->locc, ts->tf, ts->idf, ts->wt);
148 ** weighting functions
149 ** check: RS is not needed anymore
152 /* calculate and store new term frequency vector */
/* tf_none: raw term frequency.  For every term of the document dsi the
 * local occurrence count (locc) is stored unchanged as tf.  rsi is the
 * result-set info passed by the caller. */
153 static void tf_none(void *rsi, void *dsi) {
156 /* no conversion. 1 <= tf */
158 for (i=0; i < veclen; i++) {
159 freq=ds->terms[i].locc;
160 ds->terms[i].tf=freq;
/* tf_binary: binary term-frequency weighting.  The visible loop reads
 * each term's local occurrence count; the statement that stores tf is
 * not visible in this view.
 * NOTE(review): presumably tf becomes 1 when locc > 0 and 0 otherwise;
 * confirm against the full function. */
165 static void tf_binary(void *rsi, void *dsi) {
170 for (i=0; i < veclen; i++) {
171 freq=ds->terms[i].locc;
/* tf_max_norm: term frequency normalised by the document maximum.
 * Each term's locc is divided by ds->d_f_max and stored as tf. */
180 static void tf_max_norm(void *rsi, void *dsi) {
184 /* divide each term by max, so 0 <= tf <= 1 */
185 tf_max=ds->d_f_max; /* largest frequency of t in document */
187 for (i=0; i < veclen; i++) {
188 freq=ds->terms[i].locc;
/* NOTE(review): the declarations of freq and tf_max are not visible.
 * If both are integers this division truncates to 0 or 1; a guard for
 * tf_max == 0 is also not visible here.  Confirm both. */
191 ds->terms[i].tf=freq/tf_max;
/* tf_aug_norm: augmented normalised term frequency,
 * tf = K + (1 - K) * (locc / d_f_max), with K fixed at 0.5. */
198 static void tf_aug_norm(void *rsi, void *dsi) {
203 /* augmented normalized tf. 0.5 <= tf <= 1 for K = 0.5 */
204 tf_max=ds->d_f_max; /* largest frequency of t in document */
/* K is hard-coded; the trailing comment names the intended setting. */
206 K=0.5; /* zvrank.const-K */
207 for (i=0; i < veclen; i++) {
208 freq=ds->terms[i].locc;
/* NOTE(review): if freq and tf_max are both integers (declarations not
 * visible) the inner division truncates; a zero tf_max guard is not
 * visible either.  Confirm. */
211 ds->terms[i].tf=K+(1.0-K)*(freq/tf_max);
/* tf_square: squared term frequency; each term's tf is set to
 * locc * locc. */
218 static void tf_square(void *rsi, void *dsi) {
223 for (i=0; i < veclen; i++) {
224 freq=ds->terms[i].locc;
226 ds->terms[i].tf=freq*freq;
/* tf_log: logarithmic term frequency; each term's tf is set to
 * 1 + blog(locc). */
233 static void tf_log(void *rsi, void *dsi) {
238 for (i=0; i < veclen; i++) {
239 freq=ds->terms[i].locc;
/* NOTE(review): one line before this assignment is not visible in this
 * view; whether locc == 0 is special-cased there cannot be confirmed. */
241 ds->terms[i].tf=1.0+blog(freq);
248 /* calculate and store inverse document frequency vector */
/* idf_none: no inverse-document-frequency weighting; every term's idf
 * is set to 1.0 so that tf alone determines the weight. */
249 static void idf_none(void *rsi, void *dsi) {
254 for (i=0; i < veclen; i++) {
255 ds->terms[i].idf=1.0;
/* idf_tfidf: classic inverse document frequency.  For each term,
 * idf = blog(db_docs / gocc), where db_docs comes from the result-set
 * info and gocc is the term's global occurrence count. */
260 static void idf_tfidf(void *rsi, void *dsi) {
266 /* normal tfidf weight */
268 num_docs=rs->db_docs;
269 for (i=0; i < veclen; i++) {
270 gocc=ds->terms[i].gocc;
/* NOTE(review): the lines just before this assignment are not visible;
 * a gocc == 0 guard presumably lives there.  If num_docs and gocc are
 * both integers the quotient is truncated before the logarithm is
 * taken.  Confirm the declarations. */
274 idf=blog(num_docs/gocc);
275 ds->terms[i].idf=idf;
/* idf_prob: probabilistic inverse document frequency.  For each term,
 * idf = blog((db_docs - gocc) / gocc). */
280 static void idf_prob(void *rsi, void *dsi) {
286 /* probabilistic formulation */
288 num_docs=rs->db_docs;
289 for (i=0; i < veclen; i++) {
290 gocc=ds->terms[i].gocc;
/* NOTE(review): the argument is zero or negative once gocc >= num_docs,
 * and with integer operands it already truncates to 0 when gocc exceeds
 * half of num_docs.  The guard lines before this statement are not
 * visible in this view; confirm how those cases are handled. */
294 idf=blog((num_docs-gocc)/gocc);
295 ds->terms[i].idf=idf;
/* idf_freq: frequency formulation.  A single idf value is computed
 * before the loop and assigned to every term of the document.
 * NOTE(review): the statements that compute idf from num_docs are not
 * visible in this view. */
300 static void idf_freq(void *rsi, void *dsi) {
306 /* frequency formulation */
308 num_docs=rs->db_docs;
313 for (i=0; i < veclen; i++) {
314 ds->terms[i].idf=idf;
/* idf_squared: idf variant based on blog(db_docs / gocc).
 * NOTE(review): the line between the blog call and the store is not
 * visible; presumably it squares idf.  Confirm. */
319 static void idf_squared(void *rsi, void *dsi) {
327 num_docs=rs->db_docs;
/* This message is emitted on every call, not only at scheme setup. */
328 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
329 for (i=0; i < veclen; i++) {
330 gocc=ds->terms[i].gocc;
/* NOTE(review): same truncation concern as idf_tfidf if both operands
 * are integers; any gocc == 0 guard is not visible here. */
334 idf=blog(num_docs/gocc);
336 ds->terms[i].idf=idf;
341 /* calculate and store normalized weight (tf-idf) vector */
/* norm_none: unnormalised weights; each term's wt is simply tf * idf. */
342 static void norm_none(void *rsi, void *dsi) {
345 /* no normalization */
347 for (i=0; i < veclen; i++) {
348 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
/* norm_sum: sum normalisation.  First pass computes wt = tf * idf and
 * accumulates the total in tfs; second pass divides every wt by tfs. */
353 static void norm_sum(void *rsi, void *dsi) {
359 for (i=0; i < veclen; i++) {
360 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
361 tfs+=ds->terms[i].wt;
/* The condition guarding the second pass is not visible in this view;
 * the trailing comment indicates it is skipped when tfs is zero. */
364 for (i=0; i < veclen; i++) {
365 ds->terms[i].wt=ds->terms[i].wt/tfs;
367 /* else: tfs==0 && ds->terms[i].wt==0 */
/* norm_cosine: cosine (Euclidean length) normalisation.  First pass
 * computes wt = tf * idf and accumulates the sum of squared weights in
 * tfs; second pass divides every wt by tfs.
 * NOTE(review): the lines between the two loops are not visible;
 * presumably tfs is replaced by its square root there and the division
 * is skipped when it is zero.  Confirm. */
371 static void norm_cosine(void *rsi, void *dsi) {
377 for (i=0; i < veclen; i++) {
378 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
379 tfs+=(ds->terms[i].wt*ds->terms[i].wt);
383 for (i=0; i < veclen; i++) {
384 ds->terms[i].wt=ds->terms[i].wt/tfs;
386 /* else: tfs==0 && ds->terms[i].wt==0 */
/* norm_fourth: normalisation based on higher powers of the weights.
 * First pass computes wt = tf * idf and its square fr; second pass
 * divides every wt by tfs.
 * NOTE(review): the accumulation into tfs is not visible in this view;
 * presumably it adds fr * fr (the fourth power).  Confirm. */
390 static void norm_fourth(void *rsi, void *dsi) {
396 for (i=0; i < veclen; i++) {
397 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
398 fr=(ds->terms[i].wt*ds->terms[i].wt);
403 for (i=0; i < veclen; i++) {
404 ds->terms[i].wt=ds->terms[i].wt/tfs;
406 /* else: tfs==0 && ds->terms[i].wt==0 */
/* norm_max: maximum normalisation.  First pass computes wt = tf * idf
 * and tracks the largest weight in tfm; second pass divides every wt
 * by tfm. */
410 static void norm_max(void *rsi, void *dsi) {
416 for (i=0; i < veclen; i++) {
417 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
/* The statement updating tfm under this condition is not visible in
 * this view. */
418 if (ds->terms[i].wt > tfm)
422 for (i=0; i < veclen; i++) {
423 ds->terms[i].wt=ds->terms[i].wt/tfm;
425 /* else: tfm==0 && ds->terms[i].wt==0 */
429 /* add: norm_pivot, ... */
/* sim_cosine: similarity between two document vectors dsi1 and dsi2.
 * The visible code iterates over ds1's vector length and computes
 * sdiv = sqrt(sqr11 * sqr22).
 * NOTE(review): the loop body and the return statement are not visible
 * in this view; presumably smul accumulates the dot product, sqr11 and
 * sqr22 the squared lengths, and the result is smul / sdiv.  Confirm. */
431 static double sim_cosine(void *dsi1, void *dsi2) {
435 double smul=0.0, sdiv=0.0, sqr11=0.0, sqr22=0.0;
/* Only ds1's length is read; both vectors are assumed equally long. */
438 veclen=ds1->veclen; /* and ds2->veclen */
439 for (i=0; i < veclen; i++) {
446 sdiv=sqrt(sqr11*sqr22);
452 /* add: norm_jaccard, norm_dice, ... */
454 /* end weighting functions */
/* zv_init_scheme: choose the weighting functions for a result set.
 * sname is a scheme identifier such as "ntc-atn".  Its characters are
 * used to pick the document tf/idf/normalisation functions and the
 * query tf/idf/normalisation functions stored in rs; characters missing
 * from sname are taken from the default scheme.  The similarity
 * function is always sim_cosine.
 * NOTE(review): the switch and case lines are not visible in this view,
 * so the letter-to-function mapping cannot be documented from here. */
458 static void zv_init_scheme(RS rs, const char *sname) {
460 char c0, c1, c2, c3, c4, c5, c6;
461 const char *def_rscheme="ntc-atn"; /* a good default */
463 yaz_log(LOG_DEBUG, "zv_init_scheme");
/* The condition under which this message is logged is not visible. */
466 yaz_log(LOG_LOG, "zvrank: invalid weighting-scheme \"%s\"", sname);
/* Per-position fallback to the default scheme's character.
 * NOTE(review): no assignment to c3 is visible in this view. */
467 if (slen > 0) c0=sname[0]; else c0=def_rscheme[0];
468 if (slen > 1) c1=sname[1]; else c1=def_rscheme[1];
469 if (slen > 2) c2=sname[2]; else c2=def_rscheme[2];
471 if (slen > 4) c4=sname[4]; else c4=def_rscheme[4];
472 if (slen > 5) c5=sname[5]; else c5=def_rscheme[5];
473 if (slen > 6) c6=sname[6]; else c6=def_rscheme[6];
475 /* assign doc functions */
/* Document term-frequency function. */
478 rs->d_tf_fct=tf_binary;
482 rs->d_tf_fct=tf_max_norm;
484 yaz_log(LOG_DEBUG, "tf_max_norm: d_f_max required");
487 rs->d_tf_fct=tf_aug_norm;
489 yaz_log(LOG_DEBUG, "tf_aug_norm: d_f_max required");
492 rs->d_tf_fct=tf_square;
500 rs->d_tf_fct=tf_none;
/* Document idf function. */
505 rs->d_idf_fct=idf_tfidf;
507 yaz_log(LOG_DEBUG, "idf_tfidf: db_docs required");
510 rs->d_idf_fct=idf_prob;
512 yaz_log(LOG_DEBUG, "idf_prob: db_docs required");
515 rs->d_idf_fct=idf_freq;
517 yaz_log(LOG_DEBUG, "idf_freq: db_docs required");
520 rs->d_idf_fct=idf_squared;
522 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
525 rs->d_idf_fct=idf_none;
/* Document normalisation function. */
530 rs->d_norm_fct=norm_sum;
534 rs->d_norm_fct=norm_cosine;
538 rs->d_norm_fct=norm_fourth;
542 rs->d_norm_fct=norm_max;
546 rs->d_norm_fct=norm_none;
551 /* assign query functions */
/* Query term-frequency function. */
554 rs->q_tf_fct=tf_binary;
558 rs->q_tf_fct=tf_max_norm;
559 yaz_log(LOG_DEBUG, "tf_max_norm: d_f_max required");
563 rs->q_tf_fct=tf_aug_norm;
565 yaz_log(LOG_DEBUG, "tf_aug_norm: d_f_max required");
568 rs->q_tf_fct=tf_square;
576 rs->q_tf_fct=tf_none;
/* Query idf function. */
581 rs->q_idf_fct=idf_tfidf;
583 yaz_log(LOG_DEBUG, "idf_tfidf: db_docs required");
586 rs->q_idf_fct=idf_prob;
588 yaz_log(LOG_DEBUG, "idf_prob: db_docs required");
591 rs->q_idf_fct=idf_freq;
593 yaz_log(LOG_DEBUG, "idf_freq: db_docs required");
596 rs->q_idf_fct=idf_squared;
598 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
601 rs->q_idf_fct=idf_none;
/* Query normalisation function. */
606 rs->q_norm_fct=norm_sum;
610 rs->q_norm_fct=norm_cosine;
614 rs->q_norm_fct=norm_fourth;
618 rs->q_norm_fct=norm_max;
622 rs->q_norm_fct=norm_none;
/* Cosine similarity is the only scoring function wired in. */
627 rs->sim_fct=sim_cosine;
/* rs->rscheme is logged here; where it is filled in is not visible. */
628 yaz_log(LOG_DEBUG, "zv_scheme %s", rs->rscheme);
/* zv_init: fill a result-set info record with collection statistics
 * and select the weighting functions for the scheme named rscheme. */
632 static void zv_init(RS rs, const char *rscheme) {
633 yaz_log(LOG_DEBUG, "zv_init");
/* NOTE(review): the statistics below are hard-coded placeholders (see
 * the original trailing comments).  db_docs feeds the idf functions,
 * so scores do not reflect the real collection size. */
635 rs->db_docs=100000; /* assign correct value here */
636 rs->db_terms=500000; /* assign correct value here (for debugging) */
637 rs->db_f_max=50; /* assign correct value here */
638 rs->db_f_max_str="a"; /* assign correct value here (for debugging) */
639 zv_init_scheme(rs, rscheme);
646 * zv_create: Creates/Initialises this rank handler. This routine is
647 * called exactly once. The routine returns the class_handle.
649 static void *zv_create (ZebraHandle zh) {
/* Allocates the class info record and copies the configured weighting
 * scheme name (resource "zvrank.weighting-scheme") into it. */
653 struct rank_class_info *ci = (struct rank_class_info *)
654 xmalloc (sizeof(*ci));
655 yaz_log(LOG_DEBUG, "zv_create");
/* NOTE(review): the result of res_get is used without a visible check;
 * if it can return NULL when the resource is unset, strlen below
 * dereferences a null pointer.  Confirm against res_get's contract. */
656 wscheme=res_get(res, "zvrank.weighting-scheme");
/* Copies at most 8 characters.  NOTE(review): the loop never copies a
 * terminating NUL and ci comes from xmalloc, so rscheme is only
 * terminated if a line not visible here does it; with 8 or more
 * characters there is no room for a terminator at all.  strlen is also
 * re-evaluated on every iteration. */
657 for (i=0; (i < strlen(wscheme)) && (i < 8); i++)
658 ci->rscheme[i]=wscheme[i];
663 * zv_destroy: Destroys this rank handler. This routine is called
664 * when the handler is no longer needed - i.e. when the server
665 * dies. The class_handle was previously returned by create.
667 static void zv_destroy (struct zebra_register *reg, void *class_handle) {
/* class_handle is the rank_class_info record allocated by zv_create.
 * NOTE(review): the statement releasing ci is not visible in this
 * view; confirm it is freed. */
668 struct rank_class_info *ci = (struct rank_class_info *) class_handle;
669 yaz_log(LOG_DEBUG, "zv_destroy");
675 * zv_begin: Prepares beginning of "real" ranking. Called once for
676 * each result set. The returned handle is a "set handle" and
677 * will be used in each of the handlers below.
679 static void *zv_begin(struct zebra_register *reg, void *class_handle, RSET rset)
/* Allocates the per-result-set state, initialises it for the class's
 * weighting scheme, builds the query and result documents with one
 * term slot per result-set term, and computes the query weights. */
681 struct rs_info *rs=(struct rs_info *)xmalloc(sizeof(*rs));
682 struct rank_class_info *ci=(struct rank_class_info *)class_handle;
686 yaz_log(LOG_DEBUG, "zv_begin");
687 veclen=rset->no_rset_terms; /* smaller vector here */
688 zv_init(rs, ci->rscheme);
/* NOTE(review): the allocations below are sized by rs->veclen, but the
 * assignment of rs->veclen is not visible in this view.  Confirm it is
 * set to veclen before this point; rs comes from xmalloc and is
 * otherwise uninitialised. */
/* Query pseudo-document. */
692 rs->qdoc=(struct ds_info *)xmalloc(sizeof(*rs->qdoc));
693 rs->qdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->qdoc->terms)*rs->veclen);
694 rs->qdoc->veclen=veclen;
695 rs->qdoc->d_f_max=1; /* no duplicates */
696 rs->qdoc->d_f_max_str="";
/* Result document, reused for every document that is scored. */
698 rs->rdoc=(struct ds_info *)xmalloc(sizeof(*rs->rdoc));
699 rs->rdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->rdoc->terms)*rs->veclen);
700 rs->rdoc->veclen=veclen;
701 rs->rdoc->d_f_max=10; /* just a guess */
702 rs->rdoc->d_f_max_str="";
703 /* yaz_log(LOG_DEBUG, "zv_begin_init"); */
/* Seed both documents with each term's global count; the query counts
 * every term once, the result document starts at zero. */
704 for (i = 0; i < rs->veclen; i++)
706 gocc=rset->rset_terms[i]->nn;
707 /* yaz_log(LOG_DEBUG, "zv_begin_init i=%d gocc=%d", i, gocc); */
708 rs->qdoc->terms[i].gocc=gocc;
709 rs->qdoc->terms[i].locc=1; /* assume query has no duplicate terms */
710 rs->rdoc->terms[i].gocc=gocc;
711 rs->rdoc->terms[i].locc=0;
/* Query weights depend only on the query, so they are computed here
 * rather than once per document in zv_calc. */
713 (*rs->q_tf_fct)(rs, rs->qdoc); /* we do this once only */
714 (*rs->q_idf_fct)(rs, rs->qdoc);
715 (*rs->q_norm_fct)(rs, rs->qdoc);
720 * zv_end: Terminates ranking process. Called after a result set
723 static void zv_end (struct zebra_register *reg, void *rsi)
/* Releases the term arrays allocated in zv_begin.
 * NOTE(review): zv_begin also allocates qdoc, rdoc and rs itself; the
 * statements freeing them are not visible in this view.  Confirm. */
726 yaz_log(LOG_DEBUG, "zv_end");
727 xfree(rs->qdoc->terms);
728 xfree(rs->rdoc->terms);
736 * zv_add: Called for each word occurrence in a result set. This routine
737 * should be as fast as possible. This routine should "incrementally"
740 static void zv_add (void *rsi, int seqno, int i) {
/* Counts one more occurrence of term i in the result document.  seqno
 * is not used by the visible code. */
742 /* yaz_log(LOG_DEBUG, "zvrank zv_add seqno=%d term_index=%d", seqno, term_index);*/
/* NOTE(review): i is used as an index without a visible range check;
 * callers must pass 0 <= i < veclen. */
743 rs->rdoc->terms[i].locc++;
747 * zv_calc: Called for each document in a result. This handler should
748 * produce a score based on previous call(s) to the add handler. The
749 * score should be between 0 and 1000. If score cannot be obtained
750 * -1 should be returned.
752 static int zv_calc (void *rsi, int sysno)
/* Weights the result document with the document-side functions,
 * compares it with the query document and scales the similarity by
 * 1000 to produce the score. */
758 /* yaz_log(LOG_DEBUG, "zv_calc"); */
/* NOTE(review): each of the three weighting functions walks the whole
 * term vector itself, so calling them inside this per-term loop looks
 * like redundant repeated work.  The end of the loop is not visible in
 * this view; confirm the intended structure. */
763 for (i = 0; i < veclen; i++) {
764 /* qdoc weight has already been calculated */
765 (*rs->d_tf_fct)(rs, rs->rdoc);
766 (*rs->d_idf_fct)(rs, rs->rdoc);
767 (*rs->d_norm_fct)(rs, rs->rdoc);
768 dscore=rs->sim_fct(rs->qdoc, rs->rdoc);
/* Conversion from a double similarity to an integer score. */
770 score = dscore * 1000;
/* Logged at LOG_LOG (not LOG_DEBUG) once per scored document. */
771 yaz_log (LOG_LOG, "sysno=%d score=%d", sysno, score);
/* The statement executed under this condition is not visible here. */
772 if (score > 1000) /* should not happen */
778 * Pseudo-meta code with sequence of calls as they occur in a
779 * server. Handlers are prefixed by --:
/* Rank control record for this ranking class.  The initializer is not
 * visible in this view; presumably it lists the zv_* handlers defined
 * above (TODO confirm). */
795 static struct rank_control rank_control_vsm = {
/* Externally visible pointer to this file's rank control record. */
805 struct rank_control *rankzv_class = &rank_control_vsm;