1 /* $Id: zvrank.c,v 1.4 2003-03-27 10:46:29 adam Exp $
2 Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
23 /* Zebra Vector Space Model RANKing
25 ** six (seven) letter identifier for weighting scheme
26 ** best document weighting:
27 ** tfc nfc (tpc npc) [original naming]
28 ** ntc atc npc apc [SMART naming, used here]
29 ** best query weighting:
30 ** nfx tfx bfx (npx tpx bpx) [original naming]
31 ** atn ntn btn apn npn bpn [SMART naming]
32 ** -> should set zvrank.weighting-scheme to one of
33 ** "ntc-atn", "atc-atn", etc.
36 #include <math.h> /* for log */
/* blog: logarithm helper used by the tf/idf weighting functions.
 * The visible code returns log(x), the natural logarithm from <math.h>; the
 * trailing comment marks where a change of base would be applied.
 * NOTE(review): the listing's own numbering jumps from 49 to 52, so there may
 * be statements between the comment and the return that are not shown here
 * (e.g. handling of x <= 0) - confirm against the full file. */
48 static double blog(double x) {
49 /* log_2, log_e or log_10 is used, best to change it here if necessary */
52 return log(x); /* / log(base) */
/* Per-class state of this rank handler. zv_create allocates one instance and
 * copies the configured weighting-scheme name into rscheme; zv_begin passes
 * rscheme on to zv_init. */
57 struct rank_class_info { /* now we need this */
/* Fixed 8-byte buffer; the visible copy loop in zv_create stores at most 8
 * characters and does not visibly write a terminating NUL. */
59 char rscheme[8]; /* name of weighting scheme */
/* Per-result-set ranking state: collection statistics, the weighting-scheme
 * name, and the function pointers chosen by zv_init_scheme for document
 * weighting, query weighting and similarity scoring.
 * All weighting callbacks take two untyped handles; in this file they are
 * invoked as fct(rs, ds) with an rs_info and a ds_info.
 * NOTE(review): other code in this file reads rs->veclen, rs->qdoc and
 * rs->rdoc, but those members fall on listing lines that are not shown here
 * (the embedded numbering skips 68, 70-71, 75, 79 and 81-83). */
63 struct rs_info { /* for result set */
64 int db_docs; /* number of documents in database (collection) */
65 int db_terms; /* number of distinct terms in database (debugging?) */
66 int db_f_max; /* maximum of f_t in database (debugging?) */
67 char *db_f_max_str; /* string (most frequent term) - for debugging */
69 char rscheme[8]; /* name of weighting scheme */
72 void (*d_tf_fct)(void *, void *); /* doc term frequency function */
73 void (*d_idf_fct)(void *, void *); /* doc idf function */
74 void (*d_norm_fct)(void *, void *); /* doc normalization function */
76 void (*q_tf_fct)(void *, void *); /* query term frequency function */
77 void (*q_idf_fct)(void *, void *); /* query idf function */
78 void (*q_norm_fct)(void *, void *); /* query normalization function */
80 double (*sim_fct)(void *, void *); /* similarity function (scoring function) */
/* RS is a pointer typedef: an RS value is a struct rs_info pointer. */
84 typedef struct rs_info *RS;
/* prn_rs: debugging aid. Writes the collection statistics, the vector length
 * and the weighting-scheme name of rs to the log at LOG_DEBUG level.
 * rs is dereferenced without a NULL check in the visible lines. */
86 static void prn_rs(RS rs) { /* for debugging */
87 yaz_log(LOG_DEBUG, "* RS:");
88 yaz_log(LOG_DEBUG, " db_docs: %d", rs->db_docs);
89 yaz_log(LOG_DEBUG, " db_terms: %d", rs->db_terms);
90 yaz_log(LOG_DEBUG, " f_max: %d", rs->db_f_max);
91 yaz_log(LOG_DEBUG, " f_max_str: %s", rs->db_f_max_str);
92 yaz_log(LOG_DEBUG, " veclen: %d", rs->veclen);
93 /* rscheme implies functions */
/* rscheme is printed with %s, so it must be NUL-terminated at this point. */
94 yaz_log(LOG_DEBUG, " rscheme: %s", rs->rscheme);
/* One weighted term vector. The same structure is used for the query
 * (rs->qdoc) and for the document currently being scored (rs->rdoc); both are
 * allocated in zv_begin.
 * terms points to an array of veclen struct ts_info entries. */
98 struct ds_info { /* document info */
99 char *docid; /* unique doc identifier */
100 int docno; /* doc number */
101 int doclen; /* document length */
/* d_f_max is the divisor used by tf_max_norm and tf_aug_norm. */
102 int d_f_max; /* maximum number of any term in doc (needed) */
103 char *d_f_max_str; /* most frequent term in d - for debugging */
104 int veclen; /* vector length */
105 struct ts_info *terms;
106 double docsim; /* similarity in [0, ..., 1] (= score/1000) */
/* DS is a pointer typedef: a DS value is a struct ds_info pointer. */
108 typedef struct ds_info* DS;
/* prn_ds: debugging aid. Writes the identifying fields, the maximum term
 * frequency and the vector length of ds to the log at LOG_DEBUG level.
 * docid and d_f_max_str are printed with %s and so must be valid strings. */
110 static void prn_ds(DS ds) { /* for debugging */
111 yaz_log(LOG_DEBUG, " * DS:");
112 yaz_log(LOG_DEBUG, " docid: %s", ds->docid);
113 yaz_log(LOG_DEBUG, " docno: %d", ds->docno);
114 yaz_log(LOG_DEBUG, " doclen: %d", ds->doclen);
115 yaz_log(LOG_DEBUG, " d_f_max: %d", ds->d_f_max);
116 yaz_log(LOG_DEBUG, " d_f_max_str:%s", ds->d_f_max_str);
117 yaz_log(LOG_DEBUG, " veclen: %d", ds->veclen);
/* One entry of a term vector.
 * NOTE(review): the member declarations (listing lines 122-130) are not shown
 * here. Code elsewhere in this file accesses the members name, gocc, locc,
 * tf, idf and wt; their exact types must be taken from the full file. */
121 struct ts_info { /* term info */
/* TS is a pointer typedef: a TS value is a struct ts_info pointer. */
131 typedef struct ts_info *TS;
/* prn_ts: debugging aid. Logs one term entry at LOG_DEBUG level: its name,
 * gocc and locc (printed as integers; presumably the collection-wide and
 * per-document occurrence counts) and the tf, idf and wt values (printed as
 * floating point). */
133 static void prn_ts(TS ts) { /* for debugging */
134 yaz_log(LOG_DEBUG, " * TERM:%s gocc:%d locc:%d tf:%f idf:%f wt:%f",
135 ts->name, ts->gocc, ts->locc, ts->tf, ts->idf, ts->wt);
144 ** weighting functions
145 ** check: RS is not needed anymore
148 /* calculate and store new term frequency vector */
/* tf_none: term-frequency function that applies no conversion; for each of
 * the veclen terms, tf is set to the term's raw count locc.
 * rsi and dsi are untyped handles (rs_info / ds_info in the callers).
 * NOTE(review): the local declarations (ds, i, veclen, freq) are on listing
 * lines that are not shown here. */
149 static void tf_none(void *rsi, void *dsi) {
152 /* no conversion. 1 <= tf */
154 for (i=0; i < veclen; i++) {
155 freq=ds->terms[i].locc;
156 ds->terms[i].tf=freq;
/* tf_binary: term-frequency function; reads each term's raw count locc.
 * NOTE(review): the statements that store tf (listing lines 168 onward) are
 * not shown here. Going by the name this presumably stores a 0/1 value
 * depending on whether the count is positive - confirm against the full
 * file. */
161 static void tf_binary(void *rsi, void *dsi) {
166 for (i=0; i < veclen; i++) {
167 freq=ds->terms[i].locc;
/* tf_max_norm: term-frequency function that divides each term's raw count by
 * ds->d_f_max, the largest term frequency in the document.
 * NOTE(review): listing lines 185-186 (between the read of locc and the
 * division) are not shown, so whether tf_max == 0 is guarded cannot be seen
 * here. The declarations of freq and tf_max are also not shown; if both were
 * integer types the division would truncate - confirm they are double. */
176 static void tf_max_norm(void *rsi, void *dsi) {
180 /* divide each term by max, so 0 <= tf <= 1 */
181 tf_max=ds->d_f_max; /* largest frequency of t in document */
183 for (i=0; i < veclen; i++) {
184 freq=ds->terms[i].locc;
187 ds->terms[i].tf=freq/tf_max;
/* tf_aug_norm: augmented normalized term frequency,
 *   tf = K + (1 - K) * (locc / d_f_max)
 * with K fixed at 0.5 in the code (the inline comment names a
 * zvrank.const-K setting, but no lookup of it is visible).
 * NOTE(review): listing lines 205-206 are not shown, so any guard against
 * tf_max == 0 cannot be seen here; the types of freq and tf_max are likewise
 * not visible - confirm the division is done in floating point. */
194 static void tf_aug_norm(void *rsi, void *dsi) {
199 /* augmented normalized tf. 0.5 <= tf <= 1 for K = 0.5 */
200 tf_max=ds->d_f_max; /* largest frequency of t in document */
202 K=0.5; /* zvrank.const-K */
203 for (i=0; i < veclen; i++) {
204 freq=ds->terms[i].locc;
207 ds->terms[i].tf=K+(1.0-K)*(freq/tf_max);
/* tf_square: term-frequency function that stores the square of each term's
 * raw count (tf = locc * locc).
 * NOTE(review): listing line 221 (between the read and the store) and the
 * local declarations are not shown here. */
214 static void tf_square(void *rsi, void *dsi) {
219 for (i=0; i < veclen; i++) {
220 freq=ds->terms[i].locc;
222 ds->terms[i].tf=freq*freq;
/* tf_log: logarithmic term frequency, tf = 1.0 + blog(locc).
 * NOTE(review): listing line 236 (between the read and the store) is not
 * shown, so how a count of 0 is treated cannot be seen here. */
229 static void tf_log(void *rsi, void *dsi) {
234 for (i=0; i < veclen; i++) {
235 freq=ds->terms[i].locc;
237 ds->terms[i].tf=1.0+blog(freq);
244 /* calculate and store inverse document frequency vector */
/* idf_none: idf function that applies no collection weighting; sets idf to
 * 1.0 for each of the veclen terms, so wt later equals tf. */
245 static void idf_none(void *rsi, void *dsi) {
250 for (i=0; i < veclen; i++) {
251 ds->terms[i].idf=1.0;
/* idf_tfidf: standard inverse document frequency,
 *   idf = blog(num_docs / gocc)
 * where num_docs is rs->db_docs and gocc is the term's gocc field.
 * NOTE(review): rs->db_docs is declared int in struct rs_info. If num_docs
 * and gocc are also integer locals, num_docs/gocc is an integer division and
 * the fractional part is lost before the logarithm is taken. Their
 * declarations are not shown here - confirm, and cast to double if so.
 * NOTE(review): listing lines 267-269 are not shown; a gocc == 0 guard may
 * live there. */
256 static void idf_tfidf(void *rsi, void *dsi) {
262 /* normal tfidf weight */
264 num_docs=rs->db_docs;
265 for (i=0; i < veclen; i++) {
266 gocc=ds->terms[i].gocc;
270 idf=blog(num_docs/gocc);
271 ds->terms[i].idf=idf;
/* idf_prob: probabilistic inverse document frequency,
 *   idf = blog((num_docs - gocc) / gocc)
 * where num_docs is rs->db_docs and gocc is the term's gocc field.
 * NOTE(review): as in idf_tfidf, the quotient is an integer division if
 * num_docs and gocc are integer locals (declarations not shown); for
 * gocc > num_docs/2 that would yield 0 and rely on blog's handling of a
 * non-positive argument - confirm.
 * NOTE(review): listing lines 287-289 are not shown; a gocc == 0 guard may
 * live there. */
276 static void idf_prob(void *rsi, void *dsi) {
282 /* probabilistic formulation */
284 num_docs=rs->db_docs;
285 for (i=0; i < veclen; i++) {
286 gocc=ds->terms[i].gocc;
290 idf=blog((num_docs-gocc)/gocc);
291 ds->terms[i].idf=idf;
/* idf_freq: "frequency formulation" idf. In the visible code the same idf
 * value is stored for every term; it is computed once before the loop.
 * NOTE(review): the computation of idf from num_docs (listing lines 305-308)
 * is not shown here, so the exact formula must be taken from the full
 * file. */
296 static void idf_freq(void *rsi, void *dsi) {
302 /* frequency formulation */
304 num_docs=rs->db_docs;
309 for (i=0; i < veclen; i++) {
310 ds->terms[i].idf=idf;
/* idf_squared: idf variant based on blog(num_docs / gocc).
 * A LOG_DEBUG message is emitted on every call, before the loop.
 * NOTE(review): listing line 331, between the blog() call and the store, is
 * not shown; going by the name it presumably squares idf - confirm.
 * NOTE(review): same possible integer division as in idf_tfidf if num_docs
 * and gocc are integer locals (declarations not shown). */
315 static void idf_squared(void *rsi, void *dsi) {
323 num_docs=rs->db_docs;
324 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
325 for (i=0; i < veclen; i++) {
326 gocc=ds->terms[i].gocc;
330 idf=blog(num_docs/gocc);
332 ds->terms[i].idf=idf;
337 /* calculate and store normalized weight (tf-idf) vector */
/* norm_none: weight function without normalization; for each of the veclen
 * terms, wt = tf * idf. tf and idf must already have been filled in by the
 * tf and idf functions. */
338 static void norm_none(void *rsi, void *dsi) {
341 /* no normalization */
343 for (i=0; i < veclen; i++) {
344 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
/* norm_sum: sum normalization. First pass: wt = tf * idf for every term,
 * accumulating the sum of the weights in tfs. Second pass: every wt is
 * divided by tfs.
 * NOTE(review): the trailing "else" comment implies the second pass is
 * guarded against tfs == 0, but the guard itself (listing lines 358-359) and
 * the initialisation of tfs are not shown here - confirm. */
349 static void norm_sum(void *rsi, void *dsi) {
355 for (i=0; i < veclen; i++) {
356 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
357 tfs+=ds->terms[i].wt;
360 for (i=0; i < veclen; i++) {
361 ds->terms[i].wt=ds->terms[i].wt/tfs;
363 /* else: tfs==0 && ds->terms[i].wt==0 */
/* norm_cosine: cosine (vector length) normalization. First pass:
 * wt = tf * idf for every term, accumulating the sum of squared weights in
 * tfs. Second pass: every wt is divided by tfs.
 * NOTE(review): listing lines 376-378 between the two passes are not shown;
 * presumably tfs is replaced by its square root and checked against zero
 * there (the trailing "else" comment implies a guard) - confirm. */
367 static void norm_cosine(void *rsi, void *dsi) {
373 for (i=0; i < veclen; i++) {
374 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
375 tfs+=(ds->terms[i].wt*ds->terms[i].wt);
379 for (i=0; i < veclen; i++) {
380 ds->terms[i].wt=ds->terms[i].wt/tfs;
382 /* else: tfs==0 && ds->terms[i].wt==0 */
/* norm_fourth: normalization variant. First pass: wt = tf * idf for every
 * term, with fr holding the squared weight. Second pass: every wt is divided
 * by tfs.
 * NOTE(review): how fr is folded into tfs (listing lines 395-398) is not
 * shown here; going by the name it presumably accumulates fr * fr, i.e. the
 * fourth power of the weight - confirm. The guard implied by the trailing
 * "else" comment is likewise not visible. */
386 static void norm_fourth(void *rsi, void *dsi) {
392 for (i=0; i < veclen; i++) {
393 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
394 fr=(ds->terms[i].wt*ds->terms[i].wt);
399 for (i=0; i < veclen; i++) {
400 ds->terms[i].wt=ds->terms[i].wt/tfs;
402 /* else: tfs==0 && ds->terms[i].wt==0 */
/* norm_max: maximum normalization. First pass: wt = tf * idf for every term,
 * comparing each weight against the running maximum tfm. Second pass: every
 * wt is divided by tfm.
 * NOTE(review): the statement executed when wt > tfm (listing line 415) and
 * the zero guard before the second pass are not shown here - confirm.
 * NOTE(review): the trailing comment mentions tfs, but the visible code of
 * this function only uses tfm; the comment looks copied from the sibling
 * functions. */
406 static void norm_max(void *rsi, void *dsi) {
412 for (i=0; i < veclen; i++) {
413 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
414 if (ds->terms[i].wt > tfm)
418 for (i=0; i < veclen; i++) {
419 ds->terms[i].wt=ds->terms[i].wt/tfm;
421 /* else: tfs==0 && ds->terms[i].wt==0 */
425 /* add: norm_pivot, ... */
/* sim_cosine: similarity function installed as rs->sim_fct; takes two
 * untyped handles to term vectors and returns a double.
 * Only ds1->veclen is read for the loop bound, so both vectors are assumed
 * to have the same length. The divisor sdiv is sqrt(sqr11 * sqr22).
 * NOTE(review): the loop body that fills smul, sqr11 and sqr22 (listing
 * lines 436-441) and the final division and return (443 onward) are not
 * shown here; whether sdiv == 0 is guarded cannot be seen - confirm. */
427 static double sim_cosine(void *dsi1, void *dsi2) {
431 double smul=0.0, sdiv=0.0, sqr11=0.0, sqr22=0.0;
434 veclen=ds1->veclen; /* and ds2->veclen */
435 for (i=0; i < veclen; i++) {
442 sdiv=sqrt(sqr11*sqr22);
448 /* add: norm_jaccard, norm_dice, ... */
450 /* end weighting functions */
/* zv_init_scheme: installs the weighting and similarity function pointers in
 * rs according to the scheme name sname (format like "ntc-atn").
 *
 * The characters at positions 0-2 and 4-6 of sname are read into c0..c2 and
 * c4..c6; any position beyond the end of sname falls back to the character
 * at the same position of the default scheme "ntc-atn". Per the scheme
 * description at the top of the file, the first triple names the document
 * weighting and the second the query weighting.
 *
 * NOTE(review): the switch/case lines that map each character to a function
 * are not shown in this listing (the embedded numbering has gaps throughout),
 * and neither is the computation of slen, the condition around the
 * "invalid weighting-scheme" message, the assignment of c3 (listing line
 * 466), or the code that stores the resulting name in rs->rscheme. The
 * letter-to-function mapping must be taken from the full file. */
454 static void zv_init_scheme(RS rs, const char *sname) {
456 char c0, c1, c2, c3, c4, c5, c6;
457 const char *def_rscheme="ntc-atn"; /* a good default */
459 yaz_log(LOG_DEBUG, "zv_init_scheme");
/* Reported at LOG_LOG (not debug) level; processing continues afterwards
 * with per-character defaults. */
462 yaz_log(LOG_LOG, "zvrank: invalid weighting-scheme \"%s\"", sname);
463 if (slen > 0) c0=sname[0]; else c0=def_rscheme[0];
464 if (slen > 1) c1=sname[1]; else c1=def_rscheme[1];
465 if (slen > 2) c2=sname[2]; else c2=def_rscheme[2];
467 if (slen > 4) c4=sname[4]; else c4=def_rscheme[4];
468 if (slen > 5) c5=sname[5]; else c5=def_rscheme[5];
469 if (slen > 6) c6=sname[6]; else c6=def_rscheme[6];
471 /* assign doc functions */
/* Document term-frequency function. */
474 rs->d_tf_fct=tf_binary;
478 rs->d_tf_fct=tf_max_norm;
480 yaz_log(LOG_DEBUG, "tf_max_norm: d_f_max required");
483 rs->d_tf_fct=tf_aug_norm;
485 yaz_log(LOG_DEBUG, "tf_aug_norm: d_f_max required");
488 rs->d_tf_fct=tf_square;
496 rs->d_tf_fct=tf_none;
/* Document idf function. */
501 rs->d_idf_fct=idf_tfidf;
503 yaz_log(LOG_DEBUG, "idf_tfidf: db_docs required");
506 rs->d_idf_fct=idf_prob;
508 yaz_log(LOG_DEBUG, "idf_prob: db_docs required");
511 rs->d_idf_fct=idf_freq;
513 yaz_log(LOG_DEBUG, "idf_freq: db_docs required");
516 rs->d_idf_fct=idf_squared;
518 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
521 rs->d_idf_fct=idf_none;
/* Document normalization function. */
526 rs->d_norm_fct=norm_sum;
530 rs->d_norm_fct=norm_cosine;
534 rs->d_norm_fct=norm_fourth;
538 rs->d_norm_fct=norm_max;
542 rs->d_norm_fct=norm_none;
547 /* assign query functions */
/* Query term-frequency function; the same tf_* implementations are reused
 * for the query vector. */
550 rs->q_tf_fct=tf_binary;
554 rs->q_tf_fct=tf_max_norm;
555 yaz_log(LOG_DEBUG, "tf_max_norm: d_f_max required");
559 rs->q_tf_fct=tf_aug_norm;
561 yaz_log(LOG_DEBUG, "tf_aug_norm: d_f_max required");
564 rs->q_tf_fct=tf_square;
572 rs->q_tf_fct=tf_none;
/* Query idf function. */
577 rs->q_idf_fct=idf_tfidf;
579 yaz_log(LOG_DEBUG, "idf_tfidf: db_docs required");
582 rs->q_idf_fct=idf_prob;
584 yaz_log(LOG_DEBUG, "idf_prob: db_docs required");
587 rs->q_idf_fct=idf_freq;
589 yaz_log(LOG_DEBUG, "idf_freq: db_docs required");
592 rs->q_idf_fct=idf_squared;
594 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
597 rs->q_idf_fct=idf_none;
/* Query normalization function. */
602 rs->q_norm_fct=norm_sum;
606 rs->q_norm_fct=norm_cosine;
610 rs->q_norm_fct=norm_fourth;
614 rs->q_norm_fct=norm_max;
618 rs->q_norm_fct=norm_none;
/* sim_cosine is the only similarity function assigned in the visible code. */
623 rs->sim_fct=sim_cosine;
624 yaz_log(LOG_DEBUG, "zv_scheme %s", rs->rscheme);
/* zv_init: fills the collection statistics of rs and then selects the
 * weighting functions via zv_init_scheme(rs, rscheme).
 * NOTE(review): the statistics are hard-coded placeholders (see the inline
 * "assign correct value here" comments), not values read from the database.
 * The idf functions that use db_docs therefore work from the constant 100000
 * until real values are supplied.
 * db_f_max_str points at a string literal and must not be freed or written
 * through. */
628 static void zv_init(RS rs, const char *rscheme) {
629 yaz_log(LOG_DEBUG, "zv_init");
631 rs->db_docs=100000; /* assign correct value here */
632 rs->db_terms=500000; /* assign correct value here (for debugging) */
633 rs->db_f_max=50; /* assign correct value here */
634 rs->db_f_max_str="a"; /* assign correct value here (for debugging) */
635 zv_init_scheme(rs, rscheme);
642 * zv_create: Creates/Initialises this rank handler. This routine is
643 * called exactly once. The routine returns the class_handle.
/* zv_create: allocates the class-level state (struct rank_class_info) with
 * xmalloc and copies the configured "zvrank.weighting-scheme" resource value
 * into ci->rscheme, at most 8 characters.
 * NOTE(review): wscheme is passed to strlen() with no NULL check in between
 * (listing lines 652 and 653 are consecutive). If res_get can return NULL
 * when the setting is absent, this dereferences NULL - confirm and add a
 * default.
 * NOTE(review): the visible copy loop does not write a terminating NUL, and
 * for a value of 8 or more characters it fills the whole 8-byte buffer.
 * ci->rscheme is later handed to zv_init as a C string. Listing lines
 * 655-657 are not shown, so termination may happen there - confirm.
 * strlen() is re-evaluated on every loop iteration; harmless at this size.
 * NOTE(review): the declarations of i, res and wscheme and the return
 * statement are on listing lines that are not shown here. */
645 static void *zv_create (ZebraHandle zh) {
649 struct rank_class_info *ci = (struct rank_class_info *)
650 xmalloc (sizeof(*ci));
651 yaz_log(LOG_DEBUG, "zv_create");
652 wscheme=res_get(res, "zvrank.weighting-scheme");
653 for (i=0; (i < strlen(wscheme)) && (i < 8); i++)
654 ci->rscheme[i]=wscheme[i];
659 * zv_destroy: Destroys this rank handler. This routine is called
660 * when the handler is no longer needed - i.e. when the server
661 * dies. The class_handle was previously returned by create.
/* zv_destroy: tears down the class-level state. class_handle is the pointer
 * previously returned by zv_create; reg is not used in the visible lines.
 * NOTE(review): the statement that releases ci (listing line 666 onward) is
 * not shown here - confirm it is freed with the allocator matching
 * xmalloc. */
663 static void zv_destroy (struct zebra_register *reg, void *class_handle) {
664 struct rank_class_info *ci = (struct rank_class_info *) class_handle;
665 yaz_log(LOG_DEBUG, "zv_destroy");
671 * zv_begin: Prepares beginning of "real" ranking. Called once for
672 * each result set. The returned handle is a "set handle" and
673 * will be used in each of the handlers below.
/* zv_begin: builds the per-result-set state. Allocates an rs_info, selects
 * the weighting functions for the class's scheme via zv_init, allocates the
 * query vector (qdoc) and the reusable document vector (rdoc) with one term
 * slot per result-set term, seeds both from the result set, and computes the
 * query weights once.
 * reg is not used in the visible lines.
 * NOTE(review): the term arrays are sized with rs->veclen, but the
 * assignment to rs->veclen is not visible (listing lines 685-687 are not
 * shown). rs comes from xmalloc and is not visibly zeroed, so confirm
 * rs->veclen is set from veclen before the first allocation.
 * NOTE(review): the return statement (presumably returning rs as the set
 * handle) is on a listing line that is not shown here. */
675 static void *zv_begin(struct zebra_register *reg, void *class_handle, RSET rset)
677 struct rs_info *rs=(struct rs_info *)xmalloc(sizeof(*rs));
678 struct rank_class_info *ci=(struct rank_class_info *)class_handle;
682 yaz_log(LOG_DEBUG, "zv_begin");
683 veclen=rset->no_rset_terms; /* smaller vector here */
684 zv_init(rs, ci->rscheme);
/* Query vector: one slot per result-set term, each term counted once. */
688 rs->qdoc=(struct ds_info *)xmalloc(sizeof(*rs->qdoc));
689 rs->qdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->qdoc->terms)*rs->veclen);
690 rs->qdoc->veclen=veclen;
691 rs->qdoc->d_f_max=1; /* no duplicates */
692 rs->qdoc->d_f_max_str="";
/* Document vector: reused for every document scored in this result set.
 * d_f_max is a fixed guess, not the document's real maximum frequency, so
 * tf_max_norm and tf_aug_norm are only approximate for documents. */
694 rs->rdoc=(struct ds_info *)xmalloc(sizeof(*rs->rdoc));
695 rs->rdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->rdoc->terms)*rs->veclen);
696 rs->rdoc->veclen=veclen;
697 rs->rdoc->d_f_max=10; /* just a guess */
698 rs->rdoc->d_f_max_str="";
699 /* yaz_log(LOG_DEBUG, "zv_begin_init"); */
700 for (i = 0; i < rs->veclen; i++)
/* gocc for both vectors is taken from the result-set term's nn field. */
702 gocc=rset->rset_terms[i]->nn;
703 /* yaz_log(LOG_DEBUG, "zv_begin_init i=%d gocc=%d", i, gocc); */
704 rs->qdoc->terms[i].gocc=gocc;
705 rs->qdoc->terms[i].locc=1; /* assume query has no duplicate terms */
706 rs->rdoc->terms[i].gocc=gocc;
707 rs->rdoc->terms[i].locc=0;
/* Query weights depend only on the query and collection statistics, so they
 * are computed here rather than per document. */
709 (*rs->q_tf_fct)(rs, rs->qdoc); /* we do this once only */
710 (*rs->q_idf_fct)(rs, rs->qdoc);
711 (*rs->q_norm_fct)(rs, rs->qdoc);
716 * zv_end: Terminates ranking process. Called after a result set
/* zv_end: releases the per-result-set state created by zv_begin. rsi is the
 * set handle; the visible lines free the term arrays of the query and
 * document vectors.
 * NOTE(review): zv_begin also allocates rs->qdoc, rs->rdoc and rs itself.
 * The statements freeing them (listing lines 725 onward) are not shown
 * here - confirm all three are released, otherwise each result set leaks. */
719 static void zv_end (struct zebra_register *reg, void *rsi)
722 yaz_log(LOG_DEBUG, "zv_end");
723 xfree(rs->qdoc->terms);
724 xfree(rs->rdoc->terms);
732 * zv_add: Called for each word occurrence in a result set. This routine
733 * should be as fast as possible. This routine should "incrementally"
/* zv_add: records one occurrence of term i in the document currently being
 * accumulated by incrementing rdoc->terms[i].locc. seqno is not used in the
 * visible lines.
 * NOTE(review): i is used as an array index with no bounds check in the
 * visible lines; the caller is trusted to pass 0 <= i < veclen.
 * The commented-out log line refers to term_index, which is not a name in
 * this function (the parameter is i). */
736 static void zv_add (void *rsi, int seqno, int i) {
738 /* yaz_log(LOG_DEBUG, "zvrank zv_add seqno=%d term_index=%d", seqno, term_index);*/
739 rs->rdoc->terms[i].locc++;
743 * zv_calc: Called for each document in a result. This handler should
744 * produce a score based on previous call(s) to the add handler. The
745 * score should be between 0 and 1000. If score cannot be obtained
746 * -1 should be returned.
/* zv_calc: scores the document whose term counts were accumulated by zv_add.
 * Applies the document tf, idf and normalization functions to rs->rdoc,
 * computes the similarity against the precomputed query vector, and scales
 * it by 1000 into an int score.
 * NOTE(review): listing lines 759-764 are consecutive, so the three
 * weighting calls and the sim_fct call sit inside the for loop although
 * none of them uses i. Each iteration appears to recompute the same
 * whole-vector result, making the work quadratic in veclen - confirm
 * before hoisting them out of the loop.
 * NOTE(review): the score is logged at LOG_LOG level for every document,
 * unlike the LOG_DEBUG tracing elsewhere in this file.
 * NOTE(review): the clamp statement after the score > 1000 test, the reset
 * of the locc counters for the next document, and the return are on listing
 * lines that are not shown here. */
748 static int zv_calc (void *rsi, int sysno)
754 /* yaz_log(LOG_DEBUG, "zv_calc"); */
759 for (i = 0; i < veclen; i++) {
760 /* qdoc weight has already been calculated */
761 (*rs->d_tf_fct)(rs, rs->rdoc);
762 (*rs->d_idf_fct)(rs, rs->rdoc);
763 (*rs->d_norm_fct)(rs, rs->rdoc);
764 dscore=rs->sim_fct(rs->qdoc, rs->rdoc);
/* The double-to-int conversion truncates toward zero. */
766 score = dscore * 1000;
767 yaz_log (LOG_LOG, "sysno=%d score=%d", sysno, score);
768 if (score > 1000) /* should not happen */
774 * Pseudo-meta code with sequence of calls as they occur in a
775 * server. Handlers are prefixed by --:
/* Control structure describing this rank handler.
 * NOTE(review): the initializer (listing lines 792-799) is not shown here;
 * presumably it lists the handler name and the zv_* callbacks defined in
 * this file - confirm against the full file. */
791 static struct rank_control rank_control_vsm = {
/* Non-static pointer to the control structure above; this is the symbol
 * through which the handler is reachable from outside this file. */
801 struct rank_control *rankzv_class = &rank_control_vsm;