1 /* $Id: zvrank.c,v 1.1 2003-02-27 22:55:40 adam Exp $
2 Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
24 /* Vector Space Model for Zebra */
26 ** six (seven) letter identifier for weighting schema
27 ** best document weighting:
28 ** tfc nfc (tpc npc) [original naming]
29 ** ntc atc npc apc [SMART naming, used here]
30 ** best query weighting:
31 ** nfx tfx bfx (npx tpx bpx) [original naming]
32 ** atn ntn btn apn npn bpn [SMART naming]
35 #include <math.h> /* for log */
47 double blog2(double x) { /* sometimes log_e or log_10 is used */
55 struct rs_info { /* for result set */
56 int db_docs; /* number of documents in database (collection) */
57 int db_terms; /* number of distinct terms in database */
58 int db_f_max; /* maximum of f_t in database */
59 char *db_f_max_str; /* string (most frequent term) */
61 char rschema[8]; /* name of ranking schema */
64 void (*d_tf_fct)(void *, void *); /* doc term frequency function */
65 void (*d_idf_fct)(void *, void *); /* doc idf function */
66 void (*d_norm_fct)(void *, void *); /* doc normalization function */
68 void (*q_tf_fct)(void *, void *); /* query term frequency function */
69 void (*q_idf_fct)(void *, void *); /* query idf function */
70 void (*q_norm_fct)(void *, void *); /* query normalization function */
72 double (*sim_fct)(void *, void *); /* similarity function (scoring function) */
76 typedef struct rs_info *RS;
80 fprintf(stdout, "* RS:\n");
81 fprintf(stdout, " db_docs: %d\n", rs->db_docs);
82 fprintf(stdout, " db_terms: %d\n", rs->db_terms);
83 fprintf(stdout, " f_max: %d\n", rs->db_f_max);
84 fprintf(stdout, " f_max_str: %s\n", rs->db_f_max_str);
85 fprintf(stdout, " veclen: %d\n", rs->veclen);
86 /* rschema implies functions */
87 fprintf(stdout, " rschema: %s\n", rs->rschema);
91 struct ds_info { /* document info */
92 char *docid; /* unique doc identifier */
93 int docno; /* doc number */
94 int doclen; /* document length */
95 int d_f_max; /* maximum number of any term in doc */
96 char *d_f_max_str; /* most frequent term in d */
97 int veclen; /* vector length */
98 struct ts_info *terms;
99 double docsim; /* similarity in [0, ..., 1] (= score/1000) */
101 typedef struct ds_info* DS;
104 fprintf(stdout, " * DS:\n");
105 fprintf(stdout, " docid: %s\n", ds->docid);
106 fprintf(stdout, " docno: %d\n", ds->docno);
107 fprintf(stdout, " doclen: %d\n", ds->doclen);
108 fprintf(stdout, " d_f_max: %d\n", ds->d_f_max);
109 fprintf(stdout, " d_f_max_str:%s\n", ds->d_f_max_str);
110 fprintf(stdout, " veclen: %d\n", ds->veclen);
114 struct ts_info { /* term info */
124 typedef struct ts_info *TS;
127 fprintf(stdout, " * TERM:%s gocc:%d locc:%d tf:%f idf:%f wt:%f\n",
128 ts->name, ts->gocc, ts->locc, ts->tf, ts->idf, ts->wt);
137 ** weighting functions
140 /* calculate new term frequency vector */
141 void tf_none(void *rsi, void *dsi) {
149 for (i=0; i < veclen; i++) {
150 freq=ds->terms[i].locc;
151 ds->terms[i].tf=freq;
156 void tf_binary(void *rsi, void *dsi) {
164 for (i=0; i < veclen; i++) {
165 freq=ds->terms[i].locc;
174 void tf_max_norm(void *rsi, void *dsi) {
184 for (i=0; i < veclen; i++) {
185 freq=ds->terms[i].locc;
188 ds->terms[i].tf=freq/tf_max;
195 void tf_aug_norm(void *rsi, void *dsi) {
207 for (i=0; i < veclen; i++) {
208 freq=ds->terms[i].locc;
211 ds->terms[i].tf=K+(1-K)*(freq/tf_max);
218 void tf_square(void *rsi, void *dsi) {
226 for (i=0; i < veclen; i++) {
227 freq=ds->terms[i].locc;
229 ds->terms[i].tf=freq*freq;
236 void tf_log(void *rsi, void *dsi) {
244 for (i=0; i < veclen; i++) {
245 freq=ds->terms[i].locc;
247 ds->terms[i].tf=1+blog2(freq);
254 /* calculate inverse document frequency vector */
255 void idf_none(void *rsi, void *dsi) {
262 for (i=0; i < veclen; i++) {
263 ds->terms[i].idf=1.0;
268 void idf_tfidf(void *rsi, void *dsi) {
277 num_docs=rs->db_docs;
278 for (i=0; i < veclen; i++) {
279 gocc=ds->terms[i].gocc;
283 idf=blog2(num_docs/gocc);
284 ds->terms[i].idf=idf;
289 void idf_prob(void *rsi, void *dsi) {
298 num_docs=rs->db_docs;
299 for (i=0; i < veclen; i++) {
300 gocc=ds->terms[i].gocc;
304 idf=(num_docs-gocc)/gocc;
305 ds->terms[i].idf=idf;
310 void idf_freq(void *rsi, void *dsi) {
319 num_docs=rs->db_docs;
324 for (i=0; i < veclen; i++) {
325 // gocc=ds->terms[i].gocc;
326 ds->terms[i].idf=idf;
331 void idf_squared(void *rsi, void *dsi) {
340 num_docs=rs->db_docs;
341 for (i=0; i < veclen; i++) {
342 gocc=ds->terms[i].gocc;
346 idf=blog2(num_docs/gocc);
348 ds->terms[i].idf=idf;
353 /* calculate normalized weight (tf-idf) vector */
354 void norm_none(void *rsi, void *dsi) {
360 for (i=0; i < veclen; i++) {
361 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
366 void norm_sum(void *rsi, void *dsi) {
373 for (i=0; i < veclen; i++) {
374 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
375 tfs+=ds->terms[i].wt;
377 for (i=0; i < veclen; i++) {
379 ds->terms[i].wt=ds->terms[i].wt/tfs;
384 void norm_cosine(void *rsi, void *dsi) {
391 for (i=0; i < veclen; i++) {
392 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
393 tfs+=(ds->terms[i].wt*ds->terms[i].wt);
395 for (i=0; i < veclen; i++) {
397 ds->terms[i].wt=ds->terms[i].wt/tfs;
402 void norm_fourth(void *rsi, void *dsi) {
409 for (i=0; i < veclen; i++) {
410 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
411 fr=(ds->terms[i].wt*ds->terms[i].wt);
415 for (i=0; i < veclen; i++) {
417 ds->terms[i].wt=ds->terms[i].wt/tfs;
422 void norm_max(void *rsi, void *dsi) {
429 for (i=0; i < veclen; i++) {
430 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
431 if (ds->terms[i].wt > tfm)
434 for (i=0; i < veclen; i++) {
436 ds->terms[i].wt=ds->terms[i].wt/tfm;
441 /* add: norm_pivot, ... */
443 double sim_cosine(void *dsi1, void *dsi2) {
447 double smul=0.0, sdiv=0.0, sqr11=0.0, sqr22=0.0;
450 veclen=ds1->veclen; /* and ds2->veclen */
451 for (i=0; i < veclen; i++) {
458 sdiv=sqrt(sqr11*sqr22);
464 /* add: norm_jaccard, norm_dice, ... */
466 /* end weighting functions */
470 /* best-fully-weighted */
471 const char* def_rschema="ntc-atn";
474 void zv_init_schema(RS, const char*);
476 void zv_init(RS rs) {
477 char *sname="ntc-atn";/* obtain from configuration file */
478 fprintf(stdout, "zv_init\n");
480 rs->db_docs=100000; /* assign correct value here */
481 rs->db_terms=500000; /* assign correct value here */
482 rs->db_f_max=50; /* assign correct value here */
483 rs->db_f_max_str="a"; /* assign correct value here */
484 zv_init_schema(rs, sname);
488 void zv_init_schema(RS rs, const char *sname) {
490 char c0, c1, c2, c3, c4, c5, c6;
492 fprintf(stdout, "zv_init_schema\n");
494 if (slen>0) c0=sname[0]; else c0=def_rschema[0];
495 if (slen>0) c1=sname[1]; else c0=def_rschema[1];
496 if (slen>0) c2=sname[2]; else c0=def_rschema[2];
498 if (slen>0) c4=sname[4]; else c0=def_rschema[4];
499 if (slen>0) c5=sname[5]; else c0=def_rschema[5];
500 if (slen>0) c6=sname[6]; else c0=def_rschema[6];
502 /* assign doc functions */
505 rs->d_tf_fct=tf_binary;
509 rs->d_tf_fct=tf_max_norm;
513 rs->d_tf_fct=tf_aug_norm;
517 rs->d_tf_fct=tf_square;
525 rs->d_tf_fct=tf_none;
530 rs->d_idf_fct=idf_tfidf;
534 rs->d_idf_fct=idf_prob;
538 rs->d_idf_fct=idf_freq;
542 rs->d_idf_fct=idf_squared;
546 rs->d_idf_fct=idf_none;
551 rs->d_norm_fct=norm_sum;
555 rs->d_norm_fct=norm_cosine;
559 rs->d_norm_fct=norm_fourth;
563 rs->d_norm_fct=norm_max;
567 rs->d_norm_fct=norm_none;
572 /* assign query functions */
575 rs->q_tf_fct=tf_binary;
579 rs->q_tf_fct=tf_max_norm;
583 rs->q_tf_fct=tf_aug_norm;
587 rs->q_tf_fct=tf_square;
595 rs->q_tf_fct=tf_none;
600 rs->q_idf_fct=idf_tfidf;
604 rs->q_idf_fct=idf_prob;
608 rs->q_idf_fct=idf_freq;
612 rs->q_idf_fct=idf_squared;
616 rs->q_idf_fct=idf_none;
621 rs->q_norm_fct=norm_sum;
625 rs->q_norm_fct=norm_cosine;
629 rs->q_norm_fct=norm_fourth;
633 rs->q_norm_fct=norm_max;
637 rs->q_norm_fct=norm_none;
642 rs->sim_fct=sim_cosine;
643 fprintf(stdout, "zv_schema %s\n", rs->rschema);
650 struct rank_class_info { /* where do we need this ? */
655 struct rank_term_info {
663 struct rank_set_info {
667 struct rank_term_info *entries;
673 * zv_create: Creates/Initialises this rank handler. This routine is
674 * called exactly once. The routine returns the class_handle.
676 static void *zv_create (struct zebra_register *reg) {
677 struct rank_class_info *ci = (struct rank_class_info *)
678 xmalloc (sizeof(*ci));
679 fprintf(stdout, "zv_create\n");
680 logf (LOG_DEBUG, "zv_create");
685 * zv_destroy: Destroys this rank handler. This routine is called
686 * when the handler is no longer needed - i.e. when the server
687 * dies. The class_handle was previously returned by create.
689 static void zv_destroy (struct zebra_register *reg, void *class_handle) {
690 struct rank_class_info *ci = (struct rank_class_info *) class_handle;
691 fprintf(stdout, "zv_destroy\n");
692 logf (LOG_DEBUG, "zv_destroy");
698 * zv_begin: Prepares beginning of "real" ranking. Called once for
699 * each result set. The returned handle is a "set handle" and
700 * will be used in each of the handlers below.
702 static void *zv_begin (struct zebra_register *reg, void *class_handle, RSET rset)
704 struct rs_info *rs=(struct rs_info *)xmalloc(sizeof(*rs));
708 logf (LOG_DEBUG, "rank-1 zvbegin");
709 fprintf(stdout, "zv_begin\n");
710 veclen=rset->no_rset_terms; /* smaller vector here */
715 rs->qdoc=(struct ds_info *)xmalloc(sizeof(*rs->qdoc));
716 rs->qdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->qdoc->terms)*rs->veclen);
717 rs->qdoc->veclen=veclen;
719 rs->rdoc=(struct ds_info *)xmalloc(sizeof(*rs->rdoc));
720 rs->rdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->rdoc->terms)*rs->veclen);
721 rs->rdoc->veclen=veclen;
723 si->no_entries = rset->no_rset_terms;
724 si->no_rank_entries = 0;
725 si->entries = (struct rank_term_info *)
726 xmalloc (sizeof(*si->entries)*si->no_entries);
728 /* fprintf(stdout, "zv_begin_init\n"); */
729 for (i = 0; i < rs->veclen; i++)
732 gocc=rset->rset_terms[i]->nn;
733 /* fprintf(stdout, "zv_begin_init i=%d gocc=%d\n", i, gocc); */
734 if (!strncmp (rset->rset_terms[i]->flags, "rank,", 5)) {
735 yaz_log (LOG_LOG, "%s", rset->rset_terms[i]->flags);
736 /*si->entries[i].rank_flag = 1;
737 (si->no_rank_entries)++;
740 /* si->entries[i].rank_flag = 0; */
742 rs->qdoc->terms[i].gocc=gocc;
743 rs->qdoc->terms[i].locc=1; /* assume query has no duplicates */
744 rs->rdoc->terms[i].gocc=gocc;
745 rs->rdoc->terms[i].locc=0;
751 * zv_end: Terminates ranking process. Called after a result set
754 static void zv_end (struct zebra_register *reg, void *rsi)
757 fprintf(stdout, "zv_end\n");
758 logf (LOG_DEBUG, "rank-1 end");
759 xfree(rs->qdoc->terms);
760 xfree(rs->rdoc->terms);
768 * zv_add: Called for each word occurence in a result set. This routine
769 * should be as fast as possible. This routine should "incrementally"
772 static void zv_add (void *rsi, int seqno, int i) {
774 /*logf (LOG_DEBUG, "rank-1 add seqno=%d term_index=%d", seqno, term_index);*/
775 /*si->last_pos = seqno;*/
776 rs->rdoc->terms[i].locc++;
780 * zv_calc: Called for each document in a result. This handler should
781 * produce a score based on previous call(s) to the add handler. The
782 * score should be between 0 and 1000. If score cannot be obtained
783 * -1 should be returned.
785 static int zv_calc (void *rsi, int sysno)
787 int i, veclen; //lo, divisor, score = 0;
791 /* fprintf(stdout, "zv_calc\n"); */
796 for (i = 0; i < veclen; i++) {
797 (*rs->q_tf_fct)(rs, rs->qdoc); /* we should actually do this once */
798 (*rs->q_idf_fct)(rs, rs->qdoc);
799 (*rs->q_norm_fct)(rs, rs->qdoc);
801 (*rs->d_tf_fct)(rs, rs->rdoc);
802 (*rs->d_idf_fct)(rs, rs->rdoc);
803 (*rs->d_norm_fct)(rs, rs->rdoc);
805 dscore=rs->sim_fct(rs->qdoc, rs->rdoc);
807 score = dscore * 1000;
808 yaz_log (LOG_LOG, "sysno=%d score=%d", sysno, score);
812 for (i = 0; i < si->no_entries; i++)
813 si->entries[i].local_occur = 0;
819 * Pseudo-meta code with sequence of calls as they occur in a
820 * server. Handlers are prefixed by --:
836 static struct rank_control rank_control_vsm = {
837 "zvrank", /* "zv_rank", */ /* zvrank */
846 struct rank_control *rankzv_class = &rank_control_vsm;