1 /* $Id: ranksimilarity.c,v 1.2 2006-05-03 13:55:20 marc Exp $
2 Copyright (C) 1995-2005
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
/* Level passed to the yaz_log() calls below; create() assigns it from
   yaz_log_module_level("rank-similarity"). */
36 static int log_level = 0;
/* NOTE(review): presumably guards one-time set-up of log_level -- none of
   the visible lines reads it; confirm against the full create(). */
37 static int log_initialized = 0;
/* Per-class state of this rank handler: create() allocates one with
   xmalloc() and destroy() receives it back as its class_handle. */
39 struct ranksimilarity_class_info {
/* Statistics kept for one query term; begin() fills one record per term. */
43 struct ranksimilarity_term_info {
44 /* frequency of term within document; set back to 0 by ranksimilar_rec_reset() */
46 /* frequency of term within result set; begin() takes it from rset_count() */
47 zint freq_term_resset;
48 /* rank flag is one if term is to be included in ranking (flags start with "rank,") */
50 /* relative ranking weight of term: read after ",w=" in the flags, else 34 (see begin()) */
/* Ranking state of one result set: begin() allocates it with nmem_malloc()
   and add() and calc() cast their set_handle back to it. */
56 struct ranksimilarity_set_info {
/* array with no_entries records, one per query term (allocated in begin()) */
60 struct ranksimilarity_term_info *entries;
65 /* local clean-up function */
/* Sets freq_term_doc of each of the si->no_entries records back to zero;
   begin() and calc() both call it. */
66 static void ranksimilar_rec_reset(struct ranksimilarity_set_info *si)
70 for (i = 0; i < si->no_entries; i++){
71 si->entries[i].freq_term_doc = 0;
77 * create: Creates/Initialises this rank handler. This routine is
78 * called exactly once. The routine returns the class_handle.
80 static void *create (ZebraHandle zh)
82 struct ranksimilarity_class_info *ci =
83 (struct ranksimilarity_class_info *) xmalloc (sizeof(*ci));
/* fetch the log level of the "rank-similarity" module; the yaz_log() calls
   in the handlers below pass it as their level */
87 log_level = yaz_log_module_level("rank-similarity");
90 yaz_log(log_level, "create()");
95 * destroy: Destroys this rank handler. This routine is called
96 * when the handler is no longer needed - i.e. when the server
97 * dies. The class_handle was previously returned by create.
99 static void destroy (struct zebra_register *reg, void *class_handle)
/* class_handle is the pointer that create() returned; view it as the
   class info again */
101 struct ranksimilarity_class_info *ci
102 = (struct ranksimilarity_class_info *) class_handle;
103 yaz_log(log_level, "destroy()");
109 * begin: Prepares beginning of "real" ranking. Called once for
110 * each result set. The returned handle is a "set handle" and
111 * will be used in each of the handlers below.
113 static void *begin (struct zebra_register *reg,
114 void *class_handle, RSET rset, NMEM nmem,
115 TERMID *terms, int numterms)
117 struct ranksimilarity_set_info *si =
118 (struct ranksimilarity_set_info *) nmem_malloc (nmem, sizeof(*si));
121 yaz_log(log_level, "begin() numterms=%d", numterms);
123 si->no_entries = numterms;
124 /* count how many terms are ranked (2=102 or similar) */
125 si->no_rank_entries = 0;
/* one record per query term, allocated from the nmem kept in si */
127 si->entries = (struct ranksimilarity_term_info *)
128 nmem_malloc (si->nmem, sizeof(*si->entries)*numterms);
130 /* start every entry with freq_term_doc == 0 */
131 ranksimilar_rec_reset(si);
134 /* looping all terms in a specific field of query */
135 for (i = 0; i < numterms; i++)
137 struct ord_list *ol = NULL;
140 /* adding to number of rank entries */
/* strncmp() is non-zero when flags does not begin with "rank,", so this
   branch handles the terms that are NOT ranked */
141 if (strncmp (terms[i]->flags, "rank,", 5))
143 si->entries[i].rank_flag = 0;
144 yaz_log(log_level, "begin() terms[%d]: '%s' flags=%s not ranked",
145 i, terms[i]->name, terms[i]->flags);
/* ranked term: look for a ",w=" weight setting; the search starts at
   flags+4, i.e. on the comma that ends "rank" */
149 const char *cp = strstr(terms[i]->flags+4, ",w=");
151 (si->no_rank_entries)++;
153 si->entries[i].rank_flag = 1;
154 si->entries[i].freq_term_resset = rset_count(terms[i]->rset);
/* cp+3 skips the ",w=" prefix so atoi() sees the number */
156 si->entries[i].term_weight = atoi (cp+3);
158 si->entries[i].term_weight = 34; /* default weight; NOTE(review): labelled "sqrroot of 1000", yet sqrt(1000) is about 31.6 -- confirm */
160 yaz_log(log_level, "begin() terms[%d]: '%s' flags=%s",
161 i, terms[i]->name, terms[i]->flags);
163 /* looping indexes where term terms[i] is found */
164 for (; ol; ol = ol->next)
168 const char *string_index = 0;
/* resolve the ordinal into index type, database and set/use (or string
   index) purely for the log lines below */
172 zebraExplain_lookup_ord(reg->zei,
173 ol->ord, &index_type, &db, &set, &use,
178 "begin() index: ord=%d type=%c db=%s str-index=%s",
179 ol->ord, index_type, db, string_index);
182 "begin() index: ord=%d type=%c db=%s set=%d use=%d",
183 ol->ord, index_type, db, set, use);
188 si->entries[i].term = terms[i];
189 si->entries[i].term_index=i;
191 /* let the term point back at its record; add() reads it via term->rankpriv */
192 terms[i]->rankpriv = &(si->entries[i]);
199 * end: Terminates ranking process. Called after a result set
202 static void end (struct zebra_register *reg, void *set_handle)
/* the visible body only logs the call; reg and set_handle are not used here */
204 yaz_log(log_level, "end()");
209 * add: Called for each word occurrence in a result set. This routine
210 * should be as fast as possible. This routine should "incrementally"
213 static void add (void *set_handle, int seqno, TERMID term)
215 struct ranksimilarity_set_info *si
216 = (struct ranksimilarity_set_info *) set_handle;
217 struct ranksimilarity_term_info *ti;
/* presumably the NULL-term branch -- its condition is not among the
   visible lines */
221 yaz_log(log_level, "add() seqno=%d NULL term", seqno);
/* the per-term record that begin() stored in term->rankpriv */
225 ti= (struct ranksimilarity_term_info *) term->rankpriv;
/* remember the sequence number of this occurrence */
227 si->last_pos = seqno;
229 yaz_log(log_level, "add() seqno=%d term=%s freq_term_doc=%d",
230 seqno, term->name, ti->freq_term_doc);
234 * calc: Called for each document in a result. This handler should
235 * produce a score based on previous call(s) to the add handler. The
236 * score should be between 0 and 1000. If score cannot be obtained
237 * -1 should be returned.
239 static int calc (void *set_handle, zint sysno, zint staticrank,
243 struct ranksimilarity_set_info *si
244 = (struct ranksimilarity_set_info *) set_handle;
/* NOTE(review): sysno and staticrank are zint yet logged with %d -- confirm
   zint is int-sized here or switch to the matching format */
247 yaz_log(log_level, "calc() sysno = %d", sysno);
248 yaz_log(log_level, "calc() staticrank = %d", staticrank);
250 yaz_log(log_level, "calc() si->no_entries = %d",
252 yaz_log(log_level, "calc() si->no_rank_entries = %d",
253 si->no_rank_entries);
256 if (!si->no_rank_entries)
257 return -1; /* ranking not enabled for any terms */
260 /* if we set *stop_flag = 1, we stop processing (of result set list) */
263 /* here goes your formula to compute a scoring function */
264 /* you may use all the gathered statistics here */
/* the loop below only logs the gathered per-term statistics */
265 for (i = 0; i < si->no_entries; i++)
/* NOTE(review): entries[i].term is dereferenced with -> below, so this
   passes a pointer to a %d conversion -- confirm and fix the format */
267 yaz_log(log_level, "calc() entries[%d] termid %d",
268 i, si->entries[i].term);
269 if (si->entries[i].term){
270 yaz_log(log_level, "calc() entries[%d] term '%s' flags=%s",
271 i, si->entries[i].term->name, si->entries[i].term->flags);
272 yaz_log(log_level, "calc() entries[%d] rank_flag %d",
273 i, si->entries[i].rank_flag );
274 yaz_log(log_level, "calc() entries[%d] term_weight %d",
275 i, si->entries[i].term_weight );
276 yaz_log(log_level, "calc() entries[%d] freq_term_doc %d",
277 i, si->entries[i].freq_term_doc );
/* NOTE(review): freq_term_resset is declared zint but logged with %d */
278 yaz_log(log_level, "calc() entries[%d] freq_term_resset %d",
279 i, si->entries[i].freq_term_resset );
284 /* reset the per-document counts (freq_term_doc) for the next document */
285 ranksimilar_rec_reset(si);
288 /* staticrank = 0 is highest, MAXINT lowest */
/* NOTE(review): staticrank is a zint; confirm INT_MAX - staticrank cannot
   leave the range of score (its declaration is not among the visible lines) */
289 score = INT_MAX - staticrank; /* but score is reverse (logical) */
292 /* debugging statistics output */
293 yaz_log(log_level, "calc() statistics: score = %d", score);
299 * Pseudo-meta code with sequence of calls as they occur in a
300 * server. Handlers are prefixed by --:
/* Control table of this rank method; made available outside the file
   through the rank_similarity_class pointer. */
316 static struct rank_control rank_control = {
/* Non-static pointer to the file-local rank_control table above. */
326 struct rank_control *rank_similarity_class = &rank_control;