From: Marc Cromme Date: Wed, 3 May 2006 09:31:25 +0000 (+0000) Subject: added new ranking function rank-similar in ranksimilar.c X-Git-Tag: before.bug.529~146 X-Git-Url: http://lists.indexdata.com/cgi-bin?a=commitdiff_plain;h=746f36f2b12f2a342dc0213cb03458359a493c8f;hp=74c151fd60c07fe3af5d05262086a2c97922d1f3;p=idzebra-moved-to-github.git added new ranking function rank-similar in ranksimilar.c moved ranking related declarations from header index.h to header rank.h --- diff --git a/index/Makefile.am b/index/Makefile.am index 0ce522a..c5d7540 100644 --- a/index/Makefile.am +++ b/index/Makefile.am @@ -1,17 +1,25 @@ -## $Id: Makefile.am,v 1.37 2006-05-01 08:27:20 adam Exp $ +## $Id: Makefile.am,v 1.38 2006-05-03 09:31:25 marc Exp $ noinst_PROGRAMS = apitest kdump zebrasrv2 zebraidx2 lib_LTLIBRARIES = libidzebra-api.la libidzebra-all.la libidzebra_all_la_SOURCES = -libidzebra_api_la_SOURCES = dir.c dirs.c trav.c kinput.c kcompare.c \ - attribute.c symtab.c recindex.c recstat.c \ - zebraapi.c api_swig.c \ - zinfo.c invstat.c sortidx.c compact.c zsets.c zrpn.c \ - rank1.c trunc.c retrieve.c extract.c rankstatic.c \ - index.h recindex.h recindxp.h reckeys.c reckeys.h \ - zinfo.h zserver.h zvrank.c limit.c kcontrol.c orddict.c orddict.h +libidzebra_api_la_SOURCES = \ + api_swig.c attribute.c \ + compact.c \ + dir.c dirs.c \ + extract.c \ + index.h invstat.c \ + kinput.c kcompare.c kcontrol.c \ + limit.c \ + orddict.c orddict.h \ + rank.h rank1.c ranksimilarity.c rankstatic.c \ + recindex.c recindex.h recindxp.h reckeys.c reckeys.h recstat.c retrieve.c \ + sortidx.c symtab.c \ + trav.c trunc.c \ + zebraapi.c zinfo.c zinfo.h zserver.h zsets.c zrpn.c \ + zvrank.c bin_PROGRAMS = zebraidx zebrasrv zebrash diff --git a/index/index.h b/index/index.h index 25fb160..82480b5 100644 --- a/index/index.h +++ b/index/index.h @@ -1,4 +1,4 @@ -/* $Id: index.h,v 1.158 2006-04-05 02:11:44 adam Exp $ +/* $Id: index.h,v 1.159 2006-05-03 09:31:26 marc Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -308,23 +308,6 @@ struct zebra_session { struct zebra_limit *m_limit; }; -struct rank_control { - char *name; - void *(*create)(ZebraHandle zh); - void (*destroy)(struct zebra_register *reg, void *class_handle); - void *(*begin)(struct zebra_register *reg, - void *class_handle, RSET rset, NMEM nmem, - TERMID *terms, int numterms); - /* ### Could add parameters to begin: - * char *index; // author, title, etc. - * int dbsize; // number of records in database - * int rssize; // number of records in result set (estimate?) - */ - void (*end)(struct zebra_register *reg, void *set_handle); - int (*calc)(void *set_handle, zint sysno, zint staticrank, - int *stop_flag); - void (*add)(void *set_handle, int seqno, TERMID term); -}; struct term_set_entry { char *term; @@ -401,17 +384,9 @@ typedef struct attent data1_local_attribute *local_attributes; } attent; -void zebraRankInstall (struct zebra_register *reg, struct rank_control *ctrl); -ZebraRankClass zebraRankLookup (ZebraHandle zh, const char *name); -void zebraRankDestroy (struct zebra_register *reg); - int att_getentbyatt(ZebraHandle zh, attent *res, oid_value set, int att, const char *sattr); -extern struct rank_control *rank_1_class; -extern struct rank_control *rank_zv_class; -extern struct rank_control *rank_static_class; - int zebra_record_fetch (ZebraHandle zh, SYSNO sysno, int score, zebra_snippets *hit_snippet, ODR stream, oid_value input_format, Z_RecordComposition *comp, diff --git a/index/rank.h b/index/rank.h new file mode 100644 index 0000000..4ef5aa1 --- /dev/null +++ b/index/rank.h @@ -0,0 +1,63 @@ +/* $Id: rank.h,v 1.1 2006-05-03 09:31:26 marc Exp $ + Copyright (C) 1995-2005 + Index Data ApS + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Zebra; see the file LICENSE.zebra. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. +*/ + +#ifndef RANK_H +#define RANK_H + +#include + +YAZ_BEGIN_CDECL + +struct rank_control { + char *name; + void *(*create)(ZebraHandle zh); + void (*destroy)(struct zebra_register *reg, void *class_handle); + void *(*begin)(struct zebra_register *reg, + void *class_handle, RSET rset, NMEM nmem, + TERMID *terms, int numterms); + /* ### Could add parameters to begin: + * char *index; // author, title, etc. + * int dbsize; // number of records in database + * int rssize; // number of records in result set (estimate?) + */ + void (*end)(struct zebra_register *reg, void *set_handle); + int (*calc)(void *set_handle, zint sysno, zint staticrank, + int *stop_flag); + void (*add)(void *set_handle, int seqno, TERMID term); +}; + +void zebraRankInstall (struct zebra_register *reg, struct rank_control *ctrl); +ZebraRankClass zebraRankLookup (ZebraHandle zh, const char *name); +void zebraRankDestroy (struct zebra_register *reg); + +/* declaring externally defined rank class structures */ +/* remember to install rank classes in zebraapi.c as well!! */ +extern struct rank_control *rank_1_class; +extern struct rank_control *rank_zv_class; +extern struct rank_control *rank_static_class; +extern struct rank_control *rank_similarity_class; + + + +YAZ_END_CDECL + +#endif diff --git a/index/rank1.c b/index/rank1.c index 44220c1..2856b4f 100644 --- a/index/rank1.c +++ b/index/rank1.c @@ -1,4 +1,4 @@ -/* $Id: rank1.c,v 1.27 2005-08-19 11:04:23 adam Exp $ +/* $Id: rank1.c,v 1.28 2006-05-03 09:31:26 marc Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -30,6 +30,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #endif #include "index.h" +#include "rank.h" static int log_level = 0; static int log_initialized = 0; diff --git a/index/ranksimilarity.c b/index/ranksimilarity.c new file mode 100644 index 0000000..8c822b2 --- /dev/null +++ b/index/ranksimilarity.c @@ -0,0 +1,254 @@ +/* $Id: ranksimilarity.c,v 1.1 2006-05-03 09:31:26 marc Exp $ + Copyright (C) 1995-2005 + Index Data ApS + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Zebra; see the file LICENSE.zebra. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. +*/ + +#include +#include +#include +#ifdef WIN32 +#include +#endif +#if HAVE_UNISTD_H +#include +#endif + +#include "index.h" +#include "rank.h" + +static int log_level = 0; +static int log_initialized = 0; + +struct ranksimilarity_class_info { + int dummy; +}; + +struct ranksimilarity_term_info { + int local_occur; + zint global_occur; + int global_inv; + int rank_flag; + int rank_weight; + TERMID term; + int term_index; +}; + +struct ranksimilarity_set_info { + int last_pos; + int no_entries; + int no_rank_entries; + struct ranksimilarity_term_info *entries; + NMEM nmem; +}; + + +/* + * create: Creates/Initialises this rank handler. This routine is + * called exactly once. The routine returns the class_handle. + */ +static void *create (ZebraHandle zh) +{ + struct ranksimilarity_class_info *ci = + (struct ranksimilarity_class_info *) xmalloc (sizeof(*ci)); + + if (!log_initialized) + { + log_level = yaz_log_module_level("ranksimilarity"); + log_initialized = 1; + } + yaz_log(log_level, "create()"); + return 0; +} + +/* + * destroy: Destroys this rank handler. This routine is called + * when the handler is no longer needed - i.e. when the server + * dies. The class_handle was previously returned by create. + */ +static void destroy (struct zebra_register *reg, void *class_handle) +{ + struct ranksimilarity_class_info *ci + = (struct ranksimilarity_class_info *) class_handle; + yaz_log(log_level, "destroy()"); + xfree (ci); +} + + +/** + * begin: Prepares beginning of "real" ranking. Called once for + * each result set. The returned handle is a "set handle" and + * will be used in each of the handlers below. + */ +static void *begin (struct zebra_register *reg, + void *class_handle, RSET rset, NMEM nmem, + TERMID *terms, int numterms) +{ + struct ranksimilarity_set_info *si = + (struct ranksimilarity_set_info *) nmem_malloc (nmem, sizeof(*si)); + int i; + + yaz_log(log_level, "begin()"); + + /* count how many terms are ranked (2=102 or similar) */ + si->no_entries = numterms; + si->no_rank_entries = 0; + si->nmem=nmem; + si->entries = (struct ranksimilarity_term_info *) + nmem_malloc (si->nmem, sizeof(*si->entries)*numterms); + + /* looping all terms in a specific field of query */ + for (i = 0; i < numterms; i++) + { + struct ord_list *ol = terms[i]->ol; + + yaz_log(log_level, "begin() term i=%d flags=%s '%s'", i, + terms[i]->flags, terms[i]->name ); + + for (; ol; ol = ol->next) + { + int index_type = 0; + const char *db = 0; + const char *string_index = 0; + int set = -1; + int use = -1; + + zebraExplain_lookup_ord(reg->zei, + ol->ord, &index_type, &db, &set, &use, + &string_index); + + if (string_index) + yaz_log(log_level, "begin() ord=%d index_type=%c db=%s str-index=%s", + ol->ord, index_type, db, string_index); + else + yaz_log(log_level, "begin() ord=%d index_type=%c db=%s set=%d use=%d", + ol->ord, index_type, db, set, use); + } + if (!strncmp (terms[i]->flags, "rank,", 5)) + (si->no_rank_entries)++; + + /* setting next entry in term */ + terms[i]->rankpriv = &(si->entries[i]); + } + return si; +} + +/* + * end: Terminates ranking process. Called after a result set + * has been ranked. + */ +static void end (struct zebra_register *reg, void *set_handle) +{ + yaz_log(log_level, "end()"); +} + + +/** + * add: Called for each word occurence in a result set. This routine + * should be as fast as possible. This routine should "incrementally" + * update the score. + */ +static void add (void *set_handle, int seqno, TERMID term) +{ + struct ranksimilarity_set_info *si = (struct ranksimilarity_set_info *) set_handle; + struct ranksimilarity_term_info *ti; + assert(si); + if (!term) + { + /* yaz_log(log_level, "add() NULL term"); */ + return; + } + + + ti= (struct ranksimilarity_term_info *) term->rankpriv; + assert(ti); + si->last_pos = seqno; + ti->local_occur++; + /* yaz_log(log_level, "add() seqno=%d term=%s count=%d", + seqno, term->name,ti->local_occur); */ +} + +/* + * calc: Called for each document in a result. This handler should + * produce a score based on previous call(s) to the add handler. The + * score should be between 0 and 1000. If score cannot be obtained + * -1 should be returned. + */ +static int calc (void *set_handle, zint sysno, zint staticrank, + int *stop_flag) +{ + int i, score = 0; + struct ranksimilarity_set_info *si + = (struct ranksimilarity_set_info *) set_handle; + + yaz_log(log_level, "calc()"); + + if (!si->no_rank_entries) + return -1; /* ranking not enabled for any terms */ + + /* here you put in your own score function */ + + + /* reset the counts for the next term */ + for (i = 0; i < si->no_entries; i++) + si->entries[i].local_occur = 0; + + /* if we set *stop_flag = 1, we stop processing (of result set list) */ + /* staticrank = 0 is highest, MAXINT lowest */ + + + /* here goes your formula to compute a scoring function */ + /* you may use all the gathered statistics here */ + + score = INT_MAX - staticrank; /* but score is reverse (logical) */ + + + + return score; +} + +/* + * Pseudo-meta code with sequence of calls as they occur in a + * server. Handlers are prefixed by --: + * + * server init + * -- create + * foreach search + * rank result set + * -- begin + * foreach record + * foreach word + * -- add + * -- calc + * -- end + * -- destroy + * server close + */ + +static struct rank_control rank_control = { + "rank-similarity", + create, + destroy, + begin, + end, + calc, + add, +}; + +struct rank_control *rank_similarity_class = &rank_control; diff --git a/index/rankstatic.c b/index/rankstatic.c index 999aa2b..a937ad7 100644 --- a/index/rankstatic.c +++ b/index/rankstatic.c @@ -1,4 +1,4 @@ -/* $Id: rankstatic.c,v 1.4 2006-03-30 09:52:15 adam Exp $ +/* $Id: rankstatic.c,v 1.5 2006-05-03 09:31:26 marc Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -31,6 +31,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #endif #include "index.h" +#include "rank.h" static int log_level = 0; static int log_initialized = 0; diff --git a/index/zebraapi.c b/index/zebraapi.c index efaf443..1b24783 100644 --- a/index/zebraapi.c +++ b/index/zebraapi.c @@ -1,4 +1,4 @@ -/* $Id: zebraapi.c,v 1.214 2006-04-25 19:37:21 adam Exp $ +/* $Id: zebraapi.c,v 1.215 2006-05-03 09:31:26 marc Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -36,6 +36,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include #include #include "index.h" +#include "rank.h" #include "orddict.h" #include #include @@ -377,8 +378,10 @@ struct zebra_register *zebra_register_open(ZebraService zs, const char *name, reg->key_file_no = 0; reg->ptr_i = 0; + /* installing rank classes */ zebraRankInstall (reg, rank_1_class); zebraRankInstall (reg, rank_zv_class); + zebraRankInstall (reg, rank_similarity_class); zebraRankInstall (reg, rank_static_class); recordCompression = res_get_def (res, "recordCompression", "none"); diff --git a/index/zsets.c b/index/zsets.c index a5b6513..98fd1dc 100644 --- a/index/zsets.c +++ b/index/zsets.c @@ -1,4 +1,4 @@ -/* $Id: zsets.c,v 1.99 2006-01-26 22:17:16 adam Exp $ +/* $Id: zsets.c,v 1.100 2006-05-03 09:31:26 marc Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -30,6 +30,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #endif #include "index.h" +#include "rank.h" #include #include diff --git a/index/zvrank.c b/index/zvrank.c index caa3001..88433bf 100644 --- a/index/zvrank.c +++ b/index/zvrank.c @@ -1,4 +1,4 @@ -/* $Id: zvrank.c,v 1.19 2005-08-19 11:04:23 adam Exp $ +/* $Id: zvrank.c,v 1.20 2006-05-03 09:31:26 marc Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -52,6 +52,7 @@ fernuni-hagen.de> #endif #include "index.h" +#include "rank.h" static int log_level = 0; static int log_initialized = 0;