3 The University of Liverpool
5 Modifications to Zebra 1.1 / YAZ 1.7 to enable ranking
8 Copyright (c) 2001-2002 The University of Liverpool. All
11 Licensed under the Academic Free License version 1.1.
12 http://opensource.org/licenses/academic.php
14 $Id: livcode.c,v 1.1 2003-03-26 16:41:48 adam Exp $
31 ** These functions/routines
32 ** 1. reads in and builds a linked list of rank attr/rank score pairs
33 ** 2. expand a simple query into a paired list of complex/simple nodes.
38 struct rstype *next_rsnode ;
42 } rsnode, *refrsnode ;
44 refrsnode start_rsnode = NULL ;
47 ** Function/Routine prototypes
49 static int search_for_score( char *rankstr ) ;
50 static char *search_for_rankstr( int rank ) ;
51 static int search_for_rank( int rank ) ;
52 static refrsnode set_rsnode( int rank, int score ) ;
53 static int read_zrank_file(ZebraHandle zh) ;
55 static void convert_simple2complex(ZebraHandle zh, Z_RPNStructure *rpnstruct ) ;
56 static void walk_complex_query(ZebraHandle zh, Z_RPNStructure *rpnstruct ) ;
57 static Z_Complex *expand_query(ZebraHandle zh, Z_Operand *thisop ) ;
58 static Z_Complex *set_1complex_1operand( Z_Complex *comp,Z_Operand *simp ) ;
59 static Z_Complex *set_2operands( Z_Operand *sim1,Z_Operand *sim2 ) ;
60 static Z_Operand *set_operand( Z_Operand *thisop, int newattr ) ;
61 static int check_operand_attrs( Z_Operand *thisop ) ;
65 ** given a rank-string traverse down the linked list ;
66 ** return its score if found otherwise return -1.
68 int search_for_score( char *rankstr )
70 refrsnode node = start_rsnode ;
73 if ( sscanf( rankstr,"%d",&rank ) )
77 if ( node->rank == rank ) return node->score ;
78 node = node->next_rsnode ;
85 ** search_for_rankstr()
86 ** given a rank traverse down the linked list ;
87 ** return its string if found otherwise return NULL.
89 char *search_for_rankstr( int rank )
91 refrsnode node = start_rsnode ;
95 if ( node->rank == rank ) return node->rankstr ;
96 node = node->next_rsnode ;
103 ** given a rank traverse down the linked list ;
104 ** return 1 if found otherwise return 0.
106 int search_for_rank( int rank )
108 refrsnode node = start_rsnode ;
112 if ( node->rank == rank ) return 1 ;
113 node = node->next_rsnode ;
120 ** given a rank and a score, build the rest of the rsnode structure.
122 refrsnode set_rsnode( int rank, int score )
125 refrsnode node = (refrsnode)malloc( sizeof(rsnode) ) ;
128 node->next_rsnode = NULL ;
130 node->score = score ;
132 sprintf( buff,"%d",rank ) ;
133 node->rankstr = (char *)malloc( strlen(buff)+1 ) ;
134 strcpy( node->rankstr, buff ) ;
140 ** read_zrank_file(zh)
141 ** read in the rankfile and build the rank/score linked list ;
142 ** return 0 : can't open the zebra config. file
143 ** return 0 : can't find the rankfile entry in the zebra config. file
144 ** return 0 : can't open the rankfile itself
145 ** return the number of distinct ranks read in.
147 int read_zrank_file(ZebraHandle zh)
150 char line[ LINEMAX ] ;
151 char rname[ LINEMAX ] ;
159 ** open the zebra configuration file and look for the "rankfile:"
160 ** entry which contains the path/name of the rankfile
163 const char *rankfile = res_get_def(zh->res, "rankfile", 0);
164 const char *profilePath = res_get_def(zh->res, "profilePath",
165 DEFAULT_PROFILE_PATH);
169 yaz_log(LOG_LOG, "rankfile entry not found in config file" ) ;
172 ifd = yaz_path_fopen(profilePath, rankfile, "r" ) ;
175 while ( (lineptr = fgets( line,LINEMAX,ifd )) )
177 if ( sscanf( lineptr,"rankfile: %s", rname ) == 1 )
182 ** open the rankfile and read the rank/score pairs
184 ** ignore duplicate ranks
185 ** ignore ranks without +ve scores
189 if ( !(ifd = fopen( rankfile, "r" )) )
191 logf( LOG_LOG, "unable to open rankfile %s",rankfile ) ;
195 while ( (lineptr = fgets( line,LINEMAX,ifd )) )
197 sscanf( lineptr,"%d : %d", &rank,&score ) ;
198 if ( ( score > 0 ) && ( rank != 1016 ) )
200 refrsnode new_rsnode ;
202 if ( search_for_rank( rank ) == 0 )
204 new_rsnode = set_rsnode( rank,score ) ;
205 new_rsnode->next_rsnode = start_rsnode ;
206 start_rsnode = new_rsnode ;
214 yaz_log(LOG_WARN|LOG_ERRNO, "unable to open config file (%s)",
223 ** build an operand "node" - hav to make a complete copy of thisop and
224 ** then insert newattr in the appropriate place
227 Z_Operand *set_operand( Z_Operand *thisop, int newattr )
230 Z_AttributesPlusTerm *attributesplusterm ;
231 Z_AttributeList *attributelist ;
232 Z_AttributeElement *attributeelement ;
233 Z_AttributeElement *attrptr ;
234 Z_AttributeElement **attrptrptr ;
239 operand = (Z_Operand *)
240 malloc( sizeof( Z_Operand ) ) ;
241 attributesplusterm = (Z_AttributesPlusTerm *)
242 malloc( sizeof( Z_AttributesPlusTerm ) ) ;
243 attributelist = (Z_AttributeList *)
244 malloc( sizeof( Z_AttributeList ) ) ;
245 attributeelement = (Z_AttributeElement *)
246 malloc( sizeof( Z_AttributeElement ) ) ;
248 malloc( sizeof( Z_Term ) ) ;
249 general = (Odr_oct *)
250 malloc( sizeof( Odr_oct ) ) ;
252 operand->which = Z_Operand_APT ;
253 operand->u.attributesPlusTerm = attributesplusterm ;
255 attributesplusterm->attributes = attributelist ;
256 attributesplusterm->term = term ;
258 attributelist->num_attributes = thisop->u.attributesPlusTerm->
259 attributes->num_attributes ;
261 attrptr = (Z_AttributeElement *) malloc( sizeof(Z_AttributeElement) *
262 attributelist->num_attributes ) ;
263 attrptrptr = (Z_AttributeElement **) malloc( sizeof(Z_AttributeElement) *
264 attributelist->num_attributes ) ;
266 attributelist->attributes = attrptrptr ;
268 for ( i = 0 ; i < attributelist->num_attributes ; i++ )
270 *attrptr = *thisop->u.attributesPlusTerm->attributes->attributes[i] ;
272 attrptr->attributeType = (int *)malloc( sizeof(int *) ) ;
273 *attrptr->attributeType = *thisop->u.attributesPlusTerm->attributes->
274 attributes[i]->attributeType;
276 attrptr->value.numeric = (int *)malloc( sizeof(int *) ) ;
277 *attrptr->value.numeric = *thisop->u.attributesPlusTerm->attributes->
278 attributes[i]->value.numeric;
280 if ( (*attrptr->attributeType == 1) &&
281 (*attrptr->value.numeric == 1016) )
283 *attrptr->value.numeric = newattr ;
285 *attrptrptr++ = attrptr++ ;
288 term->which = Z_Term_general ;
289 term->u.general = general ;
291 general->len = thisop->u.attributesPlusTerm->term->u.general->len ;
292 general->size = thisop->u.attributesPlusTerm->term->u.general->size ;
293 general->buf = malloc( general->size ) ;
294 strcpy( general->buf,
295 thisop->u.attributesPlusTerm->term->u.general->buf ) ;
302 ** build a complex "node" with two (simple) operand "nodes" as branches
304 Z_Complex *set_2operands( Z_Operand *sim1,Z_Operand *sim2 )
309 Z_Operator *roperator ;
311 top = (Z_Complex *) malloc( sizeof( Z_Complex ) ) ;
312 s1 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ;
313 s2 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ;
314 roperator = (Z_Operator *) malloc( sizeof( Z_Operator ) ) ;
316 top->roperator = roperator ;
317 top->roperator->which = Z_Operator_or ;
318 top->roperator->u.op_or = odr_nullval() ;
321 top->s1->which = Z_RPNStructure_simple ;
322 top->s1->u.simple = sim1 ;
325 top->s2->which = Z_RPNStructure_simple ;
326 top->s2->u.simple = sim2 ;
332 ** set_1complex_1operand()
333 ** build a complex "node" with a complex "node" branch and an
334 ** operand "node" branch
336 Z_Complex *set_1complex_1operand( Z_Complex *comp,Z_Operand *simp )
341 Z_Operator *roperator ;
343 top = (Z_Complex *) malloc( sizeof( Z_Complex ) ) ;
344 s1 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ;
345 s2 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ;
346 roperator = (Z_Operator *) malloc( sizeof( Z_Operator ) ) ;
348 top->roperator = roperator ;
349 top->roperator->which = Z_Operator_or ;
350 top->roperator->u.op_or = odr_nullval() ;
353 top->s1->which = Z_RPNStructure_complex ;
354 top->s1->u.complex = comp ;
357 top->s2->which = Z_RPNStructure_simple ;
358 top->s2->u.simple = simp ;
365 ** expand a simple query into a number of complex queries
367 Z_Complex *expand_query(ZebraHandle zh, Z_Operand *thisop )
373 ** start_rsnode will be set if we have already read the rankfile
374 ** so don't bother again but we need to know the number of attributes
375 ** in the linked list so traverse it again to find out how many.
379 refrsnode node = start_rsnode ;
383 node = node->next_rsnode ;
388 ** only expand the query if there are 2 or more attributes
392 refrsnode node = start_rsnode ;
396 attr1 = node->rank ; node = node->next_rsnode ;
397 attr2 = node->rank ; node = node->next_rsnode ;
400 ** this is the special case and has to be done first because the
401 ** last complex node in the linear list has two simple nodes whereas
402 ** all the others have a complex and a simple.
404 top = set_2operands( set_operand( thisop,attr1 ),
405 set_operand( thisop,attr2 ) ) ;
408 ** do the rest as complex/simple pairs
412 attr1 = node->rank ; node = node->next_rsnode ;
413 top = set_1complex_1operand( top,set_operand( thisop,attr1 ) ) ;
416 ** finally add the 1016 rank attribute at the top of the tree
418 top = set_1complex_1operand( top,set_operand( thisop,1016 ) ) ;
426 ** check_operand_attrs()
427 ** loop through the attributes of a particular operand
428 ** return 1 if (type==1 && value==1016) && (type==2 && value==102)
429 ** otherwise return 0
431 int check_operand_attrs( Z_Operand *thisop )
433 Z_AttributeElement *attrptr ;
439 numattrs = thisop->u.attributesPlusTerm->attributes->num_attributes ;
441 for ( i = 0 ; i < numattrs ; i++ )
443 attrptr = thisop->u.attributesPlusTerm->attributes->attributes[i] ;
445 if ( (*attrptr->attributeType == 1) &&
446 (*attrptr->value.numeric == 1016) )
449 if ( (*attrptr->attributeType == 2) &&
450 (*attrptr->value.numeric == 102) )
454 return (cond1 & cond2) ;
458 ** convert_simple2complex()
461 void convert_simple2complex(ZebraHandle zh, Z_RPNStructure *rpnstruct )
463 Z_Complex *complex = NULL ;
464 Z_Operand *operand = rpnstruct->u.simple ;
466 if ( check_operand_attrs( operand ) )
468 complex = expand_query(zh, operand ) ;
473 ** Everything is complete so replace the original
474 ** operand with the newly built complex structure
475 ** This is it ... no going back!!
477 rpnstruct->which = Z_RPNStructure_complex ;
478 rpnstruct->u.complex = complex ;
484 ** walk_complex_query()
485 ** recursively traverse the tree expanding any simple queries we find
487 void walk_complex_query(ZebraHandle zh, Z_RPNStructure *rpnstruct )
489 if ( rpnstruct->which == Z_RPNStructure_simple )
491 convert_simple2complex(zh, rpnstruct ) ;
495 walk_complex_query(zh, rpnstruct->u.complex->s1 ) ;
496 walk_complex_query(zh, rpnstruct->u.complex->s2 ) ;
500 void zebra_livcode_transform(ZebraHandle zh, Z_RPNQuery *query)
503 ** Got a search request,
504 ** 1. if it is a simple query, see if it suitable for expansion
505 ** i.e. the attributes are of the form ...
506 ** (type==1 && value==1016) && (type==2 && value==102)
508 ** 2. if it is complex, traverse the complex query tree and expand
509 ** any simples simples as above
512 Z_RPNStructure *rpnstruct = query->RPNStructure ;
514 if ( rpnstruct->which == Z_RPNStructure_simple )
516 convert_simple2complex(zh, rpnstruct ) ;
518 else if ( rpnstruct->which == Z_RPNStructure_complex )
520 walk_complex_query(zh, rpnstruct ) ;
526 struct rank_class_info {
530 struct rank_term_info {
537 struct rank_set_info {
541 struct rank_term_info *entries;
544 static int log2_int (unsigned g)
553 * create: Creates/Initialises this rank handler. This routine is
554 * called exactly once. The routine returns the class_handle.
556 static void *create (ZebraHandle zh)
558 struct rank_class_info *ci = (struct rank_class_info *)
559 xmalloc (sizeof(*ci));
561 logf (LOG_DEBUG, "livrank create");
563 read_zrank_file(zh) ;
569 * destroy: Destroys this rank handler. This routine is called
570 * when the handler is no longer needed - i.e. when the server
571 * dies. The class_handle was previously returned by create.
573 static void destroy (struct zebra_register *reg, void *class_handle)
575 struct rank_class_info *ci = (struct rank_class_info *) class_handle;
577 logf (LOG_DEBUG, "livrank destroy");
583 * begin: Prepares beginning of "real" ranking. Called once for
584 * each result set. The returned handle is a "set handle" and
585 * will be used in each of the handlers below.
587 static void *begin (struct zebra_register *reg, void *class_handle, RSET rset)
589 struct rank_set_info *si = (struct rank_set_info *) xmalloc (sizeof(*si));
592 logf (LOG_DEBUG, "livrank begin");
593 si->no_entries = rset->no_rset_terms;
594 si->no_rank_entries = 0;
595 si->entries = (struct rank_term_info *)
596 xmalloc (sizeof(*si->entries)*si->no_entries);
597 for (i = 0; i < si->no_entries; i++)
599 const char *flags = rset->rset_terms[i]->flags;
600 int g = rset->rset_terms[i]->nn;
601 const char *cp = strstr(flags, ",u=");
603 si->entries[i].rank_flag = 1;
606 char *t = search_for_rankstr(atoi(cp+3));
608 si->entries[i].rank_flag = search_for_score(t) ;
610 if ( si->entries[i].rank_flag )
611 (si->no_rank_entries)++;
613 si->entries[i].local_occur = 0;
614 si->entries[i].global_occur = g;
615 si->entries[i].global_inv = 32 - log2_int (g);
616 logf (LOG_DEBUG, "-------- %d ------", 32 - log2_int (g));
622 * end: Terminates ranking process. Called after a result set
625 static void end (struct zebra_register *reg, void *set_handle)
627 struct rank_set_info *si = (struct rank_set_info *) set_handle;
628 logf (LOG_DEBUG, "livrank end");
634 * add: Called for each word occurence in a result set. This routine
635 * should be as fast as possible. This routine should "incrementally"
638 static void add (void *set_handle, int seqno, int term_index)
640 struct rank_set_info *si = (struct rank_set_info *) set_handle;
641 logf (LOG_DEBUG, "rank-1 add seqno=%d term_index=%d", seqno, term_index);
642 si->last_pos = seqno;
643 si->entries[term_index].local_occur++;
647 * calc: Called for each document in a result. This handler should
648 * produce a score based on previous call(s) to the add handler. The
649 * score should be between 0 and 1000. If score cannot be obtained
650 * -1 should be returned.
652 static int calc (void *set_handle, int sysno)
654 int i, lo, divisor, score = 0;
655 struct rank_set_info *si = (struct rank_set_info *) set_handle;
657 logf (LOG_DEBUG, "livrank calc sysno=%d", sysno);
659 if (!si->no_rank_entries)
661 for (i = 0; i < si->no_entries; i++)
663 score += si->entries[i].local_occur * si->entries[i].rank_flag ;
665 for (i = 0; i < si->no_entries; i++)
666 if (si->entries[i].rank_flag && (lo = si->entries[i].local_occur))
667 score += (8+log2_int (lo)) * si->entries[i].global_inv;
669 divisor = si->no_rank_entries * (8+log2_int (si->last_pos/si->no_entries));
670 score = score / divisor;
673 for (i = 0; i < si->no_entries; i++)
674 si->entries[i].local_occur = 0;
679 * Pseudo-meta code with sequence of calls as they occur in a
680 * server. Handlers are prefixed by --:
696 static struct rank_control rank_control = {
706 struct rank_control *rankliv_class = &rank_control;