src/relevance.c

   1 /* $Id: relevance.c,v 1.11 2007-05-01 05:04:53 quinn Exp $
   2    Copyright (c) 2006-2007, Index Data.
   3
   4 This file is part of Pazpar2.
   5
   6 Pazpar2 is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 2, or (at your option) any later
   9 version.
  10
  11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Pazpar2; see the file LICENSE.  If not, write to the
  18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  19 02111-1307, USA.
  20  */
  21
  22 #include <ctype.h>
  23 #include <math.h>
  24 #include <stdlib.h>
  25
  26 #if HAVE_CONFIG_H
  27 #include <cconfig.h>
  28 #endif
  29
  30 #include "relevance.h"
  31 #include "pazpar2.h"
  32
  33 struct relevance
  34 {
  35     int *doc_frequency_vec;
  36     int vec_len;
  37     struct word_trie *wt;
  38     NMEM nmem;
  39 };
  40
  41 // We use this data structure to recognize terms in input records,
  42 // and map them to record term vectors for counting.
  43 struct word_trie
  44 {
  45     struct
  46     {
  47         struct word_trie *child;
  48         int termno;
  49     } list[26];
  50 };
  51
  52 static struct word_trie *create_word_trie_node(NMEM nmem)
  53 {
  54     struct word_trie *res = nmem_malloc(nmem, sizeof(struct word_trie));
  55     int i;
  56     for (i = 0; i < 26; i++)
  57     {
  58         res->list[i].child = 0;
  59         res->list[i].termno = -1;
  60     }
  61     return res;
  62 }
  63
  64 static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term, int num)
  65 {
  66
  67     while (*term) {
  68         int c = tolower(*term);
  69         if (c < 'a' || c > 'z')
  70             term++;
  71         else
  72         {
  73             c -= 'a';
  74             if (!*(++term))
  75                 n->list[c].termno = num;
  76             else
  77             {
  78                 if (!n->list[c].child)
  79                 {
  80                     struct word_trie *new = create_word_trie_node(nmem);
  81                     n->list[c].child = new;
  82                 }
  83                 word_trie_addterm(nmem, n->list[c].child, term, num);
  84             }
  85             break;
  86         }
  87     }
  88 }
  89
  90 #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1)
  91
  92 static int word_trie_match(struct word_trie *t, const char *word, int *skipped)
  93 {
  94     int c = raw_char(tolower(*word));
  95
  96     if (!*word)
  97         return 0;
  98
  99     word++;
 100     (*skipped)++;
 101     if (!*word || raw_char(*word) < 0)
 102     {
 103         if (t->list[c].termno > 0)
 104             return t->list[c].termno;
 105         else
 106             return 0;
 107     }
 108     else
 109     {
 110         if (t->list[c].child)
 111         {
 112             return word_trie_match(t->list[c].child, word, skipped);
 113         }
 114         else
 115             return 0;
 116     }
 117
 118 }
 119
 120
 121 static struct word_trie *build_word_trie(NMEM nmem, const char **terms)
 122 {
 123     struct word_trie *res = create_word_trie_node(nmem);
 124     const char **p;
 125     int i;
 126
 127     for (i = 1, p = terms; *p; p++, i++)
 128         word_trie_addterm(nmem, res, *p, i);
 129     return res;
 130 }
 131
 132 struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs)
 133 {
 134     struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance));
 135     const char **p;
 136     int i;
 137
 138     for (p = terms, i = 0; *p; p++, i++)
 139         ;
 140     res->vec_len = ++i;
 141     res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
 142     memset(res->doc_frequency_vec, 0, res->vec_len * sizeof(int));
 143     res->nmem = nmem;
 144     res->wt = build_word_trie(nmem, terms);
 145     return res;
 146 }
 147
 148 void relevance_newrec(struct relevance *r, struct record_cluster *rec)
 149 {
 150     if (!rec->term_frequency_vec)
 151     {
 152         rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
 153         memset(rec->term_frequency_vec, 0, r->vec_len * sizeof(int));
 154     }
 155 }
 156
 157
 158 // FIXME. The definition of a word is crude here.. should support
 159 // some form of localization mechanism?
 160 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
 161         const char *words, int multiplier)
 162 {
 163     while (*words)
 164     {
 165         char c;
 166         int res;
 167         int skipped;
 168         while (*words && (c = raw_char(tolower(*words))) < 0)
 169             words++;
 170         if (!*words)
 171             return;
 172         skipped = 0;
 173         if ((res = word_trie_match(r->wt, words, &skipped)))
 174         {
 175             words += skipped;
 176             cluster->term_frequency_vec[res] += multiplier;
 177         }
 178         else
 179         {
 180             while (*words && (c = raw_char(tolower(*words))) >= 0)
 181                 words++;
 182         }
 183         cluster->term_frequency_vec[0]++;
 184     }
 185 }
 186
 187 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
 188 {
 189     int i;
 190
 191     for (i = 1; i < r->vec_len; i++)
 192         if (cluster->term_frequency_vec[i] > 0)
 193             r->doc_frequency_vec[i]++;
 194
 195     r->doc_frequency_vec[0]++;
 196 }
 197
 198 #ifdef GAGA
 199 #ifdef FLOAT_REL
 200 static int comp(const void *p1, const void *p2)
 201 {
 202     float res;
 203     struct record **r1 = (struct record **) p1;
 204     struct record **r2 = (struct record **) p2;
 205     res = (*r2)->relevance - (*r1)->relevance;
 206     if (res > 0)
 207         return 1;
 208     else if (res < 0)
 209         return -1;
 210     else
 211         return 0;
 212 }
 213 #else
 214 static int comp(const void *p1, const void *p2)
 215 {
 216     struct record_cluster **r1 = (struct record_cluster **) p1;
 217     struct record_cluster **r2 = (struct record_cluster **) p2;
 218     return (*r2)->relevance - (*r1)->relevance;
 219 }
 220 #endif
 221 #endif
 222
 223 // Prepare for a relevance-sorted read
 224 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
 225 {
 226     int i;
 227     float *idfvec = xmalloc(rel->vec_len * sizeof(float));
 228
 229     // Calculate document frequency vector for each term.
 230     for (i = 1; i < rel->vec_len; i++)
 231     {
 232         if (!rel->doc_frequency_vec[i])
 233             idfvec[i] = 0;
 234         else
 235         {
 236             // This conditional may be terribly wrong
 237             // It was there to address the situation where vec[0] == vec[i]
 238             // which leads to idfvec[i] == 0... not sure about this
 239             // Traditional TF-IDF may assume that a word that occurs in every
 240             // record is irrelevant, but this is actually something we will
 241             // see a lot
 242             if ((idfvec[i] = log((float) rel->doc_frequency_vec[0] /
 243                             rel->doc_frequency_vec[i])) < 0.0000001)
 244                 idfvec[i] = 1;
 245         }
 246     }
 247     // Calculate relevance for each document
 248     for (i = 0; i < reclist->num_records; i++)
 249     {
 250         int t;
 251         struct record_cluster *rec = reclist->flatlist[i];
 252         float relevance;
 253         relevance = 0;
 254         for (t = 1; t < rel->vec_len; t++)
 255         {
 256             float termfreq;
 257             if (!rec->term_frequency_vec[0])
 258                 break;
 259             termfreq = (float) rec->term_frequency_vec[t] / rec->term_frequency_vec[0];
 260             relevance += termfreq * idfvec[t];
 261         }
 262         rec->relevance = (int) (relevance * 100000);
 263     }
 264 #ifdef GAGA
 265     qsort(reclist->flatlist, reclist->num_records, sizeof(struct record*), comp);
 266 #endif
 267     reclist->pointer = 0;
 268     xfree(idfvec);
 269 }
 270
 271 /*
 272  * Local variables:
 273  * c-basic-offset: 4
 274  * indent-tabs-mode: nil
 275  * End:
 276  * vim: shiftwidth=4 tabstop=8 expandtab
 277  */