2 * $Id: relevance.c,v 1.3 2007-01-03 06:23:44 quinn Exp $
14 int *doc_frequency_vec;
20 // We use this data structure to recognize terms in input records,
21 // and map them to record term vectors for counting.
26 struct word_trie *child;
31 static struct word_trie *create_word_trie_node(NMEM nmem)
33 struct word_trie *res = nmem_malloc(nmem, sizeof(struct word_trie));
35 for (i = 0; i < 26; i++)
37 res->list[i].child = 0;
38 res->list[i].termno = -1;
43 static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term, int num)
46 int c = tolower(*term);
47 if (c < 'a' || c > 'z')
53 n->list[c].termno = num;
56 if (!n->list[c].child)
58 struct word_trie *new = create_word_trie_node(nmem);
59 n->list[c].child = new;
61 word_trie_addterm(nmem, n->list[c].child, term, num);
68 #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1)
70 static int word_trie_match(struct word_trie *t, const char *word, int *skipped)
72 int c = raw_char(tolower(*word));
79 if (!*word || raw_char(*word) < 0)
81 if (t->list[c].termno > 0)
82 return t->list[c].termno;
90 return word_trie_match(t->list[c].child, word, skipped);
99 static struct word_trie *build_word_trie(NMEM nmem, const char **terms)
101 struct word_trie *res = create_word_trie_node(nmem);
105 for (i = 1, p = terms; *p; p++, i++)
106 word_trie_addterm(nmem, res, *p, i);
110 struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs)
112 struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance));
116 for (p = terms, i = 0; *p; p++, i++)
119 res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
120 bzero(res->doc_frequency_vec, res->vec_len * sizeof(int));
122 res->wt = build_word_trie(nmem, terms);
126 void relevance_newrec(struct relevance *r, struct record *rec)
128 if (!rec->term_frequency_vec)
130 rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
131 bzero(rec->term_frequency_vec, r->vec_len * sizeof(int));
136 // FIXME. The definition of a word is crude here.. should support
137 // some form of localization mechanism?
138 void relevance_countwords(struct relevance *r, struct record *head,
139 const char *words, int multiplier)
146 while (*words && (c = raw_char(tolower(*words))) < 0)
151 if ((res = word_trie_match(r->wt, words, &skipped)))
154 head->term_frequency_vec[res] += multiplier;
158 while (*words && (c = raw_char(tolower(*words))) >= 0)
161 head->term_frequency_vec[0]++;
165 void relevance_donerecord(struct relevance *r, struct record *head)
169 for (i = 1; i < r->vec_len; i++)
170 if (head->term_frequency_vec[i] > 0)
171 r->doc_frequency_vec[i]++;
173 r->doc_frequency_vec[0]++;
177 static int comp(const void *p1, const void *p2)
180 struct record **r1 = (struct record **) p1;
181 struct record **r2 = (struct record **) p2;
182 res = (*r2)->relevance - (*r1)->relevance;
191 static int comp(const void *p1, const void *p2)
193 struct record **r1 = (struct record **) p1;
194 struct record **r2 = (struct record **) p2;
195 return (*r2)->relevance - (*r1)->relevance;
199 // Prepare for a relevance-sorted read of up to num entries
200 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
203 float *idfvec = xmalloc(rel->vec_len * sizeof(float));
205 // Calculate document frequency vector for each term.
206 for (i = 1; i < rel->vec_len; i++)
208 if (!rel->doc_frequency_vec[i])
211 idfvec[i] = log((float) rel->doc_frequency_vec[0] / rel->doc_frequency_vec[i]);
213 // Calculate relevance for each document
214 for (i = 0; i < reclist->num_records; i++)
217 struct record *rec = reclist->flatlist[i];
220 for (t = 1; t < rel->vec_len; t++)
223 if (!rec->term_frequency_vec[0])
225 termfreq = (float) rec->term_frequency_vec[t] / rec->term_frequency_vec[0];
226 relevance += termfreq * idfvec[t];
228 rec->relevance = (int) (relevance * 100000);
230 qsort(reclist->flatlist, reclist->num_records, sizeof(struct record*), comp);
231 reclist->pointer = 0;
238 * indent-tabs-mode: nil
240 * vim: shiftwidth=4 tabstop=8 expandtab