2 * $Id: relevance.c,v 1.2 2006-12-20 22:18:33 adam Exp $
14 int *doc_frequency_vec;
20 // We use this data structure to recognize terms in input records,
21 // and map them to record term vectors for counting.
26 struct word_trie *child;
31 static struct word_trie *create_word_trie_node(NMEM nmem)
33 struct word_trie *res = nmem_malloc(nmem, sizeof(struct word_trie));
35 for (i = 0; i < 26; i++)
37 res->list[i].child = 0;
38 res->list[i].termno = -1;
43 static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term, int num)
46 int c = tolower(*term);
47 if (c < 'a' || c > 'z')
53 n->list[c].termno = num;
56 if (!n->list[c].child)
58 struct word_trie *new = create_word_trie_node(nmem);
59 n->list[c].child = new;
61 word_trie_addterm(nmem, n->list[c].child, term, num);
68 #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1)
70 static int word_trie_match(struct word_trie *t, const char *word, int len, int *skipped)
72 int c = raw_char(tolower(*word));
79 if (!len || raw_char(*word) < 0)
81 if (t->list[c].termno > 0)
82 return t->list[c].termno;
90 return word_trie_match(t->list[c].child, word, len, skipped);
99 static struct word_trie *build_word_trie(NMEM nmem, const char **terms)
101 struct word_trie *res = create_word_trie_node(nmem);
105 for (i = 1, p = terms; *p; p++, i++)
106 word_trie_addterm(nmem, res, *p, i);
110 struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs)
112 struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance));
116 for (p = terms, i = 0; *p; p++, i++)
119 res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
120 bzero(res->doc_frequency_vec, res->vec_len * sizeof(int));
122 res->wt = build_word_trie(nmem, terms);
126 void relevance_newrec(struct relevance *r, struct record *rec)
128 if (!rec->term_frequency_vec)
130 rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
131 bzero(rec->term_frequency_vec, r->vec_len * sizeof(int));
136 // FIXME. The definition of a word is crude here.. should support
137 // some form of localization mechanism?
138 void relevance_countwords(struct relevance *r, struct record *head,
139 const char *words, int len, int multiplier)
146 while (len && (c = raw_char(tolower(*words))) < 0)
154 if ((res = word_trie_match(r->wt, words, len, &skipped)))
158 head->term_frequency_vec[res] += multiplier;
162 while (len && (c = raw_char(tolower(*words))) >= 0)
168 head->term_frequency_vec[0]++;
172 void relevance_donerecord(struct relevance *r, struct record *head)
176 for (i = 1; i < r->vec_len; i++)
177 if (head->term_frequency_vec[i] > 0)
178 r->doc_frequency_vec[i]++;
180 r->doc_frequency_vec[0]++;
184 static int comp(const void *p1, const void *p2)
187 struct record **r1 = (struct record **) p1;
188 struct record **r2 = (struct record **) p2;
189 res = (*r2)->relevance - (*r1)->relevance;
198 static int comp(const void *p1, const void *p2)
200 struct record **r1 = (struct record **) p1;
201 struct record **r2 = (struct record **) p2;
202 return (*r2)->relevance - (*r1)->relevance;
206 // Prepare for a relevance-sorted read of up to num entries
207 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
210 float *idfvec = xmalloc(rel->vec_len * sizeof(float));
212 // Calculate document frequency vector for each term.
213 for (i = 1; i < rel->vec_len; i++)
215 if (!rel->doc_frequency_vec[i])
218 idfvec[i] = log((float) rel->doc_frequency_vec[0] / rel->doc_frequency_vec[i]);
220 // Calculate relevance for each document
221 for (i = 0; i < reclist->num_records; i++)
224 struct record *rec = reclist->flatlist[i];
227 for (t = 1; t < rel->vec_len; t++)
230 if (!rec->term_frequency_vec[0])
232 termfreq = (float) rec->term_frequency_vec[t] / rec->term_frequency_vec[0];
233 relevance += termfreq * idfvec[t];
235 rec->relevance = (int) (relevance * 100000);
237 qsort(reclist->flatlist, reclist->num_records, sizeof(struct record*), comp);
238 reclist->pointer = 0;
245 * indent-tabs-mode: nil
247 * vim: shiftwidth=4 tabstop=8 expandtab