1 /* This file is part of Pazpar2.
2 Copyright (C) 2006-2009 Index Data
4 Pazpar2 is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26 #include <yaz/yaz-util.h>
28 #include "termlists.h"
29 #include "jenkins_hash.h"
32 // As terms are found in incoming records, they are added to (or updated in) a
33 // Hash table. When term records are updated, a frequency value is updated. At
34 // the same time, a highscore is maintained for the most frequent terms.
36 struct termlist_bucket
38 struct termlist_score term;
39 struct termlist_bucket *next;
44 struct termlist_bucket **hashtable;
47 struct termlist_score **highscore;
55 struct termlist *termlist_create(NMEM nmem, int highscore_size)
57 struct termlist *res = nmem_malloc(nmem, sizeof(struct termlist));
60 nmem_malloc(nmem, res->hash_size * sizeof(struct termlist_bucket*));
61 memset(res->hashtable, 0, res->hash_size * sizeof(struct termlist_bucket*));
64 res->highscore = nmem_malloc(nmem, highscore_size * sizeof(struct termlist_score *));
65 res->highscore_size = highscore_size;
66 res->highscore_num = 0;
67 res->highscore_min = 0;
72 static void update_highscore(struct termlist *tl, struct termlist_score *t)
78 if (tl->highscore_num > tl->highscore_size && t->frequency < tl->highscore_min)
82 for (i = 0; i < tl->highscore_num; i++)
84 if (tl->highscore[i]->frequency < tl->highscore[smallest]->frequency)
86 if (tl->highscore[i] == t)
89 if (tl->highscore_num)
90 tl->highscore_min = tl->highscore[smallest]->frequency;
91 if (t->frequency < tl->highscore_min)
92 tl->highscore_min = t->frequency;
95 if (tl->highscore_num < tl->highscore_size)
97 tl->highscore[tl->highscore_num++] = t;
98 if (t->frequency < tl->highscore_min)
99 tl->highscore_min = t->frequency;
103 if (t->frequency > tl->highscore[smallest]->frequency)
105 tl->highscore[smallest] = t;
110 void termlist_insert(struct termlist *tl, const char *term)
113 struct termlist_bucket **p;
116 if (strlen(term) > 255)
120 for (cp = buf + strlen(buf); cp != buf && strchr(",. -", cp[-1]); cp--)
123 bucket = jenkins_hash((unsigned char *)buf) % tl->hash_size;
124 for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next)
126 if (!strcmp(buf, (*p)->term.term))
128 (*p)->term.frequency++;
129 update_highscore(tl, &((*p)->term));
133 if (!*p) // We made it to the end of the bucket without finding match
135 struct termlist_bucket *new = nmem_malloc(tl->nmem,
136 sizeof(struct termlist_bucket));
137 new->term.term = nmem_strdup(tl->nmem, buf);
138 new->term.frequency = 1;
141 update_highscore(tl, &new->term);
145 static int compare(const void *s1, const void *s2)
147 struct termlist_score **p1 = (struct termlist_score**) s1, **p2 = (struct termlist_score **) s2;
148 return (*p2)->frequency - (*p1)->frequency;
151 struct termlist_score **termlist_highscore(struct termlist *tl, int *len)
153 qsort(tl->highscore, tl->highscore_num, sizeof(struct termlist_score*), compare);
154 *len = tl->highscore_num;
155 return tl->highscore;
161 * c-file-style: "Stroustrup"
162 * indent-tabs-mode: nil
164 * vim: shiftwidth=4 tabstop=8 expandtab