2 * Copyright (C) 1994-1999, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.21 1999-05-26 07:49:12 adam
10 * Revision 1.20 1999/05/15 14:36:37 adam
11 * Updated dictionary. Implemented "compression" of dictionary.
13 * Revision 1.19 1999/02/02 14:50:22 adam
14 * Updated WIN32 code specific sections. Changed header.
16 * Revision 1.18 1998/03/05 08:17:24 adam
17 * Added a few comments - no code changed.
19 * Revision 1.17 1996/05/14 15:49:09 adam
20 * Bug fix: In function split_page. In rare cases variable best_indxp was
23 * Revision 1.16 1996/02/02 13:43:50 adam
24 * The public functions simply use char instead of Dict_char to represent
25 * search strings. Dict_char is used internally only.
27 * Revision 1.15 1996/02/01 20:39:59 adam
28 * Bug fix: insert didn't work on 8-bit characters due to unsigned char
29 * compares in dict_strcmp (strcmp) and signed Dict_char. Dict_char is
32 * Revision 1.14 1995/12/07 11:48:56 adam
33 * Insert operation obeys DICT_type = 1 (slack in page).
34 * Function dict_open exists if page size or magic aren't right.
36 * Revision 1.13 1995/11/28 09:06:37 adam
37 * Fixed potential dangling pointer.
39 * Revision 1.12 1995/09/06 10:34:44 adam
40 * Memcpy in clean_page edited to satisfy checkergcc.
42 * Revision 1.11 1995/09/04 12:33:31 adam
43 * Various cleanup. YAZ util used instead.
45 * Revision 1.10 1994/10/05 12:16:48 adam
46 * Pagesize is a resource now.
48 * Revision 1.9 1994/09/16 15:39:13 adam
49 * Initial code of lookup - not tested yet.
51 * Revision 1.8 1994/09/16 12:35:01 adam
52 * New version of split_page which use clean_page for splitting.
54 * Revision 1.7 1994/09/12 08:06:42 adam
55 * Futher development of insert.c
57 * Revision 1.6 1994/09/06 13:05:15 adam
58 * Further development of insertion. Some special cases are
59 * not properly handled yet! assert(0) are put here. The
60 * binary search in each page definitely reduce usr CPU.
62 * Revision 1.5 1994/09/01 17:49:39 adam
63 * Removed stupid line. Work on insertion in dictionary. Not finished yet.
65 * Revision 1.4 1994/09/01 17:44:09 adam
66 * depend include change.
68 * Revision 1.3 1994/08/18 12:40:56 adam
69 * Some development of dictionary. Not finished at all!
71 * Revision 1.2 1994/08/17 13:32:19 adam
72 * Use cache in dict - not in bfile.
74 * Revision 1.1 1994/08/16 16:26:48 adam
88 static int dict_ins (Dict dict, const Dict_char *str,
89 Dict_ptr back_ptr, int userlen, void *userinfo);
90 static void clean_page (Dict dict, Dict_ptr ptr, void *p, Dict_char *out,
91 Dict_ptr subptr, char *userinfo);
94 static Dict_ptr new_page (Dict dict, Dict_ptr back_ptr, void **pp)
97 Dict_ptr ptr = dict->head.last;
98 if (!dict->head.freelist)
100 dict_bf_newp (dict->dbf, dict->head.last, &p, dict->head.page_size);
105 ptr = dict->head.freelist;
106 dict_bf_readp (dict->dbf, ptr, &p);
107 dict->head.freelist = DICT_backptr(p);
111 DICT_backptr(p) = back_ptr;
113 DICT_size(p) = DICT_infoffset;
114 DICT_bsize(p) = dict->head.page_size;
120 static int split_page (Dict dict, Dict_ptr ptr, void *p)
126 short *indxp, *best_indxp = NULL;
127 Dict_char best_char = 0;
128 Dict_char prev_char = 0;
129 int best_no = -1, no_current = 1;
131 /* determine splitting char... */
132 indxp = (short*) ((char*) p+DICT_bsize(p)-sizeof(short));
133 for (i = DICT_nodir (p); --i >= 0; --indxp)
135 if (*indxp > 0) /* tail string here! */
139 memcpy (&dc, (char*) p + *indxp, sizeof(dc));
141 { /* first entry met */
142 best_char = prev_char = dc;
146 else if (prev_char == dc)
147 { /* same char prefix. update */
148 if (++no_current > best_no)
149 { /* best entry so far */
150 best_no = no_current;
156 { /* new char prefix. restore */
162 if (best_no < 0) /* we didn't find any tail string entry at all! */
165 j = best_indxp - (short*) p;
166 subptr = new_page (dict, ptr, &subp);
167 /* scan entries to see if there is a string with */
168 /* length 1. info_here indicates if such entry exist */
170 for (i=0; i<best_no; i++, j++)
176 info = (char*) p + ((short*) p)[j];
178 memcpy (&dc, info, sizeof(dc));
179 assert (dc == best_char);
180 slen = 1+dict_strlen((Dict_char*) info);
186 info_here = info+slen*sizeof(Dict_char);
190 info1 = info+slen*sizeof(Dict_char); /* info start */
191 dict_ins (dict, (Dict_char*) (info+sizeof(Dict_char)),
192 subptr, *info1, info1+1);
193 dict_bf_readp (dict->dbf, ptr, &p);
196 /* now clean the page ... */
197 clean_page (dict, ptr, p, &best_char, subptr, info_here);
201 static void clean_page (Dict dict, Dict_ptr ptr, void *p, Dict_char *out,
202 Dict_ptr subptr, char *userinfo)
204 char *np = (char *) xmalloc (dict->head.page_size);
206 short *indxp1, *indxp2;
209 DICT_bsize(np) = dict->head.page_size;
210 indxp1 = (short*) ((char*) p+DICT_bsize(p)-sizeof(short));
211 indxp2 = (short*) ((char*) np+DICT_bsize(np));
212 info2 = (char*) np + DICT_infoffset;
213 for (i = DICT_nodir (p); --i >= 0; --indxp1)
215 if (*indxp1 > 0) /* tail string here! */
217 /* string (Dict_char *) DICT_EOS terminated */
218 /* unsigned char length of information */
219 /* char * information */
221 info1 = (char*) p + *indxp1;
222 if (out && memcmp (out, info1, sizeof(Dict_char)) == 0)
226 *--indxp2 = -(info2 - np);
227 memcpy (info2, &subptr, sizeof(Dict_ptr));
228 info2 += sizeof(Dict_ptr);
229 memcpy (info2, out, sizeof(Dict_char));
230 info2 += sizeof(Dict_char);
233 memcpy (info2, userinfo, *userinfo+1);
234 info2 += *userinfo + 1;
242 *--indxp2 = info2 - np;
243 slen = (dict_strlen((Dict_char*) info1)+1)*sizeof(Dict_char);
244 memcpy (info2, info1, slen);
250 /* Dict_ptr subptr */
251 /* Dict_char sub char */
252 /* unsigned char length of information */
253 /* char * information */
255 assert (*indxp1 < 0);
256 *--indxp2 = -(info2 - np);
257 info1 = (char*) p - *indxp1;
258 memcpy (info2, info1, sizeof(Dict_ptr)+sizeof(Dict_char));
259 info1 += sizeof(Dict_ptr)+sizeof(Dict_char);
260 info2 += sizeof(Dict_ptr)+sizeof(Dict_char);
263 memcpy (info2, info1, slen);
268 memcpy ((char*)p+DICT_infoffset,
269 (char*)np+DICT_infoffset,
270 info2 - ((char*)np+DICT_infoffset));
271 memcpy ((char*)p + ((char*)indxp2 - (char*)np),
273 ((char*) np+DICT_bsize(p)) - (char*)indxp2);
275 memcpy ((char*)p+DICT_infoffset, (char*)np+DICT_infoffset,
276 DICT_pagesize(dict)-DICT_infoffset);
278 DICT_size(p) = info2 - np;
282 dict_bf_touch (dict->dbf, ptr);
287 /* return 0 if new */
288 /* return 1 if before but change of info */
289 /* return 2 if same as before */
291 static int dict_ins (Dict dict, const Dict_char *str,
292 Dict_ptr ptr, int userlen, void *userinfo)
294 int hi, lo, mid, slen, cmp = 1;
299 dict_bf_readp (dict->dbf, ptr, &p);
305 hi = DICT_nodir(p)-1;
306 indxp = (short*) ((char*) p+DICT_bsize(p)-sizeof(short));
312 /* string (Dict_char *) DICT_EOS terminated */
313 /* unsigned char length of information */
314 /* char * information */
315 info = (char*)p + indxp[-mid];
316 cmp = dict_strcmp((Dict_char*) info, str);
319 info += (dict_strlen((Dict_char*) info)+1)*sizeof(Dict_char);
320 /* consider change of userinfo length... */
321 if (*info == userlen)
323 /* change of userinfo ? */
324 if (memcmp (info+1, userinfo, userlen))
326 dict_bf_touch (dict->dbf, ptr);
327 memcpy (info+1, userinfo, userlen);
333 else if (*info > userlen)
335 /* room for new userinfo */
338 dict_bf_touch (dict->dbf, ptr);
339 memcpy (info+1, userinfo, userlen);
350 /* Dict_ptr subptr */
351 /* Dict_char sub char */
352 /* unsigned char length of information */
353 /* char * information */
354 info = (char*)p - indxp[-mid];
355 memcpy (&dc, info+sizeof(Dict_ptr), sizeof(Dict_char));
359 memcpy (&subptr, info, sizeof(Dict_ptr));
360 if (*++str == DICT_EOS)
362 /* finish of string. Store userinfo here... */
364 int xlen = info[sizeof(Dict_ptr)+sizeof(Dict_char)];
367 if (memcmp (info+sizeof(Dict_ptr)+sizeof(Dict_char)+1,
370 dict_bf_touch (dict->dbf, ptr);
371 memcpy (info+sizeof(Dict_ptr)+sizeof(Dict_char)+1,
377 else if (xlen > userlen)
380 info[sizeof(Dict_ptr)+sizeof(Dict_char)] = userlen;
381 memcpy (info+sizeof(Dict_ptr)+sizeof(Dict_char)+1,
383 dict_bf_touch (dict->dbf, ptr);
386 /* xlen < userlen, expanding needed ... */
387 if (DICT_size(p)+sizeof(Dict_char)+sizeof(Dict_ptr)+
389 DICT_bsize(p) - (1+DICT_nodir(p))*sizeof(short))
391 /* not enough room - split needed ... */
392 if (DICT_type(p) == 1)
394 clean_page (dict, ptr, p, NULL, 0, NULL);
395 return dict_ins (dict, str-1, ptr,
398 if (split_page (dict, ptr, p))
400 logf (LOG_FATAL, "Unable to split page %d\n", ptr);
403 return dict_ins (dict, str-1, ptr, userlen, userinfo);
406 { /* enough room - no split needed ... */
407 info = (char*)p + DICT_size(p);
408 memcpy (info, &subptr, sizeof(subptr));
409 memcpy (info+sizeof(Dict_ptr), &dc, sizeof(Dict_char));
410 info[sizeof(Dict_char)+sizeof(Dict_ptr)] = userlen;
411 memcpy (info+sizeof(Dict_char)+sizeof(Dict_ptr)+1,
413 indxp[-mid] = -DICT_size(p);
414 DICT_size(p) += sizeof(Dict_char)+sizeof(Dict_ptr)
417 dict_bf_touch (dict->dbf, ptr);
427 subptr = new_page (dict, ptr, NULL);
428 memcpy (info, &subptr, sizeof(subptr));
429 dict_bf_touch (dict->dbf, ptr);
431 return dict_ins (dict, str, subptr, userlen, userinfo);
441 if (lo>hi && cmp < 0)
443 slen = (dict_strlen(str)+1)*sizeof(Dict_char);
444 if (DICT_size(p)+slen+userlen >=
445 (int)(DICT_bsize(p) - (1+DICT_nodir(p))*sizeof(short)))/* overflow? */
449 clean_page (dict, ptr, p, NULL, 0, NULL);
450 return dict_ins (dict, str, ptr, userlen, userinfo);
452 split_page (dict, ptr, p);
453 return dict_ins (dict, str, ptr, userlen, userinfo);
459 indxp1 = (short*)((char*) p + DICT_bsize(p)
460 - DICT_nodir(p)*sizeof(short));
461 for (; indxp1 != indxp; indxp1++)
462 indxp1[0] = indxp1[1];
464 indxp1 = (short*) ((char*) p+DICT_pagesize(dict)-sizeof(short));
465 for (i = DICT_nodir (p); --i >= 0; --indxp1)
469 info = (char*)p - *indxp1;
470 assert (info[sizeof(Dict_ptr)] > ' ');
477 info = (char*)p + DICT_size(p);
478 memcpy (info, str, slen);
481 memcpy (info, userinfo, userlen);
484 *indxp = DICT_size(p);
485 DICT_size(p) = info- (char*) p;
486 dict_bf_touch (dict->dbf, ptr);
492 int dict_insert (Dict dict, const char *str, int userlen, void *userinfo)
494 if (!dict->head.root)
498 dict->head.root = new_page (dict, 0, &p);
499 if (!dict->head.root)
502 return dict_ins (dict, (const Dict_char *) str, dict->head.root,