1 /* $Id: benchindex1.c,v 1.1 2006-12-10 21:00:56 adam Exp $
2 Copyright (C) 1995-2006
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include <yaz/options.h>
25 #include <sys/times.h>
36 #include <yaz/xmalloc.h>
37 #include <yaz/marcdisp.h>
39 #include <idzebra/isamb.h>
40 #include <idzebra/dict.h>
48 struct index_term *terms;
49 struct index_term **ar;
58 struct index_term *next;
/* Allocate a new index block with xmalloc() and give it its own NMEM
 * handle from nmem_create().
 *
 * memory: flush threshold; it is scaled by 1024 * 1024 before being stored
 *         in current_max, so it appears to be given in megabytes.
 */
61 struct index_block *index_block_new(int memory)
63 struct index_block *b = xmalloc(sizeof(*b));
/* NOTE(review): the product is computed in int; with a 32-bit int a value
 * of memory >= 2048 would overflow -- confirm the expected range. */
65 b->current_max = memory * 1024 * 1024;
67 b->nmem = nmem_create();
/* Tear down the index block referenced through bp.  The visible statement
 * releases the block's NMEM handle with nmem_destroy(). */
72 void index_block_destroy(struct index_block **bp)
76 nmem_destroy((*bp)->nmem);
/* qsort() comparator for an array of `struct index_term *` elements (it is
 * passed to qsort() by index_block_flush()).  p1 and p2 point at array
 * slots, so each is dereferenced once to reach the term.
 *
 * Comparison keys, in the order they are examined: term text (strcmp),
 * docid, seqno.  Presumably docid and seqno only act as tie-breakers when
 * the earlier key compares equal -- TODO confirm.
 */
82 static int cmp_ar(const void *p1, const void *p2)
84 struct index_term *t1 = *(struct index_term **) p1;
85 struct index_term *t2 = *(struct index_term **) p2;
86 int d = strcmp(t1->term, t2->term);
/* docid comparison */
90 if (t1->docid > t2->docid)
92 else if (t1->docid < t2->docid)
/* seqno comparison */
94 if (t1->seqno > t2->seqno)
96 else if (t1->seqno < t2->seqno)
/* Item-reader callback: index_block_flush() installs it as
 * isamc_i.read_item with the index block as clientData.
 *
 * vp:  the client data; cast back to the struct index_block.
 * dst: in/out write cursor; one key is copied to *dst and *dst is advanced
 *      by sizeof(key).
 * insertMode: not touched by any of the statements visible here.
 *
 * The entry at b->current_entry of the array b->ar is turned into a key
 * whose mem[0..2] hold word_id, docid and seqno respectively.
 */
102 int code_read(void *vp, char **dst, int *insertMode)
104 struct index_block *b = (struct index_block *)vp;
105 struct index_term *t;
/* all entries have already been handed out */
108 if (b->current_entry >= b->no_entries)
111 t = b->ar[b->current_entry];
115 key.mem[0] = t->word_id;
116 key.mem[1] = t->docid;
117 key.mem[2] = t->seqno;
/* emit the raw key bytes and move the caller's cursor past them */
120 memcpy(*dst, &key, sizeof(key));
122 (*dst) += sizeof(key);
/* logs only the first two key components (word_id and docid) */
125 yaz_log(YLOG_LOG, "returning " ZINT_FORMAT " " ZINT_FORMAT "\n",
126 key.mem[0], key.mem[1]);
/* Flush the postings buffered in index block b.
 *
 * Visible steps:
 *   1. copy the linked list b->terms into the pointer array b->ar and sort
 *      it with cmp_ar;
 *   2. load the word-id counter ("_w") and the ISAMB position ("_i") from
 *      the dictionary;
 *   3. give every entry a word_id: reuse the previous entry's id when the
 *      term text is the same, otherwise look the term up in the dictionary
 *      and insert it with word_id_seq when needed;
 *   4. merge the entries into the ISAMB via isamb_merge(), fed by
 *      code_read();
 *   5. write "_w" and "_i" back to the dictionary, log a summary and print
 *      timing figures.
 *
 * b:    index block holding the buffered terms.
 * isb:  ISAMB handle that receives the postings.
 * dict: dictionary mapping terms to word ids; it also stores the "_w" and
 *       "_i" bookkeeping entries.
 */
131 void index_block_flush(struct index_block *b, ISAMB isb, Dict dict,
134 struct index_term *t = b->terms;
137 int no_words = 0, no_new_words = 0;
138 const char *dict_info = 0;
143 struct tms tms1, tms2;
144 struct timeval start_time, end_time;
/* wall-clock start of the flush */
147 gettimeofday(&start_time, 0);
/* one array slot per buffered entry; the loop walks the list via t->next */
151 b->ar = xmalloc(sizeof(*b->ar) * b->no_entries);
152 for (i = 0; i < b->no_entries; i++, t = t->next)
159 qsort(b->ar, b->no_entries, sizeof(*b->ar), cmp_ar);
/* dump of the sorted entries.
 * NOTE(review): this refers to `ar`, not `b->ar` as the rest of the
 * function does; presumably it sits in disabled debug code -- confirm. */
161 for (i = 0; i < b->no_entries; i++)
163 printf("%s " ZINT_FORMAT " " ZINT_FORMAT "\n",
164 ar[i]->term, ar[i]->docid, ar[i]->seqno);
/* dictionary values are read as a length byte followed by the payload:
 * the assert checks the first byte, the memcpy starts at dict_info+1 */
167 dict_info = dict_lookup(dict, "_w");
170 assert(*dict_info == sizeof(word_id_seq));
171 memcpy(&word_id_seq, dict_info+1, sizeof(word_id_seq));
/* same layout for the stored ISAMB position */
174 dict_info = dict_lookup(dict, "_i");
177 assert(*dict_info == sizeof(isamc_p));
178 memcpy(&isamc_p, dict_info+1, sizeof(isamc_p));
/* assign a word_id to every entry; the array is sorted, so equal terms are
 * adjacent and can copy the id of their predecessor */
181 for (i = 0; i < b->no_entries; i++)
183 if (i > 0 && strcmp(b->ar[i-1]->term, b->ar[i]->term) == 0)
184 b->ar[i]->word_id = b->ar[i-1]->word_id;
/* first occurrence of this term in the block: consult the dictionary */
187 const char *dict_info = dict_lookup(dict, b->ar[i]->term);
/* NOTE(review): copies sizeof(int) bytes; assumes word_id is int-sized --
 * TODO confirm against the struct declaration */
190 memcpy(&b->ar[i]->word_id, dict_info+1, sizeof(int));
/* register the term with word_id_seq as its id.
 * NOTE(review): stores sizeof(int) bytes of word_id_seq, whereas "_w" is
 * stored with sizeof(word_id_seq) -- confirm both sizes agree */
196 dict_insert(dict, b->ar[i]->term, sizeof(int), &word_id_seq);
197 b->ar[i]->word_id = word_id_seq;
/* persist the word-id counter */
202 dict_insert(dict, "_w", sizeof(word_id_seq), &word_id_seq);
/* rewind the cursor that code_read() advances through b->ar */
204 b->current_entry = 0;
210 isamc_i.clientData = b;
211 isamc_i.read_item = code_read;
213 isamb_merge (isb, &isamc_p, &isamc_i);
/* persist the ISAMB position after the merge */
216 dict_insert(dict, "_i", sizeof(isamc_p), &isamc_p);
219 yaz_log(YLOG_LOG, "Flushed %d postings, %d/%d words, %d records",
220 b->no_entries, no_words, no_new_words, no_docs);
/* wall-clock end of the flush; usec is the elapsed time in microseconds */
230 gettimeofday(&end_time, 0);
233 usec = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
234 end_time.tv_usec - start_time.tv_usec;
/* NOTE(review): the user/system tick differences are divided by a literal
 * 100; struct tms values are in clock ticks, whose rate is
 * sysconf(_SC_CLK_TCK) -- confirm 100 is right for the target platform */
236 printf("%3d %8.6f %5.2f %5.2f\n",
239 (double) (tms2.tms_utime - tms1.tms_utime)/100,
240 (double) (tms2.tms_stime - tms1.tms_stime)/100);
/* Compare the memory used by the block's NMEM (nmem_total) with the
 * configured limit b->current_max, then log both values and flush the
 * block with index_block_flush().  Presumably the flush happens only when
 * total exceeds max -- TODO confirm.
 *
 * NOTE(review): nmem_total()'s result is stored in an int here; confirm
 * that this cannot truncate for large blocks.
 */
246 void index_block_check_flush(struct index_block *b, ISAMB isb, Dict dict,
249 int total = nmem_total(b->nmem);
250 int max = b->current_max;
253 yaz_log(YLOG_LOG, "flush to disk total=%d max=%d", total, max);
254 index_block_flush(b, isb, dict, no_docs);
/* Buffer one posting in index block b.
 *
 * A new struct index_term is allocated from the block's NMEM and the term
 * text is duplicated into the same NMEM, so the caller's `term` buffer is
 * not retained.
 *
 * term:  NUL-terminated term text.
 * docid: document id for the posting.
 * seqno: sequence number for the posting.
 */
258 void index_block_add(struct index_block *b,
259 const char *term, zint docid, zint seqno)
261 struct index_term *t = nmem_malloc(b->nmem, sizeof(*t));
262 t->term = nmem_strdup(b->nmem, term);
/* Print the command-line usage summary to stderr.
 *
 * NOTE(review): the usage text advertises "-z sz", but main() parses its
 * arguments with the option string "im:" -- confirm which one is current.
 */
270 void exit_usage(void)
272 fprintf(stderr, "benchindex1 [-z sz]\n");
/* Record a single term occurrence: the term is handed to index_block_add()
 * together with docid and the current value of *seqno.
 *
 * seqno: passed by pointer; presumably it is advanced here after the term
 *        has been added -- TODO confirm.
 */
276 void index_term(struct index_block *b, const char *term,
277 zint docid, zint *seqno)
/* prints the term being indexed (presumably debug output -- confirm) */
280 printf("%s " ZINT_FORMAT " " ZINT_FORMAT "\n", term,
283 index_block_add(b, term, docid, *seqno);
/* Tokenize the text held in wrbuf and index every token for document docid
 * via index_term().
 *
 * The buffer is scanned character by character through cp.  Visible rules:
 *   - a loop consumes up to six leading characters (stopping early at NUL);
 *   - a '$' followed by at least one more character is handled as its own
 *     case;
 *   - any of the characters $/-;,.:[]"&(){} or a space is handled as a
 *     separate case (presumably a token boundary -- confirm);
 *   - any other character is lower-cased and appended to the term buffer
 *     while there is room.
 */
287 void index_wrbuf(struct index_block *b, WRBUF wrbuf, zint docid)
290 const char *cp = wrbuf_buf(wrbuf);
/* skip/collect at most six characters */
300 for (i = 0; i<6 && *cp; i++, cp++)
308 index_term(b, term, docid, &seqno);
/* '$' with a following character */
314 else if (*cp == '$' && cp[1])
318 index_term(b, term, docid, &seqno);
/* punctuation or space */
323 else if (strchr("$/-;,.:[]\"&(){} ", *cp))
327 index_term(b, term, docid, &seqno);
/* ordinary character: read as unsigned char so tolower() receives a valid
 * argument even for bytes >= 0x80 */
334 unsigned ch = *(const unsigned char *)cp;
/* NOTE(review): the guard allows sz == sizeof(term) - 1; confirm there is
 * still room for the terminating NUL */
335 if (sz < sizeof(term))
337 term[sz] = tolower(ch);
345 index_term(b, term, docid, &seqno);
/* Read ISO2709 (MARC) records from a stream and index them.
 *
 * For each record: the 5-byte length field is read and validated, the rest
 * of the record is read, the record is decoded with
 * yaz_marc_read_iso2709(), rendered in line format into a WRBUF and
 * tokenized with index_wrbuf().  index_block_check_flush() is called along
 * the way, and a final index_block_flush() writes whatever is left.
 *
 * The running document counter is loaded from, and written back to, the
 * dictionary entry "_s".
 *
 * isb:          ISAMB handle receiving the postings.
 * verbose,
 * print_offset: enable the diagnostic printf() output.
 */
348 void index_marc_from_file(ISAMB isb,
352 int verbose, int print_offset)
354 yaz_marc_t mt = yaz_marc_create();
355 WRBUF wrbuf = wrbuf_alloc();
356 struct index_block *b = index_block_new(memory);
357 const char *dict_info = 0;
/* restore the document counter; the stored value is a length byte followed
 * by the payload */
361 dict_info = dict_lookup(dict, "_s");
364 assert(*dict_info == sizeof(docid_seq));
365 memcpy(&docid_seq, dict_info+1, sizeof(docid_seq));
/* the first five bytes are parsed below as the decimal record length */
374 r = fread (buf, 1, 5, inf);
/* NOTE(review): this message requires both flags (&&), while the other
 * diagnostics below use (verbose || print_offset) -- confirm intent */
377 if (r && print_offset && verbose)
378 printf ("<!-- Extra %ld bytes at end of file -->\n",
/* resynchronize: as long as the first byte is not a digit, report it and
 * pull one more byte into buf[4] (the 4-iteration loop presumably shifts
 * the buffer left by one -- confirm) */
382 while (*buf < '0' || *buf > '9')
385 long off = ftell(inf) - 5;
386 if (verbose || print_offset)
387 printf("<!-- Skipping bad byte %d (0x%02X) at offset "
389 *buf & 0xff, *buf & 0xff,
391 for (i = 0; i<4; i++)
393 r = fread(buf+4, 1, 1, inf);
399 if (verbose || print_offset)
400 printf ("<!-- End of file with data -->\n");
/* record length from the 5-digit field; values outside 25..100000 are
 * reported as bad */
403 len = atoi_n(buf, 5);
404 if (len < 25 || len > 100000)
406 long off = ftell(inf) - 5;
407 printf("Bad Length %ld read at offset %ld (%lx)\n",
408 (long)len, (long) off, (long) off);
/* read the remainder of the record after the length field */
412 r = fread (buf + 5, 1, rlen, inf);
415 yaz_marc_read_iso2709(mt, buf, len);
/* render the record as text, then tokenize it under the current doc id */
417 if (yaz_marc_write_line(mt, wrbuf))
420 index_wrbuf(b, wrbuf, docid_seq);
425 index_block_check_flush(b, isb, dict, no_docs);
/* end of input: flush the remaining postings and release resources */
427 index_block_flush(b, isb, dict, no_docs);
428 wrbuf_free(wrbuf, 1);
429 yaz_marc_destroy(mt);
430 index_block_destroy(&b);
431 yaz_log(YLOG_LOG, "Total " ZINT_FORMAT " documents", docid_seq);
/* persist the document counter */
432 dict_insert(dict, "_s", sizeof(docid_seq), &docid_seq);
/* Program entry point.
 *
 * Visible steps: parse the command line with options() using the option
 * string "im:", open the input file in binary mode, fill in the ISAMB
 * method table (key comparison, log dump and the iscz1 codec callbacks),
 * create the block file system, open the "isamb" index and the "dict"
 * dictionary on top of it, and index the input with
 * index_marc_from_file() with verbose and print_offset both disabled.
 */
435 int main(int argc, char **argv)
445 const char *fname = 0;
/* keep fetching options until options() returns -2 */
448 while ((ret = options("im:", argv, argc, &arg)) != -2)
462 fprintf(stderr, "bad option.\n");
469 inf = fopen(fname, "rb");
472 fprintf(stderr, "Cannot open %s\n", fname);
476 /* setup method (attributes) */
477 method.compare_item = key_compare;
478 method.log_item = key_logdump_txt;
480 method.codec.start = iscz1_start;
481 method.codec.decode = iscz1_decode;
482 method.codec.encode = iscz1_encode;
483 method.codec.stop = iscz1_stop;
484 method.codec.reset = iscz1_reset;
488 /* create block system */
489 bfs = bfs_create(0, 0);
492 yaz_log(YLOG_WARN, "bfs_create failed");
499 /* create isam handle */
500 isb = isamb_open (bfs, "isamb", 1, &method, 0);
503 yaz_log(YLOG_WARN, "isamb_open failed");
/* open the term dictionary on the same block file system */
506 dict = dict_open(bfs, "dict", 50, 1, 0, 4096);
508 index_marc_from_file(isb, dict, inf, memory,
509 0 /* verbose */ , 0 /* print_offset */);
516 /* exit block system */
524 * indent-tabs-mode: nil
526 * vim: shiftwidth=4 tabstop=8 expandtab