1 /* This file is part of the Zebra server.
2 Copyright (C) 1995-2008 Index Data
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 \brief indexes records and extracts tokens for indexing and sorting
40 #include <yaz/snprintf.h>
42 static int log_level_extract = 0;
43 static int log_level_details = 0;
44 static int log_level_initialized = 0;
46 /* 1 if we eliminate identical delete/insert keys */
47 /* eventually the 0-case code will be removed */
/* Forward declaration: extract_flush_record_keys2 is called before its
   definition further down in this file. It takes one key set to insert
   and one key set to delete for the record identified by sysno. */
50 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
51 zebra_rec_keys_t ins_keys,
53 zebra_rec_keys_t del_keys,
/* Resolve the module log levels, guarded by log_level_initialized:
   the flag is set and the extract and indexdetails levels are looked up
   with yaz_log_module_level(). */
56 static void zebra_init_log_level(void)
58 if (!log_level_initialized)
60 log_level_initialized = 1;
62 log_level_extract = yaz_log_module_level("extract");
63 log_level_details = yaz_log_module_level("indexdetails");
/* Forward declarations of file-local helpers that are referenced
   (as calls or as callbacks) before their definitions. */
67 static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
68 int cmd, zebra_rec_keys_t skp);
69 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
70 static void extract_token_add(RecWord *p);
/* Emit a one-off notice when the number of processed plus skipped
   records is exactly zh->m_file_verbose_limit, i.e. when the per-file
   log entries are about to be omitted. */
72 static void check_log_limit(ZebraHandle zh)
74 if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
76 yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
77 zh->m_file_verbose_limit);
/* Count one more processed record and, every 1000 records, log the
   running totals of processed, inserted, updated and deleted records. */
81 static void logRecord(ZebraHandle zh)
84 ++zh->records_processed;
/* progress line only on multiples of 1000 */
85 if (!(zh->records_processed % 1000))
87 yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
88 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT,
89 zh->records_processed, zh->records_inserted,
90 zh->records_updated, zh->records_deleted);
/* Copy handle-level settings into an extract control block.
   The visible setting: flagShowRecords is enabled exactly when the handle
   is not in read/write mode (m_flag_rw is zero). */
94 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
96 ctrl->flagShowRecords = !zh->m_flag_rw;
/* Forward declarations of file-local helpers used before their
   definitions. */
100 static void extract_add_index_string(RecWord *p,
101 zinfo_index_category_t cat,
102 const char *str, int length);
104 static void extract_set_store_data_prepare(struct recExtractCtrl *p);
/* Initialise a RecWord before a filter fills it in; installed as the
   init callback of the extract control blocks in this file.
   The visible default: the index name is set to any. */
106 static void extract_init(struct recExtractCtrl *p, RecWord *w)
109 w->index_name = "any";
/* Context for snippet extraction: extract_snippet() stores a pointer to
   one of these in recExtractCtrl.handle and the snippet_* callbacks read
   it back from p->extractCtrl->handle. */
117 struct snip_rec_info {
/* destination list that the snippet callbacks append to */
119 zebra_snippets *snippets;
/* Snippet generation for a complete-field index.
   The term buffer p->term_buf is fed through zebra_maps_input(); leading
   input that maps to CHR_SPACE is skipped, then the mapped characters
   are collected (bounded by IT_MAX_WORD) with single CHR_SPACE
   separators. When the map is an index and a non-empty span was seen,
   the original text from start to last is appended as one snippet.
   \param p word being extracted (term buffer, seqno, control block)
   \param ord index ordinal the snippet is recorded under
*/
123 static void snippet_add_complete_field(RecWord *p, int ord,
126 struct snip_rec_info *h = p->extractCtrl->handle;
128 const char *b = p->term_buf;
129 char buf[IT_MAX_WORD+1];
130 const char **map = 0;
131 int i = 0, remain = p->term_len;
132 const char *start = b;
133 const char *last = 0;
136 map = zebra_maps_input(zm, &b, remain, 1);
138 while (remain > 0 && i < IT_MAX_WORD)
/* skip input that maps to whitespace */
140 while (map && *map && **map == *CHR_SPACE)
/* bytes left = total length minus bytes already consumed */
142 remain = p->term_len - (b - p->term_buf);
145 start = b; /* set to first non-ws area */
148 int first = i ? 0 : 1; /* first position */
150 map = zebra_maps_input(zm, &b, remain, first);
/* separate words in buf with a single mapped space */
158 if (i && i < IT_MAX_WORD)
159 buf[i++] = *CHR_SPACE;
160 while (map && *map && **map != *CHR_SPACE)
162 const char *cp = *map;
164 if (**map == *CHR_CUT)
170 if (i >= IT_MAX_WORD)
172 while (i < IT_MAX_WORD && *cp)
176 remain = p->term_len - (b - p->term_buf);
179 map = zebra_maps_input(zm, &b, remain, 0);
/* one snippet covering the original text between start and last */
187 if (last && start != last && zebra_maps_is_index(zm))
188 zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
189 start, last - start);
/* Snippet generation for an incomplete-field (word based) index.
   The term buffer is fed through zebra_maps_input() and split on input
   that maps to CHR_SPACE. For an index map, the original text of each
   span between start and last is appended to the snippet list.
   NOTE(review): spans made of space-mapped input are appended with the
   third argument set to 1 and word spans with 0 - presumably a
   whitespace flag; confirm against zebra_snippets_appendn.
   \param p word being extracted
   \param ord index ordinal the snippets are recorded under
   \param zm map used for character mapping and index checks
*/
192 static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
194 struct snip_rec_info *h = p->extractCtrl->handle;
195 const char *b = p->term_buf;
196 int remain = p->term_len;
198 const char **map = 0;
199 const char *start = b;
200 const char *last = b;
203 map = zebra_maps_input(zm, &b, remain, 0);
207 char buf[IT_MAX_WORD+1];
/* consume the run of space-mapped input */
211 while (map && *map && **map == *CHR_SPACE)
213 remain = p->term_len - (b - p->term_buf);
216 map = zebra_maps_input(zm, &b, remain, 0);
222 if (start != last && zebra_maps_is_index(zm))
224 zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
225 start, last - start);
/* consume the run of non-space input, i.e. one word */
231 while (map && *map && **map != *CHR_SPACE)
233 const char *cp = *map;
235 while (i < IT_MAX_WORD && *cp)
237 remain = p->term_len - (b - p->term_buf);
240 map = zebra_maps_input(zm, &b, remain, 0);
250 if (zebra_maps_is_first_in_field(zm))
252 /* first in field marker */
256 if (start != last && zebra_maps_is_index(zm))
257 zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
258 start, last - start);
/* Snippet generation for an ICU based map.
   The term buffer is handed to zebra_map_tokenize_start() and each token
   returned by zebra_map_tokenize_next() contributes its display form
   (display_buf, display_len) as a snippet when the map is an index.
   \param p word being extracted
   \param ord index ordinal the snippets are recorded under
   \param zm ICU map doing the tokenization
*/
265 static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm)
267 struct snip_rec_info *h = p->extractCtrl->handle;
269 const char *res_buf = 0;
272 const char *display_buf = 0;
273 size_t display_len = 0;
275 zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
276 while (zebra_map_tokenize_next(zm, &res_buf, &res_len,
277 &display_buf, &display_len))
/* the display form, not the normalized result, goes into the snippet */
279 if (zebra_maps_is_index(zm))
280 zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
281 display_buf, display_len);
/* tokenAdd callback used during snippet extraction.
   Finds the map for the index type of the word, looks up the ordinal of
   (index_type, index_name) in the index category and dispatches to the
   ICU, complete-field or incomplete-field snippet routine. */
286 static void snippet_token_add(RecWord *p)
288 struct snip_rec_info *h = p->extractCtrl->handle;
289 ZebraHandle zh = h->zh;
290 zebra_map_t zm = zebra_map_get(zh->reg->zebra_maps, p->index_type);
294 ZebraExplainInfo zei = zh->reg->zei;
/* ordinal of this index; passed on as the ord of every snippet */
295 int ch = zebraExplain_lookup_attr_str(
296 zei, zinfo_index_category_index, p->index_type, p->index_name);
298 if (zebra_maps_is_icu(zm))
299 snippet_add_icu(p, ch, zm);
302 if (zebra_maps_is_complete(zm))
303 snippet_add_complete_field(p, ch, zm);
305 snippet_add_incomplete_field(p, ch, zm);
/* schemaAdd callback installed by extract_snippet() in place of
   extract_schema_add. */
310 static void snippet_schema_add(
311 struct recExtractCtrl *p, Odr_oid *oid)
/* Run a record type filter over a stream in order to collect snippets.
   A local recExtractCtrl is prepared with the snippet callbacks
   (snippet_token_add, snippet_schema_add), a snip_rec_info as handle and
   the insert action; storing of record data is disabled (setStoreData is
   cleared) before the extract handler of the record type is invoked.
   \param zh Zebra handle
   \param sn snippet list to fill
   \param stream record input
   \param rt record type whose extract handler is invoked
   \param recTypeClientData opaque data passed to that handler
*/
316 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
317 struct ZebraRecStream *stream,
318 RecType rt, void *recTypeClientData)
320 struct recExtractCtrl extractCtrl;
321 struct snip_rec_info info;
324 extractCtrl.stream = stream;
325 extractCtrl.first_record = 1;
326 extractCtrl.init = extract_init;
327 extractCtrl.tokenAdd = snippet_token_add;
328 extractCtrl.schemaAdd = snippet_schema_add;
332 extractCtrl.dh = zh->reg->dh;
/* the callbacks recover info through p->extractCtrl->handle */
336 extractCtrl.handle = &info;
337 extractCtrl.match_criteria[0] = '\0';
338 extractCtrl.staticrank = 0;
339 extractCtrl.action = action_insert;
341 init_extractCtrl(zh, &extractCtrl);
343 extractCtrl.setStoreData = 0;
345 r = (*rt->extract)(recTypeClientData, &extractCtrl);
/* Scan the keys of a record for entries belonging to one named index.
   The ordinal of index_name is looked up in the index category, trying
   the index types 0, p and w. The record keys are then read one by one;
   for keys whose first element equals that ordinal, the sequence number
   (last key element) is turned into a word offset relative to startSeq
   and checked against the bounds of ws.
   \param zh Zebra handle
   \param reckeys keys of the record being examined
   \param index_name index to search for
   \param ws array of ws_length word slots
   \param ws_length number of slots in ws
*/
349 static void searchRecordKey(ZebraHandle zh,
350 zebra_rec_keys_t reckeys,
351 const char *index_name,
352 const char **ws, int ws_length)
356 zinfo_index_category_t cat = zinfo_index_category_index;
358 for (i = 0; i<ws_length; i++)
362 ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "0", index_name);
364 ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "p", index_name);
366 ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "w", index_name);
371 if (zebra_rec_keys_rewind(reckeys))
378 while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
380 assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
/* the sequence number is stored as the last key element */
382 seqno = key.mem[key.len-1];
/* key.mem[0] holds the index ordinal */
384 if (key.mem[0] == ch)
390 woff = seqno - startSeq;
391 if (woff >= 0 && woff < ws_length)
/* Characters (tab and space) treated as blanks by the match
   specification parser. */
398 #define FILE_MATCH_BLANK "\t "
/* Build the match string of a record from a match specification.
   The spec is scanned left to right, skipping FILE_MATCH_BLANK
   characters. The visible element kinds are:
     - an (attset,attname) pair in parentheses, resolved with
       searchRecordKey() into up to 32 words taken from reckeys;
     - a special name: group, database, filename or type;
     - a literal enclosed in single or double quotes.
   Pieces are appended at dst inside the static buffer dstBuf, so the
   storage is shared between calls. Problems (missing closing
   parenthesis, record without the match fields, syntax errors, empty
   result) are reported with YLOG_WARN.
   NOTE(review): the strcpy() calls into dst are not visibly bounded by
   the 2048 bytes of dstBuf - confirm that the elided lines check the
   remaining space.
   \param zh Zebra handle
   \param reckeys keys of the record, source for (attset,attname) parts
   \param fname file name, used in the warning for an empty result
   \param spec the match specification
*/
400 static char *get_match_from_spec(ZebraHandle zh,
401 zebra_rec_keys_t reckeys,
402 const char *fname, const char *spec)
404 static char dstBuf[2048]; /* static here ??? */
406 const char *s = spec;
410 for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
417 char attset_str[64], attname_str[64];
421 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
/* copy the first name, silently truncating to the buffer size */
423 for (i = 0; *s && *s != ',' && *s != ')' &&
424 !strchr(FILE_MATCH_BLANK, *s); s++)
425 if (i+1 < sizeof(attset_str))
426 attset_str[i++] = *s;
427 attset_str[i] = '\0';
429 for (; strchr(FILE_MATCH_BLANK, *s); s++)
432 strcpy(attname_str, attset_str);
435 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
437 for (i = 0; *s && *s != ')' &&
438 !strchr(FILE_MATCH_BLANK, *s); s++)
439 if (i+1 < sizeof(attname_str))
440 attname_str[i++] = *s;
441 attname_str[i] = '\0';
444 searchRecordKey(zh, reckeys, attname_str, ws, 32);
448 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
449 spec, zh->m_group ? zh->m_group : "none");
454 for (i = 0; i<32; i++)
463 dst += strlen(ws[i]);
467 yaz_log(YLOG_WARN, "Record didn't contain match"
468 " fields in (%s,%s)", attset_str, attname_str);
476 const char *spec_src = NULL;
477 const char *s1 = ++s;
478 while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
/* clamp the special name to the size of the local buffer */
482 if (spec_len > sizeof(special)-1)
483 spec_len = sizeof(special)-1;
484 memcpy(special, s, spec_len);
485 special[spec_len] = '\0';
488 if (!strcmp(special, "group"))
489 spec_src = zh->m_group;
490 else if (!strcmp(special, "database"))
491 spec_src = zh->basenames[0];
492 else if (!strcmp(special, "filename")) {
495 else if (!strcmp(special, "type"))
496 spec_src = zh->m_record_type;
501 strcpy(dst, spec_src);
502 dst += strlen(spec_src);
505 else if (*s == '\"' || *s == '\'')
/* the opening quote character also terminates the literal */
507 int stopMarker = *s++;
511 while (*s && *s != stopMarker)
513 if (i+1 < sizeof(tmpString))
514 tmpString[i++] = *s++;
519 strcpy(dst, tmpString);
520 dst += strlen(tmpString);
524 yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
525 spec, zh->m_group ? zh->m_group : "none");
532 yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
533 fname, zh->m_group ? zh->m_group : "none");
/* Record logging context.
   NOTE(review): no use of this struct appears in the visible code -
   confirm it is still needed. */
540 struct recordLogInfo {
543 struct recordGroup *rGroup;
546 /** \brief add the always-matches index entry and map to real record ID
547 \param ctrl record control
548 \param record_id custom record ID
549 \param sysno system record ID
551 This function serves two purposes. It adds the always-matches
552 entry and makes a pointer from the custom record ID (if defined)
553 back to the system record ID (sysno)
554 See zebra_recid_to_sysno .
556 static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id,
560 extract_init(ctrl, &word);
561 word.record_id = record_id;
562 /* we use the seqno as placeholder for a way to get back to
563 record database from _ALLRECORDS.. This is used if a custom
564 RECORD was defined */
/* the entry goes into the _ALLRECORDS index with index type w */
566 word.index_name = "_ALLRECORDS";
567 word.index_type = "w";
/* and is added in the alwaysmatches category */
569 extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
/* Forward declaration: zebra_extract_file() and
   zebra_buffer_extract_record() call this before its definition. */
573 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh,
574 struct ZebraRecStream *stream,
575 enum zebra_recctrl_action_t action,
577 const char *recordType,
579 const char *match_criteria,
582 void *recTypeClientData);
/* Index, update or delete the record(s) held in one file.
   Visible flow:
     1. build the resource prefix from the group name (m_group);
     2. take the file extension and, when no record type is set, read it
        from the resource <prefix>recordType.<ext>; files without a
        record type are counted as skipped;
     3. when no record id is set, read the match criteria from the
        resource <prefix>recordId.<ext>;
     4. resolve the record type filter with recType_byName() and check
        its version;
     5. open the file (relative names are placed under zh->path_reg when
        that is set), wrap it in a stream and hand it to
        zebra_extract_records_stream();
     6. destroy the stream and restore the original m_record_type.
   NOTE(review): the sizes of gprefix, ext, ext_res and full_rep are not
   visible here and they are filled with sprintf/strcpy/strcat without a
   visible bound - confirm that long group names or paths cannot
   overflow them.
   \param zh Zebra handle
   \param sysno optional system record number
   \param fname name of the file to process
   \param action requested action
*/
585 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname,
586 enum zebra_recctrl_action_t action)
588 ZEBRA_RES r = ZEBRA_OK;
593 struct file_read_info *fi = 0;
594 const char *original_record_type = 0;
596 void *recTypeClientData;
597 struct ZebraRecStream stream, *streamp;
599 zebra_init_log_level();
601 if (!zh->m_group || !*zh->m_group)
604 sprintf(gprefix, "%s.", zh->m_group);
606 yaz_log(log_level_extract, "zebra_extract_file %s", fname);
608 /* determine file extension */
/* scan backwards from the end of the name for the last dot */
610 for (i = strlen(fname); --i >= 0; )
613 else if (fname[i] == '.')
615 strcpy(ext, fname+i+1);
618 /* determine file type - depending on extension */
/* remembered so that the handle can be restored before returning */
619 original_record_type = zh->m_record_type;
620 if (!zh->m_record_type)
622 sprintf(ext_res, "%srecordType.%s", gprefix, ext);
623 zh->m_record_type = res_get(zh->res, ext_res);
625 if (!zh->m_record_type)
628 if (zh->records_processed + zh->records_skipped
629 < zh->m_file_verbose_limit)
630 yaz_log(YLOG_LOG, "? %s", fname);
631 zh->records_skipped++;
634 /* determine match criteria */
635 if (!zh->m_record_id)
637 sprintf(ext_res, "%srecordId.%s", gprefix, ext);
638 zh->m_record_id = res_get(zh->res, ext_res);
642 recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
643 &recTypeClientData)))
645 yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
649 switch(recType->version)
654 yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
656 if (sysno && (action == action_delete || action == action_a_delete))
/* relative file names are resolved against the register path */
665 if (zh->path_reg && !yaz_is_abspath(fname))
667 strcpy(full_rep, zh->path_reg);
668 strcat(full_rep, "/");
669 strcat(full_rep, fname);
672 strcpy(full_rep, fname);
674 if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
676 yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
677 zh->m_record_type = original_record_type;
681 zebra_create_stream_fd(streamp, fd, 0);
683 r = zebra_extract_records_stream(zh, streamp,
688 0, /*match_criteria */
690 recType, recTypeClientData);
692 stream.destroy(streamp);
693 zh->m_record_type = original_record_type;
698 If sysno is provided, then it's used to identify the record.
699 If not, and match_criteria is provided, then sysno is guessed
700 If not, and a record is provided, then sysno is got from there
/* Index, update or delete record(s) held in a memory buffer.
   The record type filter is taken from the recordType argument when it
   is non-empty, otherwise from zh->m_record_type; a warning is logged
   when neither yields a usable type. The buffer is wrapped in a memory
   stream, passed to zebra_extract_records_stream() and the stream is
   destroyed afterwards.
   \param zh Zebra handle
   \param buf record data
   \param buf_size number of bytes in buf
   \param action requested action
   \param recordType explicit record type, may be NULL or empty
   \param match_criteria explicit match string, may be NULL
*/
704 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh,
705 const char *buf, size_t buf_size,
706 enum zebra_recctrl_action_t action,
708 const char *recordType,
710 const char *match_criteria,
713 struct ZebraRecStream stream;
718 if (recordType && *recordType)
720 yaz_log(log_level_extract,
721 "Record type explicitly specified: %s", recordType);
722 recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
/* no explicit type: fall back to the type configured on the handle */
727 if (!(zh->m_record_type))
729 yaz_log(YLOG_WARN, "No such record type defined");
732 yaz_log(log_level_extract, "Get record type from rgroup: %s",
734 recType = recType_byName(zh->reg->recTypes, zh->res,
735 zh->m_record_type, &clientData);
736 recordType = zh->m_record_type;
741 yaz_log(YLOG_WARN, "No such record type: %s", recordType);
745 zebra_create_stream_mem(&stream, buf, buf_size);
747 res = zebra_extract_records_stream(zh, &stream,
754 recType, clientData);
755 stream.destroy(&stream);
/* Process the record(s) available on a stream by delegating to
   zebra_extract_record_stream(), which reports through more whether
   further records follow.
   NOTE(review): the loop around the call is not visible here -
   presumably it repeats while more is set and the result is ZEBRA_OK;
   confirm.
   \param zh Zebra handle
   \param stream record input
   \param action requested action
   \param recordType record type name
   \param match_criteria explicit match string, may be NULL
   \param recTypeClientData opaque data for the record type filter
*/
759 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh,
760 struct ZebraRecStream *stream,
761 enum zebra_recctrl_action_t action,
763 const char *recordType,
765 const char *match_criteria,
768 void *recTypeClientData)
770 ZEBRA_RES res = ZEBRA_OK;
774 res = zebra_extract_record_stream(zh, stream,
781 recType, recTypeClientData, &more);
/* Produce a printable copy of a C string in a newly allocated WRBUF.
   Characters below space or above 126 are written as a backslash
   followed by two upper-case hex digits; everything else is copied
   unchanged. Used for logging the match string. */
796 static WRBUF wrbuf_hex_str(const char *cstr)
799 WRBUF w = wrbuf_alloc();
800 for (i = 0; cstr[i]; i++)
802 if (cstr[i] < ' ' || cstr[i] > 126)
/* the mask keeps the value within one byte when printing */
803 wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
805 wrbuf_putc(w, cstr[i]);
/* Extract and index a single record read from a stream.
   Visible flow:
     1. reset the key and sort key buffers and select (or create) the
        explain database for zh->basenames[0];
     2. set up a recExtractCtrl and run the extract handler of the
        record type; its result code decides between end of input,
        error, skip and normal processing;
     3. derive the match string: the match_criteria reported by the
        filter wins over the argument, otherwise the record id
        specification of the handle is evaluated with
        get_match_from_spec();
     4. look the match string up in the match dictionary to find the
        sysno of an existing record;
     5. new record: add it (delete and replace requests are rejected
        with a warning); existing record: skip on insert, otherwise
        remove the old keys and either delete the record or write the
        new keys;
     6. refresh the stored record information (file type, file name,
        delete keys, sort keys, size, run number, stored data, database
        name, offset) and commit with rec_put().
   \param zh Zebra handle
   \param stream record input
   \param action requested action; after extraction a request of
          action_update is replaced by extractCtrl.action
   \param recordType record type name, used in log lines and stored as
          recInfo_fileType
   \param match_criteria explicit match string, may be NULL
   \param recTypeClientData opaque data handed to the extract handler
*/
810 ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
811 struct ZebraRecStream *stream,
812 enum zebra_recctrl_action_t action,
814 const char *recordType,
816 const char *match_criteria,
819 void *recTypeClientData,
824 RecordAttr *recordAttr;
825 struct recExtractCtrl extractCtrl;
827 const char *matchStr = 0;
829 off_t start_offset = 0, end_offset = 0;
830 const char *pr_fname = fname; /* filename to print .. */
/* per-record log lines are only wanted below the verbose limit */
831 int show_progress = zh->records_processed + zh->records_skipped
832 < zh->m_file_verbose_limit ? 1:0;
834 zebra_init_log_level();
837 pr_fname = "<no file>"; /* make it printable if file is omitted */
839 zebra_rec_keys_reset(zh->reg->keys);
840 zebra_rec_keys_reset(zh->reg->sortKeys);
/* select the database, creating it when it does not exist yet */
842 if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
844 if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0],
845 zh->m_explain_database))
851 off_t null_offset = 0;
852 extractCtrl.stream = stream;
854 start_offset = stream->tellf(stream);
/* only a record starting at offset 0 counts as the first record */
856 extractCtrl.first_record = start_offset ? 0 : 1;
858 stream->endf(stream, &null_offset);;
860 extractCtrl.init = extract_init;
861 extractCtrl.tokenAdd = extract_token_add;
862 extractCtrl.schemaAdd = extract_schema_add;
863 extractCtrl.dh = zh->reg->dh;
864 extractCtrl.handle = zh;
865 extractCtrl.match_criteria[0] = '\0';
866 extractCtrl.staticrank = 0;
867 extractCtrl.action = action;
869 init_extractCtrl(zh, &extractCtrl);
871 extract_set_store_data_prepare(&extractCtrl);
873 r = (*recType->extract)(recTypeClientData, &extractCtrl);
/* for an update request the action stored in extractCtrl is used */
875 if (action == action_update)
877 action = extractCtrl.action;
882 case RECCTRL_EXTRACT_EOF:
884 case RECCTRL_EXTRACT_ERROR_GENERIC:
885 /* error occurred during extraction ... */
886 yaz_log(YLOG_WARN, "extract error: generic");
888 case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
889 /* error occurred during extraction ... */
890 yaz_log(YLOG_WARN, "extract error: no such filter");
892 case RECCTRL_EXTRACT_SKIP:
894 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
895 recordType, pr_fname, (zint) start_offset);
898 end_offset = stream->endf(stream, 0);
900 stream->seekf(stream, end_offset);
903 case RECCTRL_EXTRACT_OK:
906 yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
909 end_offset = stream->endf(stream, 0);
911 stream->seekf(stream, end_offset);
913 end_offset = stream->tellf(stream);
/* match criteria reported by the filter override the argument */
915 if (extractCtrl.match_criteria[0])
916 match_criteria = extractCtrl.match_criteria;
924 if (match_criteria && *match_criteria) {
925 matchStr = match_criteria;
927 if (zh->m_record_id && *zh->m_record_id) {
928 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname,
932 yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
933 pr_fname, (zint) start_offset);
940 int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
941 char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
945 if (log_level_extract)
947 WRBUF w = wrbuf_hex_str(matchStr);
948 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
/* dictionary value layout: one length byte followed by the sysno */
953 assert(*rinfo == sizeof(*sysno));
954 memcpy(sysno, rinfo+1, sizeof(*sysno));
959 if (zebra_rec_keys_empty(zh->reg->keys))
961 /* the extraction process returned no information - the record
962 is probably empty - unless flagShowRecords is in use */
969 /* new record AKA does not exist already */
970 if (action == action_delete)
972 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
973 pr_fname, (zint) start_offset);
974 yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
977 else if (action == action_a_delete)
980 yaz_log(YLOG_LOG, "adelete %s %s " ZINT_FORMAT, recordType,
981 pr_fname, (zint) start_offset);
984 else if (action == action_replace)
986 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
987 pr_fname, (zint) start_offset);
988 yaz_log(YLOG_WARN, "cannot update record above (seems new)");
992 yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
993 (zint) start_offset);
994 rec = rec_new(zh->reg->records);
1001 all_matches_add(&extractCtrl,
1002 zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1007 recordAttr = rec_init_attr(zh->reg->zei, rec);
1008 if (extractCtrl.staticrank < 0)
1010 yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
1011 extractCtrl.staticrank = 0;
/* remember match string to sysno so that later updates find it */
1016 int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1017 dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
1018 sizeof(*sysno), sysno);
1021 extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1023 extract_flush_record_keys2(zh, *sysno,
1024 zh->reg->keys, extractCtrl.staticrank,
1025 0, recordAttr->staticrank);
1027 extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
1028 extractCtrl.staticrank);
1030 recordAttr->staticrank = extractCtrl.staticrank;
1031 zh->records_inserted++;
1035 /* record already exists */
1036 zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1037 zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
1038 if (action == action_insert)
1040 yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT,
1041 recordType, pr_fname, (zint) start_offset);
1046 rec = rec_get(zh->reg->records, *sysno);
1051 all_matches_add(&extractCtrl,
1052 zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1056 recordAttr = rec_init_attr(zh->reg->zei, rec);
1058 /* decrease total size */
1059 zebraExplain_recordBytesIncrement(zh->reg->zei,
1060 - recordAttr->recordSize);
/* load the keys stored with the old version of the record */
1062 zebra_rec_keys_set_buf(delkeys,
1063 rec->info[recInfo_delKeys],
1064 rec->size[recInfo_delKeys],
1066 zebra_rec_keys_set_buf(sortKeys,
1067 rec->info[recInfo_sortKeys],
1068 rec->size[recInfo_sortKeys],
1071 extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1073 extract_flush_record_keys(zh, *sysno, 0, delkeys,
1074 recordAttr->staticrank);
1076 if (action == action_delete || action == action_a_delete)
1078 /* record going to be deleted */
1080 extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1081 delkeys, recordAttr->staticrank);
1083 if (zebra_rec_keys_empty(delkeys))
1085 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1086 pr_fname, (zint) start_offset);
1087 yaz_log(YLOG_WARN, "cannot delete file above, "
1088 "storeKeys false (3)");
1093 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1094 pr_fname, (zint) start_offset);
1095 zh->records_deleted++;
1098 int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1099 dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1101 rec_del(zh->reg->records, &rec);
1103 zebra_rec_keys_close(delkeys);
1104 zebra_rec_keys_close(sortKeys);
1110 { /* update or special_update */
1112 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1113 pr_fname, (zint) start_offset);
1114 extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1117 extract_flush_record_keys2(zh, *sysno,
1118 zh->reg->keys, extractCtrl.staticrank,
1119 delkeys, recordAttr->staticrank);
1121 extract_flush_record_keys(zh, *sysno, 1,
1122 zh->reg->keys, extractCtrl.staticrank);
1124 recordAttr->staticrank = extractCtrl.staticrank;
1125 zh->records_updated++;
1127 zebra_rec_keys_close(delkeys);
1128 zebra_rec_keys_close(sortKeys);
1130 /* update file type */
1131 xfree(rec->info[recInfo_fileType]);
1132 rec->info[recInfo_fileType] =
1133 rec_strdup(recordType, &rec->size[recInfo_fileType]);
1135 /* update filename */
/* NOTE(review): fname rather than pr_fname is stored, and pr_fname
   exists because fname can be omitted - presumably rec_strdup accepts
   NULL; confirm */
1136 xfree(rec->info[recInfo_filename]);
1137 rec->info[recInfo_filename] =
1138 rec_strdup(fname, &rec->size[recInfo_filename]);
1140 /* update delete keys */
1141 xfree(rec->info[recInfo_delKeys]);
/* keys are only kept with the record when m_store_keys is 1 */
1142 if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1144 zebra_rec_keys_get_buf(zh->reg->keys,
1145 &rec->info[recInfo_delKeys],
1146 &rec->size[recInfo_delKeys]);
1150 rec->info[recInfo_delKeys] = NULL;
1151 rec->size[recInfo_delKeys] = 0;
1153 /* update sort keys */
1154 xfree(rec->info[recInfo_sortKeys]);
1156 zebra_rec_keys_get_buf(zh->reg->sortKeys,
1157 &rec->info[recInfo_sortKeys],
1158 &rec->size[recInfo_sortKeys]);
/* record size is the number of stream bytes consumed by this record */
1162 recordAttr->recordSize = end_offset - start_offset;
1163 zebraExplain_recordBytesIncrement(zh->reg->zei,
1164 recordAttr->recordSize);
1167 /* set run-number for this record */
1168 recordAttr->runNumber =
1169 zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1171 /* update store data */
1172 xfree(rec->info[recInfo_storeData]);
1174 /* update store data */
/* data supplied by the filter takes precedence; the record takes over
   the buffer and the handle pointer is cleared */
1175 if (zh->store_data_buf)
1177 rec->size[recInfo_storeData] = zh->store_data_size;
1178 rec->info[recInfo_storeData] = zh->store_data_buf;
1179 zh->store_data_buf = 0;
1180 recordAttr->recordSize = zh->store_data_size;
/* otherwise, when storing is enabled, re-read the raw record bytes
   from the stream and restore the stream position afterwards */
1182 else if (zh->m_store_data)
1184 off_t cur_offset = stream->tellf(stream);
1186 rec->size[recInfo_storeData] = recordAttr->recordSize;
1187 rec->info[recInfo_storeData] = (char *)
1188 xmalloc(recordAttr->recordSize);
1189 stream->seekf(stream, start_offset);
1190 stream->readf(stream, rec->info[recInfo_storeData],
1191 recordAttr->recordSize);
1192 stream->seekf(stream, cur_offset);
1196 rec->info[recInfo_storeData] = NULL;
1197 rec->size[recInfo_storeData] = 0;
1199 /* update database name */
1200 xfree(rec->info[recInfo_databaseName]);
1201 rec->info[recInfo_databaseName] =
1202 rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]);
1205 recordAttr->recordOffset = start_offset;
1207 /* commit this record */
1208 rec_put(zh->reg->records, &rec);
/* Re-index an explain record from its data1 tree.
   The database named in the record is selected (or created), the key
   buffers are reset and grs_extract_tree() is run on n with the normal
   extract callbacks. If the record already carries delete keys, the old
   keys and sort keys are removed while the new ones are written;
   otherwise only the new keys are written. Finally the delete keys and
   sort keys of the record are replaced by the freshly extracted ones.
   All keys are flushed with a static rank of 0.
   \param handle the ZebraHandle, passed as void pointer
   \param rec explain record to update
   \param n root of the data1 tree to extract from
*/
1213 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1215 ZebraHandle zh = (ZebraHandle) handle;
1216 struct recExtractCtrl extractCtrl;
1218 if (zebraExplain_curDatabase(zh->reg->zei,
1219 rec->info[recInfo_databaseName]))
1222 if (zebraExplain_newDatabase(zh->reg->zei,
1223 rec->info[recInfo_databaseName], 0))
1227 zebra_rec_keys_reset(zh->reg->keys);
1228 zebra_rec_keys_reset(zh->reg->sortKeys);
1230 extractCtrl.init = extract_init;
1231 extractCtrl.tokenAdd = extract_token_add;
1232 extractCtrl.schemaAdd = extract_schema_add;
1233 extractCtrl.dh = zh->reg->dh;
1235 init_extractCtrl(zh, &extractCtrl);
/* overrides the value that init_extractCtrl derived from the handle */
1237 extractCtrl.flagShowRecords = 0;
1238 extractCtrl.match_criteria[0] = '\0';
1239 extractCtrl.staticrank = 0;
1240 extractCtrl.action = action_update;
1242 extractCtrl.handle = handle;
1243 extractCtrl.first_record = 1;
1245 extract_set_store_data_prepare(&extractCtrl);
1248 grs_extract_tree(&extractCtrl, n);
/* a non-empty delete key buffer means the record was indexed before */
1250 if (rec->size[recInfo_delKeys])
1252 zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1254 zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1256 zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1257 rec->size[recInfo_delKeys],
1260 extract_flush_record_keys2(zh, rec->sysno,
1261 zh->reg->keys, 0, delkeys, 0);
1263 extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1264 extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1266 zebra_rec_keys_close(delkeys);
1268 zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1269 rec->size[recInfo_sortKeys],
1272 extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1273 zebra_rec_keys_close(sortkeys);
1278 extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1280 extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1283 extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
/* store the new keys with the record for the next update */
1285 xfree(rec->info[recInfo_delKeys]);
1286 zebra_rec_keys_get_buf(zh->reg->keys,
1287 &rec->info[recInfo_delKeys],
1288 &rec->size[recInfo_delKeys]);
1290 xfree(rec->info[recInfo_sortKeys]);
1291 zebra_rec_keys_get_buf(zh->reg->sortKeys,
1292 &rec->info[recInfo_sortKeys],
1293 &rec->size[recInfo_sortKeys]);
/* Log every key of a record key set at the given log level.
   For each key the numeric key elements are printed, followed by the
   index type and index name found through the ordinal in key.mem[0].
   Terms whose first byte is below CHR_BASE_CHAR are special markers and
   are printed symbolically (alwaysmatches, firstinfield, unknown,
   space) followed by their byte values; other terms are converted with
   zebra_term_untrans_iconv() and printed in quotes.
   NOTE(review): keystr and dst_buf are fixed 200-byte buffers that are
   appended to with sprintf once per key element or term byte - confirm
   that key.len and slen are small enough for that.
   \param zh Zebra handle
   \param is_insert whether the keys are being inserted or deleted
   \param reckeys key set to log
*/
1297 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1298 zebra_rec_keys_t reckeys,
1301 if (zebra_rec_keys_rewind(reckeys))
1306 NMEM nmem = nmem_create();
1308 while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1310 char keystr[200]; /* room for zints to print */
/* key.mem[0] holds the index ordinal */
1312 int ord = CAST_ZINT_TO_INT(key.mem[0]);
1313 const char *index_type;
1315 const char *string_index;
1317 zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1318 0/* db */, &string_index);
1320 zebra_term_untrans_iconv(zh, nmem, index_type,
1323 for (i = 0; i<key.len; i++)
1325 sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key.mem[i]);
/* special marker terms start below CHR_BASE_CHAR */
1328 if (*str < CHR_BASE_CHAR)
1331 char dst_buf[200]; /* room for special chars */
1333 strcpy(dst_buf , "?");
1335 if (!strcmp(str, ""))
1336 strcpy(dst_buf, "alwaysmatches");
1337 if (!strcmp(str, FIRST_IN_FIELD_STR))
1338 strcpy(dst_buf, "firstinfield");
1339 else if (!strcmp(str, CHR_UNKNOWN))
1340 strcpy(dst_buf, "unknown");
1341 else if (!strcmp(str, CHR_SPACE))
1342 strcpy(dst_buf, "space");
/* append the raw byte values of the marker */
1344 for (i = 0; i<slen; i++)
1346 sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1348 yaz_log(level, "%s%s %s %s", keystr, index_type,
1349 string_index, dst_buf);
1353 yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
1354 string_index, dst_term);
/* Adjust the per-index occurrence statistics for a record key set.
   The keys are read once and tallied per index ordinal (key_in.mem[0])
   in a temporary linked list of ord_stat nodes. Each node is then
   passed to zebraExplain_ord_adjust_occurrences(): with positive counts
   in one case and negated counts in the other.
   NOTE(review): the condition choosing between the two calls is not
   visible - presumably is_insert selects the positive one; confirm.
   \param zh Zebra handle
   \param is_insert whether the keys are being inserted or deleted
   \param reckeys key set to account for
*/
1362 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1363 zebra_rec_keys_t reckeys)
1365 ZebraExplainInfo zei = zh->reg->zei;
1369 struct ord_stat *next;
1372 if (zebra_rec_keys_rewind(reckeys))
1374 struct ord_stat *ord_list = 0;
1378 struct it_key key_in;
1379 while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1381 int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
/* look for an existing tally node for this ordinal */
1383 for (p = ord_list; p ; p = p->next)
1391 p = xmalloc(sizeof(*p));
1402 struct ord_stat *p1 = p;
1405 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1407 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
/* Write the index changes for one record to the key block.
   The key block is created on first use from the resources memmax
   (megabytes, default 8), keyTmpDir (default .) and threads (default 1).
   Occurrence statistics and the record count are adjusted for the
   insert and delete sets, both sets are rewound and then read in
   parallel. A delete key and an insert key that are identical (same
   rank, same key, same term) are recognised as a pair; other keys are
   written with key_block_write(), using command 0 for deletes and 1 for
   inserts. The counters normal and optimized are logged at the end.
   \param zh Zebra handle
   \param sysno system number of the record
   \param ins_keys keys to insert, may be 0
   \param ins_rank static rank for the inserted keys
   \param del_keys keys to delete, may be 0
   \param del_rank static rank for the deleted keys
*/
1414 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
1415 zebra_rec_keys_t ins_keys, zint ins_rank,
1416 zebra_rec_keys_t del_keys, zint del_rank)
1418 ZebraExplainInfo zei = zh->reg->zei;
/* lazily create the key block */
1422 if (!zh->reg->key_block)
1424 int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1425 const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1426 int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1427 zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1432 extract_rec_keys_adjust(zh, 1, ins_keys);
1434 zebraExplain_recordCountIncrement(zei, 1);
1435 zebra_rec_keys_rewind(ins_keys);
1439 extract_rec_keys_adjust(zh, 0, del_keys);
1441 zebraExplain_recordCountIncrement(zei, -1);
1442 zebra_rec_keys_rewind(del_keys);
1448 const char *del_str;
1449 struct it_key del_key_in;
1453 const char *ins_str;
1454 struct it_key ins_key_in;
1458 del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1461 ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
/* identical delete/insert pair: same rank, key and term bytes */
1464 if (del && ins && ins_rank == del_rank
1465 && !key_compare(&del_key_in, &ins_key_in)
1466 && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1476 key_block_write(zh->reg->key_block, sysno,
1477 &del_key_in, 0, del_str, del_slen,
1478 del_rank, zh->m_staticrank);
1480 key_block_write(zh->reg->key_block, sysno,
1481 &ins_key_in, 1, ins_str, ins_slen,
1482 ins_rank, zh->m_staticrank);
1484 yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
/* Convert the keys of a record into snippets.
   Each key is read from reckeys; its sequence number (last key element)
   and index ordinal (first key element) are taken, the term is
   converted with zebra_term_untrans_iconv() using the index type of
   that ordinal, and the result is appended to the snippet list.
   \param zh Zebra handle
   \param reckeys key set to convert
   \param snippets destination snippet list
*/
1488 ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
1489 zebra_rec_keys_t reckeys,
1490 zebra_snippets *snippets)
/* scratch memory for the converted terms */
1492 NMEM nmem = nmem_create();
1493 if (zebra_rec_keys_rewind(reckeys))
1498 while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1503 const char *index_type;
1505 assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1506 seqno = key.mem[key.len-1];
1507 ord = CAST_ZINT_TO_INT(key.mem[0]);
1509 zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1510 0/* db */, 0 /* string_index */);
1512 zebra_term_untrans_iconv(zh, nmem, index_type,
1514 zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
/* Debug helper: log every key of a record key set with YLOG_LOG.
   For each key the index ordinal, the sequence number and the term
   (converted with zebra_term_untrans() into a local buffer) are
   printed.
   \param zh Zebra handle
   \param reckeys key set to print
*/
1522 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1524 yaz_log(YLOG_LOG, "print_rec_keys");
1525 if (zebra_rec_keys_rewind(reckeys))
1530 while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1532 char dst_buf[IT_MAX_WORD];
1534 const char *index_type;
1535 int ord = CAST_ZINT_TO_INT(key.mem[0]);
1537 assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1539 zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
/* the sequence number is the last key element */
1541 seqno = key.mem[key.len-1];
1543 zebra_term_untrans(zh, index_type, dst_buf, str);
1545 yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT
1546 " term=%s", ord, seqno, dst_buf);
/* Add one term to the index keys of the record being extracted.
   The ordinal for (category, index_type, index_name) is looked up and,
   via zebraExplain_add_attr_str(), registered when needed. The key is
   then filled with the record id, the section id, the segment (only
   when segment indexing is enabled on the handle) and the sequence
   number, and written to zh->reg->keys together with the term.
   \param p word supplying index name and type, ids and seqno
   \param cat index category the term belongs to
   \param str term bytes
   \param length number of bytes in str
*/
1551 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1552 const char *str, int length)
1555 ZebraHandle zh = p->extractCtrl->handle;
1556 ZebraExplainInfo zei = zh->reg->zei;
1559 ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1561 ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1565 key.mem[i++] = p->record_id;
1566 key.mem[i++] = p->section_id;
/* the segment element is present only with segment indexing */
1568 if (zh->m_segment_indexing)
1569 key.mem[i++] = p->segment;
1570 key.mem[i++] = p->seqno;
1573 zebra_rec_keys_write(zh->reg->keys, str, length, &key);
/* Add one term to the sort keys of the record being extracted.
   The ordinal for (index_type, index_name) in the sort category is
   looked up and, via zebraExplain_add_attr_str(), registered when
   needed; the key carries the record id in its second element and is
   written to zh->reg->sortKeys together with the term.
   \param p word supplying index name and type and the record id
   \param str term bytes
   \param length number of bytes in str
*/
1576 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1579 ZebraHandle zh = p->extractCtrl->handle;
1580 ZebraExplainInfo zei = zh->reg->zei;
1582 zinfo_index_category_t cat = zinfo_index_category_sort;
1584 ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1586 ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1589 key.mem[1] = p->record_id;
1591 zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
/* Take the static rank of the record from a term.
   The term is copied into the local buffer valz (truncated to fit),
   terminated and converted with atozint(); the result is stored in the
   staticrank field of the extract control block.
   \param p word whose control block receives the rank
   \param str term bytes holding the number
   \param length number of bytes in str
*/
1594 static void extract_add_staticrank_string(RecWord *p,
1595 const char *str, int length)
1598 struct recExtractCtrl *ctrl = p->extractCtrl;
/* clamp so that the copy plus terminator fits in valz */
1600 if (length > sizeof(valz)-1)
1601 length = sizeof(valz)-1;
1603 memcpy(valz, str, length);
1604 valz[length] = '\0';
1605 ctrl->staticrank = atozint(valz);
/* Route one extracted term to the key set that matches its map.
   With detail logging enabled the escaped term is logged first. An
   index map adds the term to the index keys and, when the map also has
   alwaysmatches enabled, adds an empty term in the alwaysmatches
   category using a copy of the word. A sort map adds a sort key and a
   staticrank map sets the static rank.
   \param p word being extracted
   \param zm map that decides how the term is used
   \param string term bytes
   \param length number of bytes in string
*/
1608 static void extract_add_string(RecWord *p, zebra_map_t zm,
1609 const char *string, int length)
1615 if (log_level_details)
1618 WRBUF w = wrbuf_alloc();
1620 wrbuf_write_escaped(w, string, length);
1621 yaz_log(log_level_details, "extract_add_string: %s", wrbuf_cstr(w));
1624 if (zebra_maps_is_index(zm))
1626 extract_add_index_string(p, zinfo_index_category_index,
1628 if (zebra_maps_is_alwaysmatches(zm))
/* work on a copy so that the word of the caller stays untouched */
1631 memcpy(&word, p, sizeof(word));
1634 extract_add_index_string(
1635 &word, zinfo_index_category_alwaysmatches, "", 0);
1638 else if (zebra_maps_is_sort(zm))
1640 extract_add_sort_string(p, string, length);
1642 else if (zebra_maps_is_staticrank(zm))
1644 extract_add_staticrank_string(p, string, length);
/** \brief splits a term into space-separated words via the map's input
    mapping and adds the words with extract_add_string
    \param p word holding the term in term_buf / term_len
    \param zm map used for character mapping and for routing the words

    NOTE(review): the embedded listing numbers skip many lines inside this
    function (e.g. 1652, 1655, 1657-1659, 1661-1663, 1667, 1669-1674, 1680,
    1684-1692, 1697-1699, 1701-1703); the outer loop, the declaration and
    reset of i, the body of the copy loop, the else branches and any
    seqno handling are not visible here.
*/
1648 static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
/* b is the read position inside the term; remain is the byte count
   handed to zebra_maps_input */
1650 const char *b = p->term_buf;
1651 int remain = p->term_len;
1653 const char **map = 0;
/* b is passed by address and remain is recomputed from it after each
   call, so zebra_maps_input is expected to advance b */
1656 map = zebra_maps_input(zm, &b, remain, 0);
/* holds one word; the copy loop below stops at IT_MAX_WORD bytes */
1660 char buf[IT_MAX_WORD+1];
/* skip input whose mapped output starts with the first byte of CHR_SPACE */
1664 while (map && *map && **map == *CHR_SPACE)
1666 remain = p->term_len - (b - p->term_buf);
1668 map = zebra_maps_input(zm, &b, remain, 0);
/* collect mapped output until the next space or until map becomes null */
1675 while (map && *map && **map != *CHR_SPACE)
1677 const char *cp = *map;
/* cp walks the mapped string; the loop is bounded by IT_MAX_WORD */
1679 while (i < IT_MAX_WORD && *cp)
1681 remain = p->term_len - (b - p->term_buf);
1683 map = zebra_maps_input(zm, &b, remain, 0);
/* maps with the first-in-field property get an extra marker string.
   NOTE(review): the condition that limits this to the first word is on
   lines not shown - confirm */
1693 if (zebra_maps_is_first_in_field(zm))
1695 /* first in field marker */
1696 extract_add_string(p, zm, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
/* add the collected word; i is its length in bytes */
1700 extract_add_string(p, zm, buf, i);
/** \brief maps a whole term into one buffer and adds it as a single
    string with extract_add_string
    \param p word holding the term in term_buf / term_len
    \param zm map used for character mapping and for routing the result

    Unlike the word-by-word variant, the visible code makes exactly one
    extract_add_string call, after the loop, with everything collected
    in buf.

    NOTE(review): the embedded listing numbers skip lines inside this
    function (e.g. 1712, 1720-1722, 1725-1731, 1739-1743, 1747, 1753-1759);
    the guards around zebra_maps_input, the body of the CHR_CUT branch,
    the body of the copy loop and any early return are not visible here.
*/
1705 static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
/* b is the read position inside the term */
1707 const char *b = p->term_buf;
1708 char buf[IT_MAX_WORD+1];
1709 const char **map = 0;
/* i is the number of bytes collected in buf so far */
1710 int i = 0, remain = p->term_len;
/* the initial call passes 1 as last argument, later calls pass first or 0 */
1713 map = zebra_maps_input(zm, &b, remain, 1);
/* keep going while input remains and buf still has room */
1715 while (remain > 0 && i < IT_MAX_WORD)
/* skip input whose mapped output starts with the first byte of CHR_SPACE */
1717 while (map && *map && **map == *CHR_SPACE)
1719 remain = p->term_len - (b - p->term_buf);
/* first is 1 only while nothing has been collected yet */
1723 int first = i ? 0 : 1; /* first position */
1724 map = zebra_maps_input(zm, &b, remain, first);
/* separate this word from the previous one with a single space byte,
   but only when something was collected before and room is left */
1732 if (i && i < IT_MAX_WORD)
1733 buf[i++] = *CHR_SPACE;
/* collect mapped output until the next space or until map becomes null */
1734 while (map && *map && **map != *CHR_SPACE)
1736 const char *cp = *map;
/* NOTE(review): what happens for CHR_CUT is on lines not shown
   (1739-1743) - confirm against the full source */
1738 if (**map == *CHR_CUT)
1744 if (i >= IT_MAX_WORD)
/* cp walks the mapped string; the loop is bounded by IT_MAX_WORD */
1746 while (i < IT_MAX_WORD && *cp)
1749 remain = p->term_len - (b - p->term_buf);
1752 map = zebra_maps_input(zm, &b, remain, 0);
/* add the collected field; i is its length in bytes */
1760 extract_add_string(p, zm, buf, i);
/** \brief tokenizes a term with the map's tokenizer and adds every token
    with extract_add_string
    \param p word holding the term in term_buf / term_len
    \param zm map that performs the tokenizing and routes the tokens

    Called from extract_token_add when zebra_maps_is_icu(zm) is true.

    NOTE(review): the declaration of res_len (listing line 1766) and
    line 1772 inside the loop are not visible here.
*/
1763 static void extract_add_icu(RecWord *p, zebra_map_t zm)
1765 const char *res_buf = 0;
/* start a tokenizing pass over the whole term */
1768 zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
/* fetch tokens until the tokenizer returns 0; the two trailing output
   arguments are passed as 0 here */
1769 while (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0))
1771 extract_add_string(p, zm, res_buf, res_len);
1777 /** \brief top-level indexing handler for recctrl system
1778 \param p token data to be indexed
1782 extract_add_incomplete_field / extract_add_complete_field / extract_add_icu
1785 extract_add_index_string
1787 extract_add_sort_string
1789 extract_add_staticrank_string
1792 static void extract_token_add(RecWord *p)
1794 ZebraHandle zh = p->extractCtrl->handle;
/* the map is selected by the word's index type (get-or-add on the
   register's zebra_maps) */
1795 zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
/* optional detail logging; the term is printed with %.*s, so it does not
   have to be NUL-terminated */
1798 if (log_level_details)
1800 yaz_log(log_level_details, "extract_token_add "
1801 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1802 p->index_type, p->index_name,
1803 p->seqno, p->term_len, p->term_buf);
/* when zebra_replace returns a buffer, the word is redirected to it:
   term_buf / term_len of *p are overwritten and point into that WRBUF.
   NOTE(review): the declaration of wrbuf (listing line 1796) is not
   visible, and who owns the returned WRBUF cannot be told from here */
1805 if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len)))
1807 p->term_buf = wrbuf_buf(wrbuf);
1808 p->term_len = wrbuf_len(wrbuf);
/* dispatch: tokenizer-based handling for ICU maps; otherwise complete
   versus incomplete field handling.
   NOTE(review): the else keywords (listing lines 1814, 1818) are not
   shown - the three calls are presumably mutually exclusive, confirm */
1810 if (zebra_maps_is_icu(zm))
1812 extract_add_icu(p, zm);
1816 if (zebra_maps_is_complete(zm))
1817 extract_add_complete_field(p, zm);
1819 extract_add_incomplete_field(p, zm);
/** \brief replaces the handle's stored data buffer with a copy of buf
    \param p extract control; its handle member is used as the ZebraHandle
    \param buf data to copy
    \param sz number of bytes in buf

    Installed as p->setStoreData by extract_set_store_data_prepare.
*/
1823 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1824 void *buf, size_t sz)
1826 ZebraHandle zh = (ZebraHandle) p->handle;
/* drop whatever was stored before and leave the handle in the empty state */
1828 xfree(zh->store_data_buf);
1829 zh->store_data_buf = 0;
1830 zh->store_data_size = 0;
/* store a private copy, so the caller's buf is not referenced afterwards.
   NOTE(review): listing lines 1831-1832 are not shown; the copy is
   presumably guarded (e.g. against a null buf or zero sz) - confirm */
1833 zh->store_data_buf = xmalloc(sz);
1834 zh->store_data_size = sz;
1835 memcpy(zh->store_data_buf, buf, sz);
/** \brief clears the handle's stored data and installs the store-data
    callback on the extract control
    \param p extract control; its handle member is used as the ZebraHandle
    and its setStoreData member is set to extract_set_store_data_cb
*/
1839 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1841 ZebraHandle zh = (ZebraHandle) p->handle;
/* release any buffer left on the handle and reset pointer and size */
1842 xfree(zh->store_data_buf);
1843 zh->store_data_buf = 0;
1844 zh->store_data_size = 0;
/* from here on, calls through p->setStoreData reach the callback above */
1845 p->setStoreData = extract_set_store_data_cb;
/** \brief passes a schema OID on to the explain information of the
    handle's register
    \param p extract control; its handle member is used as the ZebraHandle
    \param oid OID handed unchanged to zebraExplain_addSchema
*/
1848 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1850 ZebraHandle zh = (ZebraHandle) p->handle;
1851 zebraExplain_addSchema(zh->reg->zei, oid);
/** \brief groups a record's sort keys by ordinal and applies them to the
    sort index in zh->reg->sort_index
    \param zh Zebra handle
    \param sysno record number used for entries whose key carries 0 in
    key slot 1
    \param cmd command value; it appears in the log call below and
    presumably selects between add and delete - the test itself is on a
    line not shown, confirm
    \param reckeys sort keys to read; nothing is done when
    zebra_rec_keys_rewind returns 0

    NOTE(review): the embedded listing numbers skip lines inside this
    function (e.g. 1857, 1859, 1861, 1866-1867, 1872-1873, 1875-1877,
    1887-1889, 1891, 1893-1894, 1913, 1915, 1918 onwards); the remaining
    struct members, the list-link code, the add/delete condition and the
    release of nmem are not visible here.
*/
1854 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1855 int cmd, zebra_rec_keys_t reckeys)
/* logging of the request and of the keys.
   NOTE(review): possibly guarded by lines not shown (1857, 1861) - confirm */
1858 yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1860 extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
1863 if (zebra_rec_keys_rewind(reckeys))
1865 zebra_sort_index_t si = zh->reg->sort_index;
1868 struct it_key key_in;
/* list nodes are allocated from this NMEM handle */
1870 NMEM nmem = nmem_create();
/* one node per distinct ordinal; the visible code uses the members
   ord, sysno, wrbuf and next */
1871 struct sort_add_ent {
1874 struct sort_add_ent *next;
1878 struct sort_add_ent *sort_ent_list = 0;
/* pass 1: read every key and collect its string under its ordinal */
1880 while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
/* key slot 0 is the ordinal, slot 1 an optional record number */
1882 int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1883 zint filter_sysno = key_in.mem[1];
/* linear search for an existing node with this ordinal; e ends up
   pointing at the matching link or at the link where the search stopped */
1885 struct sort_add_ent **e = &sort_ent_list;
1886 while (*e && (*e)->ord != ord)
/* NOTE(review): the test guarding node creation (listing line 1888) is
   not shown; presumably a node is created only when none matched */
1890 *e = nmem_malloc(nmem, sizeof(**e));
1892 (*e)->wrbuf = wrbuf_alloc();
/* a non-zero record number from the key overrides the sysno argument */
1895 (*e)->sysno = filter_sysno ? filter_sysno : sysno;
/* each string is appended followed by a NUL byte, so one WRBUF can
   carry several strings for the same ordinal */
1898 wrbuf_write((*e)->wrbuf, str, slen);
1899 wrbuf_putc((*e)->wrbuf, '\0');
/* pass 2: walk the nodes and apply them to the sort index */
1903 zint last_sysno = 0;
1904 struct sort_add_ent *e = sort_ent_list;
1905 for (; e; e = e->next)
/* zebra_sort_sysno is only called when the record number differs from
   the previous node's */
1907 if (last_sysno != e->sysno)
1909 zebra_sort_sysno(si, e->sysno);
1910 last_sysno = e->sysno;
1912 zebra_sort_type(si, e->ord);
/* NOTE(review): the condition choosing between add and delete (listing
   lines 1913, 1915) is not shown - confirm against the full source */
1914 zebra_sort_add(si, e->wrbuf);
1916 zebra_sort_delete(si);
/* the WRBUF is released here, once per node */
1917 wrbuf_destroy(e->wrbuf);
1927 * indent-tabs-mode: nil
1929 * vim: shiftwidth=4 tabstop=8 expandtab