1 /* $Id: extract.c,v 1.147 2004-01-22 11:27:21 adam Exp $
2 Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
/* PRINTF_OFF_T is the printf conversion used for off_t record offsets in the
   log and progress messages of this file; the variant is chosen by
   _FILE_OFFSET_BITS.
   NOTE(review): "%Ld" is not a standard conversion for integer arguments
   (ISO C defines the L length modifier for long double only); "%lld" is the
   portable spelling - confirm against the supported C libraries. */
37 #if _FILE_OFFSET_BITS == 64
38 #define PRINTF_OFF_T "%Ld"
40 #define PRINTF_OFF_T "%ld"
/* NOTE(review): presumably selects the local shellsort() instead of qsort();
   the conditional that tests it is not visible in this listing. */
43 #define USE_SHELLSORT 0
/* Shell sort of an array of r elements, each s bytes wide, ordered by cmp
   (same comparator contract as qsort).  The gap sequence is the fixed table
   incs[], walked from the largest gap down to 1.
   NOTE(review): several lines of this routine (locals, the save/restore of
   the element being inserted) are not visible in this listing. */
46 static void shellsort(void *ar, int r, size_t s,
47 int (*cmp)(const void *a, const void *b))
52 static const int incs[16] = { 1391376, 463792, 198768, 86961, 33936,
53 13776, 4592, 1968, 861, 336,
54 112, 48, 21, 7, 3, 1 };
55 for ( k = 0; k < 16; k++)
56 for (h = incs[k], i = h; i < r; i++)
/* Move elements that compare greater than v up by one gap of h.
   NOTE(review): with the bound j > h the element at index j-h == 0 is never
   compared unless a is offset from ar - confirm against the hidden lines. */
60 while (j > h && (*cmp)(a + s*(j-h), v) > 0)
62 memcpy (a + s*j, a + s*(j-h), s);
/* Count one more processed record and, on every 1000th record, log the
   running total together with the i/u/d counters. */
70 static void logRecord (ZebraHandle zh)
72 ++zh->records_processed;
73 if (!(zh->records_processed % 1000))
75 logf (LOG_LOG, "Records: %7d i/u/d %d/%d/%d",
76 zh->records_processed, zh->records_inserted, zh->records_updated,
/* Initialise the RecWord w for the extraction run described by p: the
   zebra_maps handle is copied from p and the attribute set defaults to
   VAL_BIB1.  Further fields may be set on lines not visible in this
   listing. */
81 static void extract_init (struct recExtractCtrl *p, RecWord *w)
83 w->zebra_maps = p->zebra_maps;
85 w->attrSet = VAL_BIB1;
/* Walk the packed key buffer in reckeys looking for keys whose attribute
   set and use equal attrSetS / attrUseS.
   Matches are collected per sequence-number offset in the function-static
   array ws[32], whose contents are therefore overwritten by every call
   (presumably this array is the return value; the return statement is not
   visible in this listing). */
91 static const char **searchRecordKey (ZebraHandle zh,
92 struct recKeys *reckeys,
93 int attrSetS, int attrUseS)
95 static const char *ws[32];
107 for (i = 0; i<32; i++)
111 chS = zebraExplain_lookupSU (zh->reg->zei, attrSetS, attrUseS);
115 while (off < reckeys->buf_used)
118 const char *src = reckeys->buf + off;
126 memcpy (&ch, src, sizeof(ch));
132 memcpy (&attrSet, src, sizeof(attrSet));
133 src += sizeof(attrSet);
137 memcpy (&attrUse, src, sizeof(attrUse));
138 src += sizeof(attrUse);
/* compact form: sequence number delta kept in bits 2..5 of the lead byte,
   biased by one */
145 seqno += ((lead>>2) & 15)-1;
/* long form: the full sequence number is stored in the buffer */
148 memcpy (&seqno, src, sizeof(seqno));
149 src += sizeof(seqno);
155 attrUseS == attrUse && attrSetS == attrSet
164 woff = seqno - startSeq;
/* NOTE(review): only offsets 0..30 are accepted although ws[] has 32
   entries - confirm this is intentional. */
165 if (woff >= 0 && woff < 31)
169 off = src - reckeys->buf;
/* the decoder must consume the buffer exactly */
171 assert (off == reckeys->buf_used);
/* Reader state passed as the opaque handle to the file_read / file_seek /
   file_tell / file_begin / file_end callbacks below.  Members used by those
   callbacks but not visible in this listing include fd, file_more, sdrbuf
   and sdrmax. */
175 struct file_read_info {
176 off_t file_max; /* maximum offset so far */
177 off_t file_offset; /* current offset */
178 off_t file_moffset; /* offset of rec/rec boundary */
/* Allocate (with xmalloc) a reader state for descriptor fd.  The record
   boundary offset starts at 0; the initialisation of the other members is
   not visible in this listing. */
185 static struct file_read_info *file_read_start (int fd)
187 struct file_read_info *fi = (struct file_read_info *)
188 xmalloc (sizeof(*fi));
192 fi->file_moffset = 0;
/* NOTE(review): presumably releases a reader state obtained from
   file_read_start(); the body is not visible in this listing - confirm. */
198 static void file_read_stop (struct file_read_info *fi)
/* Seek callback: records offset as the current position of the reader
   state behind handle and returns the result of lseek() on its descriptor.
   An intervening branch (original lines 207-208) is not visible in this
   listing. */
203 static off_t file_seek (void *handle, off_t offset)
205 struct file_read_info *p = (struct file_read_info *) handle;
206 p->file_offset = offset;
209 return lseek (p->fd, offset, SEEK_SET);
/* Tell callback: returns the current offset kept in the reader state. */
212 static off_t file_tell (void *handle)
214 struct file_read_info *p = (struct file_read_info *) handle;
215 return p->file_offset;
/* Read callback: delivers up to count bytes into buf.  Two sources are
   visible: the in-memory buffer p->sdrbuf, where the amount is clamped to
   the bytes left before p->sdrmax, and read() on the descriptor.  The
   condition choosing between them and the update of file_offset are on
   lines not visible in this listing. */
218 static int file_read (void *handle, char *buf, size_t count)
220 struct file_read_info *p = (struct file_read_info *) handle;
226 if (r > p->sdrmax - p->file_offset)
227 r = p->sdrmax - p->file_offset;
229 memcpy (buf, p->sdrbuf + p->file_offset, r);
232 r = read (fd, buf, count);
/* remember the highest offset reached so far */
236 if (p->file_offset > p->file_max)
237 p->file_max = p->file_offset;
/* Rewind the reader to the last record boundary: the current offset is set
   to file_moffset and, when no in-memory buffer is in use and the boundary
   is non-zero, the descriptor is repositioned there as well. */
242 static void file_begin (void *handle)
244 struct file_read_info *p = (struct file_read_info *) handle;
246 p->file_offset = p->file_moffset;
247 if (!p->sdrbuf && p->file_moffset)
248 lseek (p->fd, p->file_moffset, SEEK_SET);
/* End callback: stores offset as the new record/record boundary.  Asserts
   that file_more is still 0 when called; any update of file_more is on a
   line not visible in this listing. */
252 static void file_end (void *handle, off_t offset)
254 struct file_read_info *p = (struct file_read_info *) handle;
256 assert (p->file_more == 0);
258 p->file_moffset = offset;
/* Build the match string of a record from the match specification spec.
   Three kinds of element are visible in the parser below:
     - an attribute reference "attset,attname)" (the opening delimiter is
       tested on a line not visible here), resolved with searchRecordKey()
       against the keys in reckeys;
     - a special name, one of group, database, filename or type, replaced by
       the corresponding setting;
     - a literal enclosed in single or double quotes.
   The result is assembled in the function-static buffer dstBuf, so it is
   overwritten by the next call.  Problems are reported with LOG_WARN; the
   return statements are not visible in this listing.
   NOTE(review): attset_str/attname_str (64 bytes) and dstBuf (2048 bytes)
   are filled with no bounds check visible in this listing - confirm. */
261 static char *fileMatchStr (ZebraHandle zh,
262 struct recKeys *reckeys,
263 const char *fname, const char *spec)
265 static char dstBuf[2048]; /* static here ??? */
267 const char *s = spec;
268 static const char **w;
/* skip blanks between elements */
272 while (*s == ' ' || *s == '\t')
278 char attset_str[64], attname_str[64];
279 data1_attset *attset;
282 int attSet = 1, attUse = 1;
/* attribute set name: everything up to ',' or ')' */
286 for (i = 0; *s && *s != ',' && *s != ')'; s++)
288 attset_str[i++] = *s;
289 attset_str[i] = '\0';
/* attribute name: everything up to ')' */
294 for (i = 0; *s && *s != ')'; s++)
296 attname_str[i++] = *s;
297 attname_str[i] = '\0';
300 if ((attset = data1_get_attset (zh->reg->dh, attset_str)))
303 attSet = attset->reference;
304 att = data1_getattbyname(zh->reg->dh, attset, attname_str);
/* numeric fallback for the attribute use */
308 attUse = atoi (attname_str);
310 w = searchRecordKey (zh, reckeys, attSet, attUse);
315 for (i = 0; i<32; i++)
320 logf (LOG_WARN, "Missing ) in match criteria %s in group %s",
321 spec, zh->m_group ? zh->m_group : "none");
326 for (i = 0; i<32; i++)
327 if (matchFlag[i] && w[i])
339 logf (LOG_WARN, "Record didn't contain match"
340 " fields in (%s,%s)", attset_str, attname_str);
348 const char *spec_src = NULL;
/* the special name runs up to the next blank */
349 const char *s1 = ++s;
350 while (*s1 && *s1 != ' ' && *s1 != '\t')
356 memcpy (special, s, spec_len);
357 special[spec_len] = '\0';
360 if (!strcmp (special, "group"))
361 spec_src = zh->m_group;
362 else if (!strcmp (special, "database"))
363 spec_src = zh->basenames[0];
364 else if (!strcmp (special, "filename")) {
367 else if (!strcmp (special, "type"))
368 spec_src = zh->m_record_type;
373 strcpy (dst, spec_src);
374 dst += strlen (spec_src);
/* quoted literal: copied up to the matching quote character */
377 else if (*s == '\"' || *s == '\'')
379 int stopMarker = *s++;
383 while (*s && *s != stopMarker)
386 tmpString[i++] = *s++;
391 strcpy (dst, tmpString);
392 dst += strlen (tmpString);
396 logf (LOG_WARN, "Syntax error in match criteria %s in group %s",
397 spec, zh->m_group ? zh->m_group : "none");
404 logf (LOG_WARN, "No match criteria for record %s in group %s",
405 fname, zh->m_group ? zh->m_group : "none");
/* NOTE(review): only the rGroup member is visible in this listing and no
   use of this struct appears in this part of the file - confirm whether it
   is still needed. */
412 struct recordLogInfo {
415 struct recordGroup *rGroup;
/* Extract one record from the file behind fi with the filter named by
   zh->m_record_type and apply the result to the register: depending on
   whether a system number is known (or can be found in the match
   dictionary) the record is added, updated, deleted or skipped, and the
   stored record information (file type, file name, delete keys, sort keys,
   size, run number, raw data, database name, offset) is refreshed and
   committed with rec_put().
   fname is used in log messages and stored with the record.  Two
   parameters on lines not visible in this listing are passed by fileExtract
   as deleteFlag and 1; the latter is presumably the force_update tested
   below.  fileExtract keeps calling this function while the result is
   non-zero; the return statements themselves are not visible here. */
418 static int file_extract_record(ZebraHandle zh,
419 SYSNO *sysno, const char *fname,
421 struct file_read_info *fi,
424 RecordAttr *recordAttr;
429 off_t recordOffset = 0;
435 recType_byName (zh->reg->recTypes, zh->m_record_type, subType,
438 logf (LOG_WARN, "No such record type: %s", zh->m_record_type);
442 /* announce database */
443 if (zebraExplain_curDatabase (zh->reg->zei, zh->basenames[0]))
445 if (zebraExplain_newDatabase (zh->reg->zei, zh->basenames[0],
446 zh->m_explain_database))
452 struct recExtractCtrl extractCtrl;
454 /* we are going to read from a file, so prepare the extraction */
/* start with empty key and sort-key buffers */
457 zh->reg->keys.buf_used = 0;
458 zh->reg->keys.prevAttrUse = -1;
459 zh->reg->keys.prevAttrSet = -1;
460 zh->reg->keys.prevSeqNo = 0;
461 zh->reg->sortKeys.buf_used = 0;
/* the record starts at the boundary left by the previous record */
463 recordOffset = fi->file_moffset;
464 extractCtrl.offset = fi->file_moffset;
465 extractCtrl.readf = file_read;
466 extractCtrl.seekf = file_seek;
467 extractCtrl.tellf = file_tell;
468 extractCtrl.endf = file_end;
470 extractCtrl.subType = subType;
471 extractCtrl.init = extract_init;
472 extractCtrl.tokenAdd = extract_token_add;
473 extractCtrl.schemaAdd = extract_schema_add;
474 extractCtrl.dh = zh->reg->dh;
475 extractCtrl.handle = zh;
/* sequence numbers start at 1 for positioned register types, 0 otherwise */
476 for (i = 0; i<256; i++)
478 if (zebra_maps_is_positioned(zh->reg->zebra_maps, i))
479 extractCtrl.seqno[i] = 1;
481 extractCtrl.seqno[i] = 0;
483 extractCtrl.zebra_maps = zh->reg->zebra_maps;
484 extractCtrl.flagShowRecords = !zh->m_flag_rw;
487 printf ("File: %s " PRINTF_OFF_T "\n", fname, recordOffset);
/* prefix log lines emitted by the filter with "file:offset" */
491 sprintf (msg, "%s:" PRINTF_OFF_T , fname, recordOffset);
492 yaz_log_init_prefix2 (msg);
495 r = (*recType->extract)(clientData, &extractCtrl);
497 yaz_log_init_prefix2 (0);
498 if (r == RECCTRL_EXTRACT_EOF)
500 else if (r == RECCTRL_EXTRACT_ERROR_GENERIC)
502 /* error occurred during extraction ... */
504 zh->records_processed < zh->m_file_verbose_limit)
506 logf (LOG_WARN, "fail %s %s " PRINTF_OFF_T, zh->m_record_type,
507 fname, recordOffset);
511 else if (r == RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER)
513 /* error occurred during extraction ... */
515 zh->records_processed < zh->m_file_verbose_limit)
517 logf (LOG_WARN, "no filter for %s %s "
518 PRINTF_OFF_T, zh->m_record_type,
519 fname, recordOffset);
523 if (zh->reg->keys.buf_used == 0)
525 /* the extraction process returned no information - the record
526 is probably empty - unless flagShowRecords is in use */
530 logf (LOG_WARN, "empty %s %s " PRINTF_OFF_T, zh->m_record_type,
531 fname, recordOffset);
536 /* perform match if sysno not known and if match criteria is specified */
543 if (zh->m_record_id && *zh->m_record_id)
547 matchStr = fileMatchStr (zh, &zh->reg->keys, fname,
/* an existing dictionary entry yields the system number of the record */
551 rinfo = dict_lookup (zh->reg->matchDict, matchStr);
553 memcpy (sysno, rinfo+1, sizeof(*sysno));
557 logf (LOG_WARN, "Bad match criteria");
568 logf (LOG_LOG, "delete %s %s " PRINTF_OFF_T, zh->m_record_type,
569 fname, recordOffset);
570 logf (LOG_WARN, "cannot delete record above (seems new)");
573 if (zh->records_processed < zh->m_file_verbose_limit)
574 logf (LOG_LOG, "add %s %s " PRINTF_OFF_T, zh->m_record_type,
575 fname, recordOffset);
576 rec = rec_new (zh->reg->records);
580 recordAttr = rec_init_attr (zh->reg->zei, rec);
584 dict_insert (zh->reg->matchDict, matchStr, sizeof(*sysno), sysno);
/* new record: flush the fresh sort keys and index keys with cmd 1 */
586 extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys);
587 extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys);
589 zh->records_inserted++;
593 /* record already exists */
594 struct recKeys delkeys;
595 struct sortKeys sortKeys;
597 rec = rec_get (zh->reg->records, *sysno);
600 recordAttr = rec_init_attr (zh->reg->zei, rec);
/* unless forced, a record already stamped with the current run number is
   left alone */
602 if (!force_update && recordAttr->runNumber ==
603 zebraExplain_runNumberIncrement (zh->reg->zei, 0))
605 yaz_log (LOG_LOG, "run number = %d", recordAttr->runNumber);
606 yaz_log (LOG_LOG, "skipped %s %s " PRINTF_OFF_T,
607 zh->m_record_type, fname, recordOffset);
608 extract_flushSortKeys (zh, *sysno, -1, &zh->reg->sortKeys);
/* the keys stored with the existing record are flushed with cmd 0 */
613 delkeys.buf_used = rec->size[recInfo_delKeys];
614 delkeys.buf = rec->info[recInfo_delKeys];
616 sortKeys.buf_used = rec->size[recInfo_sortKeys];
617 sortKeys.buf = rec->info[recInfo_sortKeys];
619 extract_flushSortKeys (zh, *sysno, 0, &sortKeys);
620 extract_flushRecordKeys (zh, *sysno, 0, &delkeys);
623 /* record going to be deleted */
624 if (!delkeys.buf_used)
626 logf (LOG_LOG, "delete %s %s " PRINTF_OFF_T,
627 zh->m_record_type, fname, recordOffset);
628 logf (LOG_WARN, "cannot delete file above, storeKeys false");
632 if (zh->records_processed < zh->m_file_verbose_limit)
633 logf (LOG_LOG, "delete %s %s " PRINTF_OFF_T,
634 zh->m_record_type, fname, recordOffset);
635 zh->records_deleted++;
637 dict_delete (zh->reg->matchDict, matchStr);
638 rec_del (zh->reg->records, &rec);
646 /* record going to be updated */
647 if (!delkeys.buf_used)
649 logf (LOG_LOG, "update %s %s " PRINTF_OFF_T,
650 zh->m_record_type, fname, recordOffset);
651 logf (LOG_WARN, "cannot update file above, storeKeys false");
655 if (zh->records_processed < zh->m_file_verbose_limit)
656 logf (LOG_LOG, "update %s %s " PRINTF_OFF_T,
657 zh->m_record_type, fname, recordOffset);
658 extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys);
659 extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys);
660 zh->records_updated++;
664 /* update file type */
665 xfree (rec->info[recInfo_fileType]);
666 rec->info[recInfo_fileType] =
667 rec_strdup (zh->m_record_type, &rec->size[recInfo_fileType]);
669 /* update filename */
670 xfree (rec->info[recInfo_filename]);
671 rec->info[recInfo_filename] =
672 rec_strdup (fname, &rec->size[recInfo_filename]);
674 /* update delete keys */
675 xfree (rec->info[recInfo_delKeys]);
676 if (zh->reg->keys.buf_used > 0 && zh->m_store_keys == 1)
/* the key buffer is handed over to the record, not copied; the register
   starts with a fresh buffer next time */
678 rec->size[recInfo_delKeys] = zh->reg->keys.buf_used;
679 rec->info[recInfo_delKeys] = zh->reg->keys.buf;
680 zh->reg->keys.buf = NULL;
681 zh->reg->keys.buf_max = 0;
685 rec->info[recInfo_delKeys] = NULL;
686 rec->size[recInfo_delKeys] = 0;
689 /* update sort keys */
690 xfree (rec->info[recInfo_sortKeys]);
692 rec->size[recInfo_sortKeys] = zh->reg->sortKeys.buf_used;
693 rec->info[recInfo_sortKeys] = zh->reg->sortKeys.buf;
694 zh->reg->sortKeys.buf = NULL;
695 zh->reg->sortKeys.buf_max = 0;
697 /* save file size of original record */
698 zebraExplain_recordBytesIncrement (zh->reg->zei,
699 - recordAttr->recordSize);
/* size is the distance to the new record boundary, or to the highest
   offset read when the boundary did not move */
700 recordAttr->recordSize = fi->file_moffset - recordOffset;
701 if (!recordAttr->recordSize)
702 recordAttr->recordSize = fi->file_max - recordOffset;
703 zebraExplain_recordBytesIncrement (zh->reg->zei,
704 recordAttr->recordSize);
706 /* set run-number for this record */
707 recordAttr->runNumber = zebraExplain_runNumberIncrement (zh->reg->zei,
710 /* update store data */
711 xfree (rec->info[recInfo_storeData]);
712 if (zh->m_store_data)
/* re-read the raw bytes of the record from the file into the record */
714 rec->size[recInfo_storeData] = recordAttr->recordSize;
715 rec->info[recInfo_storeData] = (char *)
716 xmalloc (recordAttr->recordSize);
717 if (lseek (fi->fd, recordOffset, SEEK_SET) < 0)
719 logf (LOG_ERRNO|LOG_FATAL, "seek to " PRINTF_OFF_T " in %s",
720 recordOffset, fname);
/* a short read is reported as fatal */
723 if (read (fi->fd, rec->info[recInfo_storeData], recordAttr->recordSize)
724 < recordAttr->recordSize)
726 logf (LOG_ERRNO|LOG_FATAL, "read %d bytes of %s",
727 recordAttr->recordSize, fname);
733 rec->info[recInfo_storeData] = NULL;
734 rec->size[recInfo_storeData] = 0;
736 /* update database name */
737 xfree (rec->info[recInfo_databaseName]);
738 rec->info[recInfo_databaseName] =
739 rec_strdup (zh->basenames[0], &rec->size[recInfo_databaseName]);
742 recordAttr->recordOffset = recordOffset;
744 /* commit this record */
745 rec_put (zh->reg->records, &rec);
/* Process the file fname: work out the group prefix and the file
   extension, resolve the record type and the record id (match criteria)
   from the resources when they are not already set on zh, open the file
   and run file_extract_record() on it, repeating while that call returns
   non-zero, no sysno was supplied and the reader reports more data.
   NOTE(review): gprefix, ext, ext_res and full_rep are filled with
   sprintf/strcpy/strcat and no bounds check is visible in this listing -
   confirm the buffer sizes against the longest possible group name and
   path. */
750 int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname,
757 struct file_read_info *fi;
759 if (!zh->m_group || !*zh->m_group)
762 sprintf (gprefix, "%s.", zh->m_group);
764 logf (LOG_DEBUG, "fileExtract %s", fname);
766 /* determine file extension */
768 for (i = strlen(fname); --i >= 0; )
771 else if (fname[i] == '.')
773 strcpy (ext, fname+i+1);
776 /* determine file type - depending on extension */
777 if (!zh->m_record_type)
779 sprintf (ext_res, "%srecordType.%s", gprefix, ext);
780 zh->m_record_type = res_get (zh->res, ext_res);
782 if (!zh->m_record_type)
784 if (zh->records_processed < zh->m_file_verbose_limit)
785 logf (LOG_LOG, "? %s", fname);
788 /* determine match criteria */
789 if (!zh->m_record_id)
791 sprintf (ext_res, "%srecordId.%s", gprefix, ext);
792 zh->m_record_id = res_get (zh->res, ext_res);
795 if (sysno && deleteFlag)
/* relative names are resolved against zh->path_reg when it is set */
801 if (zh->path_reg && !yaz_is_abspath (fname))
803 strcpy (full_rep, zh->path_reg);
804 strcat (full_rep, "/");
805 strcat (full_rep, fname);
808 strcpy (full_rep, fname);
811 if ((fd = open (full_rep, O_BINARY|O_RDONLY)) == -1)
813 logf (LOG_WARN|LOG_ERRNO, "open %s", full_rep);
817 fi = file_read_start (fd);
821 r = file_extract_record (zh, sysno, fname, deleteFlag, fi, 1);
822 } while (r && !sysno && fi->file_more);
830 If sysno is provided, then it's used to identify the record.
831 If not, and match_criteria is provided, then sysno is guessed
832 If not, and a record is provided, then sysno is got from there
/* Extract one record held in memory (buf, buf_size) and apply it to the
   register; the in-memory counterpart of file_extract_record().
   The record type comes from recordType when it is given and non-empty,
   otherwise from zh->m_record_type.  The match string comes from
   match_criteria when it is given and non-empty, otherwise it is built from
   zh->m_record_id with fileMatchStr().  fname may be omitted; messages then
   use "<no file>".  Parameters on lines not visible in this listing
   include the ones used below as sysno, subType/clientData, allow_update and
   force_update.  The return statements are not visible here. */
835 int buffer_extract_record (ZebraHandle zh,
836 const char *buf, size_t buf_size,
839 const char *recordType,
841 const char *match_criteria,
846 RecordAttr *recordAttr;
847 struct recExtractCtrl extractCtrl;
850 RecType recType = NULL;
854 long recordOffset = 0;
855 struct zebra_fetch_control fc;
856 const char *pr_fname = fname; /* filename to print .. */
859 pr_fname = "<no file>"; /* make it printable if file is omitted */
/* the filter reads from the caller's buffer through the zebra_record_int_*
   callbacks */
862 fc.record_int_buf = buf;
863 fc.record_int_len = buf_size;
864 fc.record_int_pos = 0;
866 fc.record_offset = 0;
868 extractCtrl.offset = 0;
869 extractCtrl.readf = zebra_record_int_read;
870 extractCtrl.seekf = zebra_record_int_seek;
871 extractCtrl.tellf = zebra_record_int_tell;
872 extractCtrl.endf = zebra_record_int_end;
873 extractCtrl.fh = &fc;
/* start with empty key and sort-key buffers */
875 zh->reg->keys.buf_used = 0;
876 zh->reg->keys.prevAttrUse = -1;
877 zh->reg->keys.prevAttrSet = -1;
878 zh->reg->keys.prevSeqNo = 0;
879 zh->reg->sortKeys.buf_used = 0;
881 if (zebraExplain_curDatabase (zh->reg->zei, zh->basenames[0]))
883 if (zebraExplain_newDatabase (zh->reg->zei, zh->basenames[0],
884 zh->m_explain_database))
888 if (recordType && *recordType) {
889 logf (LOG_DEBUG, "Record type explicitly specified: %s", recordType);
890 recType = recType_byName (zh->reg->recTypes, recordType, subType,
893 if (!(zh->m_record_type)) {
894 logf (LOG_WARN, "No such record type defined");
897 logf (LOG_DEBUG, "Get record type from rgroup: %s",zh->m_record_type);
898 recType = recType_byName (zh->reg->recTypes, zh->m_record_type, subType,
900 recordType = zh->m_record_type;
/* NOTE(review): this message prints zh->m_record_type even when an explicit
   recordType was looked up above; in that case zh->m_record_type may be
   unset - confirm and consider printing recordType instead. */
904 logf (LOG_WARN, "No such record type: %s", zh->m_record_type);
908 extractCtrl.subType = subType;
909 extractCtrl.init = extract_init;
910 extractCtrl.tokenAdd = extract_token_add;
911 extractCtrl.schemaAdd = extract_schema_add;
912 extractCtrl.dh = zh->reg->dh;
913 extractCtrl.handle = zh;
914 extractCtrl.zebra_maps = zh->reg->zebra_maps;
915 extractCtrl.flagShowRecords = 0;
/* sequence numbers start at 1 for positioned register types, 0 otherwise */
916 for (i = 0; i<256; i++)
918 if (zebra_maps_is_positioned(zh->reg->zebra_maps, i))
919 extractCtrl.seqno[i] = 1;
921 extractCtrl.seqno[i] = 0;
924 r = (*recType->extract)(clientData, &extractCtrl);
926 if (r == RECCTRL_EXTRACT_EOF)
928 else if (r == RECCTRL_EXTRACT_ERROR_GENERIC)
930 /* error occurred during extraction ... */
931 yaz_log (LOG_WARN, "extract error: generic");
934 else if (r == RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER)
936 /* error occurred during extraction ... */
937 yaz_log (LOG_WARN, "extract error: no such filter");
940 if (zh->reg->keys.buf_used == 0)
942 /* the extraction process returned no information - the record
943 is probably empty - unless flagShowRecords is in use */
946 logf (LOG_WARN, "No keys generated for record");
947 logf (LOG_WARN, " The file is probably empty");
/* an explicit match string takes precedence over the configured record id;
   the cast only drops const, the string is the caller's */
955 if (match_criteria && *match_criteria) {
956 matchStr = (char *)match_criteria;
958 if (zh->m_record_id && *zh->m_record_id) {
959 matchStr = fileMatchStr (zh, &zh->reg->keys, pr_fname,
964 rinfo = dict_lookup (zh->reg->matchDict, matchStr);
966 memcpy (sysno, rinfo+1, sizeof(*sysno));
968 logf (LOG_WARN, "Bad match criteria (recordID)");
978 logf (LOG_LOG, "delete %s %s %ld", recordType,
979 pr_fname, (long) recordOffset);
980 logf (LOG_WARN, "cannot delete record above (seems new)");
983 logf (LOG_LOG, "add %s %s %ld", recordType, pr_fname,
984 (long) recordOffset);
985 rec = rec_new (zh->reg->records);
989 recordAttr = rec_init_attr (zh->reg->zei, rec);
993 dict_insert (zh->reg->matchDict, matchStr,
994 sizeof(*sysno), sysno);
/* new record: flush the fresh sort keys and index keys with cmd 1 */
996 extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys);
997 extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys);
999 zh->records_inserted++;
1003 /* record already exists */
1004 struct recKeys delkeys;
1005 struct sortKeys sortKeys;
1007 if (!allow_update) {
1008 logf (LOG_LOG, "skipped %s %s %ld",
1009 recordType, pr_fname, (long) recordOffset);
1014 rec = rec_get (zh->reg->records, *sysno);
1017 recordAttr = rec_init_attr (zh->reg->zei, rec);
/* unless forced, a record already stamped with the current run number is
   left alone */
1019 if (!force_update) {
1020 if (recordAttr->runNumber ==
1021 zebraExplain_runNumberIncrement (zh->reg->zei, 0))
1023 logf (LOG_LOG, "skipped %s %s %ld", recordType,
1024 pr_fname, (long) recordOffset);
1025 extract_flushSortKeys (zh, *sysno, -1, &zh->reg->sortKeys);
/* the keys stored with the existing record are flushed with cmd 0 */
1032 delkeys.buf_used = rec->size[recInfo_delKeys];
1033 delkeys.buf = rec->info[recInfo_delKeys];
1035 sortKeys.buf_used = rec->size[recInfo_sortKeys];
1036 sortKeys.buf = rec->info[recInfo_sortKeys];
1038 extract_flushSortKeys (zh, *sysno, 0, &sortKeys);
1039 extract_flushRecordKeys (zh, *sysno, 0, &delkeys);
1042 /* record going to be deleted */
1043 if (!delkeys.buf_used)
1045 logf (LOG_LOG, "delete %s %s %ld", recordType,
1046 pr_fname, (long) recordOffset);
1047 logf (LOG_WARN, "cannot delete file above, storeKeys false");
1051 logf (LOG_LOG, "delete %s %s %ld", recordType,
1052 pr_fname, (long) recordOffset);
1053 zh->records_deleted++;
1055 dict_delete (zh->reg->matchDict, matchStr);
1056 rec_del (zh->reg->records, &rec);
1064 /* record going to be updated */
1065 if (!delkeys.buf_used)
1067 logf (LOG_LOG, "update %s %s %ld", recordType,
1068 pr_fname, (long) recordOffset);
1069 logf (LOG_WARN, "cannot update file above, storeKeys false");
1073 logf (LOG_LOG, "update %s %s %ld", recordType,
1074 pr_fname, (long) recordOffset);
1075 extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys);
1076 extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys);
1077 zh->records_updated++;
1081 /* update file type */
1082 xfree (rec->info[recInfo_fileType]);
1083 rec->info[recInfo_fileType] =
1084 rec_strdup (recordType, &rec->size[recInfo_fileType]);
1086 /* update filename */
/* NOTE(review): fname, not pr_fname, is stored here and may be NULL when
   the file name was omitted - confirm rec_strdup accepts NULL. */
1087 xfree (rec->info[recInfo_filename]);
1088 rec->info[recInfo_filename] =
1089 rec_strdup (fname, &rec->size[recInfo_filename]);
1091 /* update delete keys */
1092 xfree (rec->info[recInfo_delKeys]);
1093 if (zh->reg->keys.buf_used > 0 && zh->m_store_keys == 1)
/* the key buffer is handed over to the record, not copied */
1095 rec->size[recInfo_delKeys] = zh->reg->keys.buf_used;
1096 rec->info[recInfo_delKeys] = zh->reg->keys.buf;
1097 zh->reg->keys.buf = NULL;
1098 zh->reg->keys.buf_max = 0;
1102 rec->info[recInfo_delKeys] = NULL;
1103 rec->size[recInfo_delKeys] = 0;
1106 /* update sort keys */
1107 xfree (rec->info[recInfo_sortKeys]);
1109 rec->size[recInfo_sortKeys] = zh->reg->sortKeys.buf_used;
1110 rec->info[recInfo_sortKeys] = zh->reg->sortKeys.buf;
1111 zh->reg->sortKeys.buf = NULL;
1112 zh->reg->sortKeys.buf_max = 0;
1114 /* save file size of original record */
1115 zebraExplain_recordBytesIncrement (zh->reg->zei,
1116 - recordAttr->recordSize);
/* NOTE(review): no fi is declared in the visible part of this function;
   the three fi-based lines are presumably excluded by a preprocessor
   conditional not visible in this listing, leaving buf_size as the record
   size - confirm. */
1118 recordAttr->recordSize = fi->file_moffset - recordOffset;
1119 if (!recordAttr->recordSize)
1120 recordAttr->recordSize = fi->file_max - recordOffset;
1122 recordAttr->recordSize = buf_size;
1124 zebraExplain_recordBytesIncrement (zh->reg->zei,
1125 recordAttr->recordSize);
1127 /* set run-number for this record */
1128 recordAttr->runNumber =
1129 zebraExplain_runNumberIncrement (zh->reg->zei, 0);
1131 /* update store data */
1132 xfree (rec->info[recInfo_storeData]);
1133 if (zh->m_store_data)
/* keep a private copy of the caller's buffer with the record */
1135 rec->size[recInfo_storeData] = recordAttr->recordSize;
1136 rec->info[recInfo_storeData] = (char *)
1137 xmalloc (recordAttr->recordSize);
1138 memcpy (rec->info[recInfo_storeData], buf, recordAttr->recordSize);
1142 rec->info[recInfo_storeData] = NULL;
1143 rec->size[recInfo_storeData] = 0;
1145 /* update database name */
1146 xfree (rec->info[recInfo_databaseName]);
1147 rec->info[recInfo_databaseName] =
1148 rec_strdup (zh->basenames[0], &rec->size[recInfo_databaseName]);
1151 recordAttr->recordOffset = recordOffset;
1153 /* commit this record */
1154 rec_put (zh->reg->records, &rec);
/* Re-index the record rec from the data1 tree n.  handle is the
   ZebraHandle.  The database named in the record is selected (or created
   when selection fails), keys are extracted from the tree with
   grs_extract_tree(), any keys previously stored with the record are
   flushed with cmd 0, the new ones with cmd 1, and the new key and sort-key
   buffers are then handed over to the record.  The return statements are
   not visible in this listing. */
1159 int explain_extract (void *handle, Record rec, data1_node *n)
1161 ZebraHandle zh = (ZebraHandle) handle;
1162 struct recExtractCtrl extractCtrl;
1165 if (zebraExplain_curDatabase (zh->reg->zei,
1166 rec->info[recInfo_databaseName]))
1169 if (zebraExplain_newDatabase (zh->reg->zei,
1170 rec->info[recInfo_databaseName], 0))
/* start with empty key and sort-key buffers */
1174 zh->reg->keys.buf_used = 0;
1175 zh->reg->keys.prevAttrUse = -1;
1176 zh->reg->keys.prevAttrSet = -1;
1177 zh->reg->keys.prevSeqNo = 0;
1178 zh->reg->sortKeys.buf_used = 0;
1180 extractCtrl.init = extract_init;
1181 extractCtrl.tokenAdd = extract_token_add;
1182 extractCtrl.schemaAdd = extract_schema_add;
1183 extractCtrl.dh = zh->reg->dh;
/* unlike the file and buffer paths, every sequence number starts at 0 */
1184 for (i = 0; i<256; i++)
1185 extractCtrl.seqno[i] = 0;
1186 extractCtrl.zebra_maps = zh->reg->zebra_maps;
1187 extractCtrl.flagShowRecords = 0;
1188 extractCtrl.handle = handle;
1191 grs_extract_tree(&extractCtrl, n);
/* keys stored by an earlier run are flushed with cmd 0 first */
1193 if (rec->size[recInfo_delKeys])
1195 struct recKeys delkeys;
1196 struct sortKeys sortkeys;
1198 delkeys.buf_used = rec->size[recInfo_delKeys];
1199 delkeys.buf = rec->info[recInfo_delKeys];
1201 sortkeys.buf_used = rec->size[recInfo_sortKeys];
1202 sortkeys.buf = rec->info[recInfo_sortKeys];
1204 extract_flushSortKeys (zh, rec->sysno, 0, &sortkeys);
1205 extract_flushRecordKeys (zh, rec->sysno, 0, &delkeys);
1207 extract_flushRecordKeys (zh, rec->sysno, 1, &zh->reg->keys);
1208 extract_flushSortKeys (zh, rec->sysno, 1, &zh->reg->sortKeys);
/* hand both buffers over to the record; the register gives up ownership */
1210 xfree (rec->info[recInfo_delKeys]);
1211 rec->size[recInfo_delKeys] = zh->reg->keys.buf_used;
1212 rec->info[recInfo_delKeys] = zh->reg->keys.buf;
1213 zh->reg->keys.buf = NULL;
1214 zh->reg->keys.buf_max = 0;
1216 xfree (rec->info[recInfo_sortKeys]);
1217 rec->size[recInfo_sortKeys] = zh->reg->sortKeys.buf_used;
1218 rec->info[recInfo_sortKeys] = zh->reg->sortKeys.buf;
1219 zh->reg->sortKeys.buf = NULL;
1220 zh->reg->sortKeys.buf_max = 0;
/* Decode the packed keys in reckeys and append them, tagged with cmd, to
   the in-memory key buffer zh->reg->key_buf for record sysno.
   On first use key_buf is allocated with the size given by the "memmax"
   resource in megabytes (default "8").  The buffer is shared by two areas:
   key bytes are appended from the start (key_buf_used counts them) while
   pointers to each key are stored from the end (slot ptr_top - ptr_i).
   When fewer than 1024 bytes separate the two areas the buffer is first
   written out with extract_flushWriteKeys().
   The record count of the explain info is adjusted by +1 for a non-zero cmd
   and by -1 for cmd 0. */
1225 void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno,
1226 int cmd, struct recKeys *reckeys)
1230 unsigned char attrSet = (unsigned char) -1;
1231 unsigned short attrUse = (unsigned short) -1;
1236 ZebraExplainInfo zei = zh->reg->zei;
1238 if (!zh->reg->key_buf)
1240 int mem= 1024*1024* atoi( res_get_def( zh->res, "memmax", "8"));
1243 logf(LOG_WARN, "Invalid memory setting, using default 8 MB");
1246 /* FIXME: That "8" should be in a default settings include */
1247 /* not hard-coded here! -H */
1248 zh->reg->key_buf = (char**) xmalloc (mem);
1249 zh->reg->ptr_top = mem/sizeof(char*);
1251 zh->reg->key_buf_used = 0;
1252 zh->reg->key_file_no = 0;
1254 zebraExplain_recordCountIncrement (zei, cmd ? 1 : -1);
1255 while (off < reckeys->buf_used)
1257 const char *src = reckeys->buf + off;
1266 memcpy (&ch, src, sizeof(ch));
1272 memcpy (&attrSet, src, sizeof(attrSet));
1273 src += sizeof(attrSet);
1277 memcpy (&attrUse, src, sizeof(attrUse));
1278 src += sizeof(attrUse);
/* flush to disk when the key bytes come within 1024 bytes of the pointer
   area */
1281 if (zh->reg->key_buf_used + 1024 >
1282 (zh->reg->ptr_top -zh->reg->ptr_i)*sizeof(char*))
1283 extract_flushWriteKeys (zh);
/* record where this key starts */
1285 (zh->reg->key_buf)[zh->reg->ptr_top - zh->reg->ptr_i] =
1286 (char*)zh->reg->key_buf + zh->reg->key_buf_used;
1289 ch = zebraExplain_lookupSU (zei, attrSet, attrUse);
1291 ch = zebraExplain_addSU (zei, attrSet, attrUse);
1294 zh->reg->key_buf_used +=
1295 key_SU_encode (ch,((char*)zh->reg->key_buf) +
1296 zh->reg->key_buf_used);
1299 ((char*)zh->reg->key_buf) [(zh->reg->key_buf_used)++] = *src++;
/* the key string is followed by a NUL byte and the command byte */
1301 ((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = '\0';
1302 ((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = cmd;
/* compact form: sequence number delta kept in bits 2..5 of the lead byte,
   biased by one */
1305 seqno += ((lead>>2) & 15)-1;
/* long form: the full sequence number is stored in the buffer */
1308 memcpy (&seqno, src, sizeof(seqno));
1309 src += sizeof(seqno);
1313 memcpy ((char*)zh->reg->key_buf + zh->reg->key_buf_used, &key, sizeof(key));
1314 (zh->reg->key_buf_used) += sizeof(key);
1315 off = src - reckeys->buf;
/* the decoder must consume the buffer exactly */
1317 assert (off == reckeys->buf_used);
/* Sort the keys currently held in zh->reg->key_buf and write them out as
   the next numbered section file (name obtained from
   extract_get_fname_tmp()), then mark the key area of the buffer as empty.
   Nothing is done when no buffer exists or no key pointers are pending.
   Consecutive entries whose leading string is equal share one encoder run:
   the string is written once and only the part after it is written for the
   following entries. */
1320 void extract_flushWriteKeys (ZebraHandle zh)
1323 char out_fname[200];
1325 struct encode_info encode_info;
1326 int ptr_i = zh->reg->ptr_i;
1330 if (!zh->reg->key_buf || ptr_i <= 0)
1333 (zh->reg->key_file_no)++;
1334 logf (LOG_LOG, "sorting section %d", (zh->reg->key_file_no));
/* the key pointers occupy the last ptr_i slots of the buffer */
1336 qsort (zh->reg->key_buf + zh->reg->ptr_top - ptr_i, ptr_i,
1337 sizeof(char*), key_qsort_compare);
1338 extract_get_fname_tmp (zh, out_fname, zh->reg->key_file_no);
1340 if (!(outf = fopen (out_fname, "wb")))
1342 logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname);
1345 logf (LOG_LOG, "writing section %d", zh->reg->key_file_no);
1346 prevcp = cp = (zh->reg->key_buf)[zh->reg->ptr_top - ptr_i];
1348 encode_key_init (&encode_info);
1349 encode_key_write (cp, &encode_info, outf);
1353 cp = (zh->reg->key_buf)[zh->reg->ptr_top - ptr_i];
/* a different leading string starts a new encoder run */
1354 if (strcmp (cp, prevcp))
1356 encode_key_flush ( &encode_info, outf);
1357 encode_key_init (&encode_info);
1358 encode_key_write (cp, &encode_info, outf);
/* same leading string: write only what follows it */
1362 encode_key_write (cp + strlen(cp), &encode_info, outf);
1364 encode_key_flush ( &encode_info, outf);
/* NOTE(review): the statements numbered 1366-1399 use bare key_buf, ptr_top
   and key_file_no and a two-argument extract_get_fname_tmp(), unlike the
   zh->reg-> forms above; presumably they are an alternative implementation
   excluded by a preprocessor conditional not visible in this listing. */
1366 qsort (key_buf + ptr_top-ptr_i, ptr_i, sizeof(char*), key_x_compare);
1367 extract_get_fname_tmp (out_fname, key_file_no);
1369 if (!(outf = fopen (out_fname, "wb")))
1371 logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname);
1374 logf (LOG_LOG, "writing section %d", key_file_no);
1376 prevcp = key_buf[ptr_top-i];
1378 if (!--i || strcmp (prevcp, key_buf[ptr_top-i]))
1380 key_y_len = strlen(prevcp)+1;
1382 logf (LOG_LOG, "key_y_len: %2d %02x %02x %s",
1383 key_y_len, prevcp[0], prevcp[1], 2+prevcp);
1385 qsort (key_buf + ptr_top-ptr_i, ptr_i - i,
1386 sizeof(char*), key_y_compare);
1387 cp = key_buf[ptr_top-ptr_i];
1389 encode_key_init (&encode_info);
1390 encode_key_write (cp, &encode_info, outf);
1393 cp = key_buf[ptr_top-ptr_i];
1394 encode_key_write (cp+key_y_len, &encode_info, outf);
1396 encode_key_flush ( &encode_info, outf);
1399 prevcp = key_buf[ptr_top-ptr_i];
1404 logf (LOG_FATAL|LOG_ERRNO, "fclose %s", out_fname);
1407 logf (LOG_LOG, "finished section %d", zh->reg->key_file_no);
1409 zh->reg->key_buf_used = 0;
/* Append one index key for the word described by p (text in string, length
   bytes) to the packed key buffer zh->reg->keys.
   The buffer is enlarged by 128000 bytes whenever fewer than 1024 bytes are
   free.  Each entry depends on the previous one: the attribute set and use
   are compared with prevAttrSet/prevAttrUse, and the sequence number is
   stored as a small delta inside the lead byte when the difference from
   prevSeqNo permits, otherwise in full.  This is the format decoded by
   searchRecordKey() and extract_flushRecordKeys().
   NOTE(review): string is copied with no length check visible in this
   listing; the 1024-byte headroom relies on callers limiting length -
   confirm. */
1412 void extract_add_index_string (RecWord *p, const char *string,
1416 unsigned char attrSet;
1417 unsigned short attrUse;
1420 int *pseqno = &p->seqno;
1421 ZebraHandle zh = p->extractCtrl->handle;
1422 ZebraExplainInfo zei = zh->reg->zei;
1423 struct recKeys *keys = &zh->reg->keys;
1425 if (keys->buf_used+1024 > keys->buf_max)
/* grow: allocate a larger buffer and carry the used part over */
1429 b = (char *) xmalloc (keys->buf_max += 128000);
1430 if (keys->buf_used > 0)
1431 memcpy (b, keys->buf, keys->buf_used);
1435 dst = keys->buf + keys->buf_used;
1437 attrSet = p->attrSet;
1438 if (keys->buf_used > 0 && keys->prevAttrSet == attrSet)
1441 keys->prevAttrSet = attrSet;
1442 attrUse = p->attrUse;
1443 if (keys->buf_used > 0 && keys->prevAttrUse == attrUse)
1446 keys->prevAttrUse = attrUse;
/* delta biased by one; values 1..15 fit in bits 2..5 of the lead byte */
1448 diff = 1 + *pseqno - keys->prevSeqNo;
1449 if (diff >= 1 && diff <= 15)
1450 lead |= (diff << 2);
1454 keys->prevSeqNo = *pseqno;
1461 int ch = zebraExplain_lookupSU (zei, attrSet, attrUse);
1464 ch = zebraExplain_addSU (zei, attrSet, attrUse);
1465 yaz_log (LOG_DEBUG, "addSU set=%d use=%d SU=%d",
1466 attrSet, attrUse, ch);
1469 memcpy (dst, &ch, sizeof(ch));
1475 memcpy (dst, &attrSet, sizeof(attrSet));
1476 dst += sizeof(attrSet);
1480 memcpy (dst, &attrUse, sizeof(attrUse));
1481 dst += sizeof(attrUse);
/* register type byte followed by the word itself */
1484 *dst++ = p->reg_type;
1485 memcpy (dst, string, length);
/* long form: full sequence number */
1491 memcpy (dst, pseqno, sizeof(*pseqno));
1492 dst += sizeof(*pseqno);
1494 keys->buf_used = dst - keys->buf;
/* Append a sort key for the attribute set/use of p to zh->reg->sortKeys.
   The existing entries are scanned first for one with the same set and use
   (presumably the function then returns without adding; that statement is
   not visible in this listing).  Each entry consists of three
   key_SU_encode()d numbers - set, use, length - followed by the string
   bytes.  The buffer grows by 128000 bytes when fewer than IT_MAX_WORD
   bytes are free. */
1497 static void extract_add_sort_string (RecWord *p, const char *string,
1500 ZebraHandle zh = p->extractCtrl->handle;
1501 struct sortKeys *sk = &zh->reg->sortKeys;
1504 while (off < sk->buf_used)
1508 off += key_SU_decode(&set, sk->buf + off);
1509 off += key_SU_decode(&use, sk->buf + off);
1510 off += key_SU_decode(&slen, sk->buf + off);
1512 if (p->attrSet == set && p->attrUse == use)
/* the scan must end exactly at the end of the used area */
1515 assert (off == sk->buf_used);
1517 if (sk->buf_used + IT_MAX_WORD > sk->buf_max)
1521 b = (char *) xmalloc (sk->buf_max += 128000);
1522 if (sk->buf_used > 0)
1523 memcpy (b, sk->buf, sk->buf_used);
1527 off += key_SU_encode(p->attrSet, sk->buf + off);
1528 off += key_SU_encode(p->attrUse, sk->buf + off);
1529 off += key_SU_encode(length, sk->buf + off);
1530 memcpy (sk->buf + off, string, length);
1531 sk->buf_used = off + length;
/* Add the word string (length bytes, must be > 0) for p: as a sort key when
   the register type of p is a sort type according to zebra_maps_is_sort(),
   otherwise as an index key. */
1534 void extract_add_string (RecWord *p, const char *string, int length)
1536 assert (length > 0);
1537 if (zebra_maps_is_sort (p->zebra_maps, p->reg_type))
1538 extract_add_sort_string (p, string, length);
1540 extract_add_index_string (p, string, length);
/* Split the text of p into words and add each word separately.  The text is
   run through zebra_maps_input() for the register type of p; mapped output
   equal to CHR_SPACE separates words, everything else is accumulated into
   buf, limited to IT_MAX_WORD bytes per word, and passed to
   extract_add_string().  Loop framing and sequence-number handling are on
   lines not visible in this listing. */
1543 static void extract_add_incomplete_field (RecWord *p)
1545 const char *b = p->string;
1546 int remain = p->length;
1547 const char **map = 0;
1550 map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
1554 char buf[IT_MAX_WORD+1];
/* skip separators in front of the next word */
1558 while (map && *map && **map == *CHR_SPACE)
1560 remain = p->length - (b - p->string);
1562 map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
/* collect the word up to the next separator */
1569 while (map && *map && **map != *CHR_SPACE)
1571 const char *cp = *map;
1573 while (i < IT_MAX_WORD && *cp)
1575 remain = p->length - (b - p->string);
1577 map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
1583 extract_add_string (p, buf, i);
/* Feed p->string (p->length bytes) through zebra_maps_input() for
 * p->reg_type, gathering the mapped text of the whole field in buf (at
 * most IT_MAX_WORD bytes), then pass buf to extract_add_string().
 * Runs of mapped *CHR_SPACE are skipped, and a single *CHR_SPACE byte is
 * inserted before a following word when buf is non-empty and not full.
 * NOTE(review): any condition guarding the final extract_add_string() call
 * is not visible here -- confirm.
 */
1588 static void extract_add_complete_field (RecWord *p)
1590 const char *b = p->string;
1591 char buf[IT_MAX_WORD+1];
1592 const char **map = 0;
1593 int i = 0, remain = p->length;
1596 map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, remain);
/* Continue while input remains and buf has room. */
1598 while (remain > 0 && i < IT_MAX_WORD)
/* Skip mapped characters whose first byte equals *CHR_SPACE. */
1600 while (map && *map && **map == *CHR_SPACE)
/* b was advanced by zebra_maps_input; recompute the bytes left. */
1602 remain = p->length - (b - p->string);
1604 map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
/* Separate this word from the previous one with one space byte. */
1611 if (i && i < IT_MAX_WORD)
1612 buf[i++] = *CHR_SPACE;
/* Consume mapped characters until the next *CHR_SPACE (or end of input). */
1613 while (map && *map && **map != *CHR_SPACE)
1615 const char *cp = *map;
/* buf is full; presumably the loop is left here (next line not visible). */
1617 if (i >= IT_MAX_WORD)
/* Bounded by IT_MAX_WORD; presumably copies the bytes of *map into buf
 * (the copy statement is not visible here). */
1619 while (i < IT_MAX_WORD && *cp)
1621 remain = p->length - (b - p->string);
1623 map = zebra_maps_input (p->zebra_maps, p->reg_type, &b,
/* Emit the i bytes gathered in buf. */
1631 extract_add_string (p, buf, i);
/* Entry point for adding one token described by p.  The token text is
 * first passed through zebra_replace(); when that returns a buffer,
 * p->string and p->length are redirected to its contents.  The token is
 * then handled as a complete field or split further, depending on
 * zebra_maps_is_complete() for p->reg_type.
 * NOTE(review): the yaz_log call looks like debug output, presumably under
 * a preprocessor guard that is not visible here -- confirm.
 */
1634 void extract_token_add (RecWord *p)
1638 yaz_log (LOG_LOG, "token_add "
1639 "reg_type=%c attrSet=%d attrUse=%d seqno=%d s=%.*s",
1640 p->reg_type, p->attrSet, p->attrUse, p->seqno, p->length,
1643 if ((wrbuf = zebra_replace(p->zebra_maps, p->reg_type, 0,
1644 p->string, p->length)))
1646 p->string = wrbuf_buf(wrbuf);
1647 p->length = wrbuf_len(wrbuf);
1649 if (zebra_maps_is_complete (p->zebra_maps, p->reg_type))
1650 extract_add_complete_field (p);
1652 extract_add_incomplete_field(p);
/* Pass the schema OID `oid` to zebraExplain_addSchema() for the zei object
 * of the ZebraHandle stored in p->handle.
 */
1655 void extract_schema_add (struct recExtractCtrl *p, Odr_oid *oid)
1657 ZebraHandle zh = (ZebraHandle) (p->handle);
1658 zebraExplain_addSchema (zh->reg->zei, oid);
/* Send the sort keys collected in sk to the sort index zh->reg->sortIdx
 * for record `sysno`.  Entries are decoded in the same set/use/length
 * layout that key_SU_encode writes; for each entry the sort index type is
 * set to `use` before the value is added.
 * NOTE(review): the test choosing between adding the stored string and
 * adding "" with length 1 is not visible here; presumably it depends on
 * `cmd` (insert versus delete) -- confirm.
 */
1661 void extract_flushSortKeys (ZebraHandle zh, SYSNO sysno,
1662 int cmd, struct sortKeys *sk)
1664 SortIdx sortIdx = zh->reg->sortIdx;
1667 sortIdx_sysno (sortIdx, sysno);
1669 while (off < sk->buf_used)
1673 off += key_SU_decode(&set, sk->buf + off);
1674 off += key_SU_decode(&use, sk->buf + off);
1675 off += key_SU_decode(&slen, sk->buf + off);
1677 sortIdx_type(sortIdx, use);
1679 sortIdx_add(sortIdx, sk->buf + off, slen);
1681 sortIdx_add(sortIdx, "", 1);
/* NOTE(review): presumably resets the encoder state in *i before the first
 * encode_key_write() call; the body is not visible here -- confirm. */
1686 void encode_key_init (struct encode_info *i)
/* Store integer d at bp in a variable-length form, most significant part
 * first.  The first byte carries an offset of 64, 128 or 192 for the
 * ranges d <= 16383, d <= 4194303 and larger values respectively; the
 * following bytes hold the lower-order 8-bit groups.
 * NOTE(review): the branch for the smallest values, the lines emitting the
 * lowest byte, and the return statement are not visible here; presumably
 * the advanced bp is returned -- confirm.
 * NOTE(review): 192 + (d>>24) no longer fits in one byte once d >= 2^30;
 * confirm callers never pass values that large.
 */
1697 char *encode_key_int (int d, char *bp)
1701 else if (d <= 16383)
1703 *bp++ = 64 + (d>>8);
1706 else if (d <= 4194303)
1708 *bp++ = 128 + (d>>16);
1709 *bp++ = (d>>8) & 255;
1714 *bp++ = 192 + (d>>24);
1715 *bp++ = (d>>16) & 255;
1716 *bp++ = (d>>8) & 255;
1724 /* this is the old encode_key_write
1725 * may be deleted once we are confident that the new one works
/* Older variant: write one key record from k to outf without buffering.
 * k holds a NUL-terminated key string, then one command byte, then a
 * struct it_key (read with memcpy from k+1 after the string is consumed).
 * The sysno is written as a delta against i->sysno, doubled and combined
 * with the command byte; the seqno is written as a delta against i->seqno.
 * NOTE(review): where bp is initialised and what happens after a failed
 * fwrite are not visible here -- confirm.
 */
1728 void encode_key_write (char *k, struct encode_info *i, FILE *outf)
/* Copy the key string, including its terminating NUL. */
1733 while ((*bp++ = *k++))
1735 memcpy (&key, k+1, sizeof(struct it_key));
1736 bp = encode_key_int ( (key.sysno - i->sysno) * 2 + *k, bp);
1737 if (i->sysno != key.sysno)
1739 i->sysno = key.sysno;
1742 else if (!i->seqno && !key.seqno && i->cmd == *k)
1744 bp = encode_key_int (key.seqno - i->seqno, bp);
1745 i->seqno = key.seqno;
1747 if (fwrite (i->buf, bp - i->buf, 1, outf) != 1)
1749 logf (LOG_FATAL|LOG_ERRNO, "fwrite");
/* Flush counterpart of the older, unbuffered encode_key_write variant;
 * marked as a dummy routine because that variant writes each key
 * immediately. */
1754 void encode_key_flush (struct encode_info *i, FILE *outf)
1755 { /* dummy routine */
1760 /* new encode_key_write
1761 * The idea is to buffer one more key, and compare them.
1762 * If we are going to delete and insert the same key,
1763 * we may as well not bother. Should make a difference in
1764 * updates with small modifications (appending to an mbox)
/* Buffering variant: keep one pending (sysno, seqno, cmd) triple in *i and
 * write it to outf only when the next key turns out to be different.
 * k holds an optional NUL-terminated key string, then one command byte,
 * then a struct it_key.  i->prevsys == 0 is used as the "nothing pending"
 * marker.
 * NOTE(review): the third operand of the cancel test, the bookkeeping done
 * in the cancel branch, the prevcmd updates and the handling after a
 * failed fwrite are not visible here -- confirm.
 */
1766 void encode_key_write (char *k, struct encode_info *i, FILE *outf)
1771 if (*k) /* first time for new key */
/* Copy the key string, including its terminating NUL, into i->buf. */
1774 while ((*bp++ = *k++))
/* keylen excludes the terminating NUL. */
1776 i->keylen= bp - i->buf -1;
1777 assert(i->keylen+1+sizeof(struct it_key) < ENCODE_BUFLEN);
/* Same key string as before: position bp at i->buf + i->keylen. */
1781 bp=i->buf + i->keylen;
1786 memcpy (&key, k+1, sizeof(struct it_key));
1787 if (0==i->prevsys) /* no previous filter, fill up */
1789 i->prevsys=key.sysno;
1790 i->prevseq=key.seqno;
1793 else if ( (i->prevsys==key.sysno) &&
1794 (i->prevseq==key.seqno) &&
1796 { /* same numbers, diff cmd, they cancel out */
1800 { /* different stuff, write previous, move buf */
/* Encode the pending entry as deltas against the last written values. */
1801 bp = encode_key_int ( (i->prevsys - i->sysno) * 2 + i->prevcmd, bp);
1802 if (i->sysno != i->prevsys)
1804 i->sysno = i->prevsys;
1807 else if (!i->seqno && !i->prevseq && i->cmd == i->prevcmd)
1809 return; /* ??? Filters some sort of duplicates away */
1810 /* ??? Can this ever happen -H 15oct02 */
1812 bp = encode_key_int (i->prevseq - i->seqno, bp);
1813 i->seqno = i->prevseq;
1814 i->cmd = i->prevcmd;
1815 if (fwrite (i->buf, bp - i->buf, 1, outf) != 1)
1817 logf (LOG_FATAL|LOG_ERRNO, "fwrite");
1820 i->keylen=0; /* ok, it's written, forget it */
/* The key just received becomes the new pending entry. */
1821 i->prevsys=key.sysno;
1822 i->prevseq=key.seqno;
1827 void encode_key_flush (struct encode_info *i, FILE *outf)
1828 { /* flush the last key from i */
1829 char *bp =i->buf + i->keylen;
1832 return; /* nothing to flush */
1835 bp = encode_key_int ( (i->prevsys - i->sysno) * 2 + i->prevcmd, bp);
1836 if (i->sysno != i->prevsys)
1838 i->sysno = i->prevsys;
1841 else if (!i->seqno && !i->prevseq && i->cmd == i->prevcmd)
1843 return; /* ??? Filters some sort of duplicates away */
1844 /* ??? Can this ever happen -H 15oct02 */
1846 bp = encode_key_int (i->prevseq - i->seqno, bp);
1847 i->seqno = i->prevseq;
1848 i->cmd = i->prevcmd;
1849 if (fwrite (i->buf, bp - i->buf, 1, outf) != 1)
1851 logf (LOG_FATAL|LOG_ERRNO, "fwrite");
1854 i->keylen=0; /* ok, it's written, forget it */
1855 i->prevsys=0; /* forget the values too */