1 /* $Id: regxread.c,v 1.52 2004-08-15 17:22:45 adam Exp $
2 Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
29 #include <yaz/tpath.h>
/* Enable use of Tcl objects (Tcl_Obj) when the major version is 8 or newer.
   The macro is given the value 1 so that it is usable in both #ifdef and
   "#if HAVE_TCL_OBJECTS" tests; an empty definition would make the latter
   ill-formed ("#if with no expression").
   NOTE(review): tcl.h names its version macro TCL_MAJOR_VERSION; confirm
   that MAJOR_VERSION is really the macro intended here. */
37 #if MAJOR_VERSION >= 8
38 #define HAVE_TCL_OBJECTS 1
/* Sentinel file position: f_win_advance, tryMatch and lexNode compare
   positions against this value to detect end of input. */
44 #define F_WIN_EOF 2000000000
/* Token / action kind codes. readOneSpec compares the result of
   readParseToken against REGX_CONTEXT; REGX_PATTERN tags pattern actions. */
48 #define REGX_PATTERN 1
53 #define REGX_CONTEXT 6
/* One entry of an action list: either a pattern (a DFA) or a piece of code.
   Entries are chained through `next`. */
63 struct lexRuleAction {
67 struct DFA *dfa; /* REGX_PATTERN */
70 struct regxCode *code; /* REGX_CODE */
/* next action in the same list (NULL-terminated, see actionListDel) */
72 struct lexRuleAction *next;
/* Members of the rule / context / spec structures used throughout this file.
   NOTE(review): the struct headers for these members are not visible here;
   the grouping below follows how the fields are accessed in the code. */
/* actions attached to one rule (accessed as rp->info.actionList) */
77 struct lexRuleAction *actionList;
81 struct lexRuleInfo info;
/* per-context data: linked list of rules and a table indexed by rule number
   (filled in by readFileSpec) */
88 struct lexRule *rules;
89 struct lexRuleInfo **fastRule;
/* action lists run by lexRoot: init and begin before scanning, end after */
93 struct lexRuleAction *beginActionList;
94 struct lexRuleAction *endActionList;
95 struct lexRuleAction *initActionList;
96 struct lexContext *next;
/* head of the list of contexts belonging to a spec */
106 struct lexContext *context;
/* stack of active contexts; context_stack[context_stack_top] is current */
108 struct lexContext **context_stack;
109 int context_stack_size;
110 int context_stack_top;
116 Tcl_Interp *tcl_interp;
/* callback invoked by lexNode with the current position */
119 void (*f_win_ef)(void *, off_t);
121 int f_win_start; /* first byte of buffer is this file offset */
122 int f_win_end; /* last byte of buffer is this offset - 1 */
123 int f_win_size; /* size of buffer */
124 char *f_win_buf; /* buffer itself */
/* read and seek callbacks used by f_win_get to fill the buffer */
125 int (*f_win_rf)(void *, char *, size_t);
126 off_t (*f_win_sf)(void *, off_t);
/* one accumulation buffer per nesting level (see execData) */
128 struct lexConcatBuf *concatBuf;
/* stack of data1 nodes, indexed by d1_level */
130 data1_node **d1_stack;
/* the single spec held by the per-handler client data (struct lexSpecs) */
141 struct lexSpec *spec;
/* Return a pointer to the bytes of the input in [start_pos, end_pos) and
   store the number of bytes available in *size.  Data is served from the
   window buffer spec->f_win_buf, which is refilled through the seek/read
   callbacks when the requested range is not (fully) inside it. */
144 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
147 int i, r, off = start_pos - spec->f_win_start;
/* case 1: whole range already inside the window */
149 if (off >= 0 && end_pos <= spec->f_win_end)
151 *size = end_pos - start_pos;
152 return spec->f_win_buf + off;
/* case 2: start is outside the window - seek and read a fresh window */
154 if (off < 0 || start_pos >= spec->f_win_end)
156 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
157 spec->f_win_start = start_pos;
/* the buffer is allocated lazily on first use */
159 if (!spec->f_win_buf)
160 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
161 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
163 spec->f_win_end = spec->f_win_start + *size;
/* never report more than was asked for */
165 if (*size > end_pos - start_pos)
166 *size = end_pos - start_pos;
167 return spec->f_win_buf;
/* case 3: start is inside the window but the end is not - move the tail
   of the window to the front of the buffer and read more after it */
169 for (i = 0; i<spec->f_win_end - start_pos; i++)
170 spec->f_win_buf[i] = spec->f_win_buf[i + off];
171 r = (*spec->f_win_rf)(spec->f_win_fh,
173 spec->f_win_size - i);
174 spec->f_win_start = start_pos;
175 spec->f_win_end += r;
177 if (*size > end_pos - start_pos)
178 *size = end_pos - start_pos;
179 return spec->f_win_buf;
/* Read the byte at position *pos and advance *pos.  When the position is
   inside the current window the byte is taken directly from the buffer;
   otherwise one byte is requested through f_win_get.  A position equal to
   F_WIN_EOF is treated specially.
   NOTE(review): the value returned at end of input is not visible here. */
182 static int f_win_advance (struct lexSpec *spec, int *pos)
187 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
188 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
189 if (*pos == F_WIN_EOF)
191 buf = f_win_get (spec, *pos, *pos+1, &size);
/* Release the regxCode object referenced by *pp, dropping the reference
   held on its Tcl object. */
201 static void regxCodeDel (struct regxCode **pp)
203 struct regxCode *p = *pp;
208 Tcl_DecrRefCount (p->tcl_obj);
/* Create a regxCode object from the len bytes at buf.  The text is copied
   into a freshly allocated buffer of len+1 bytes, and a Tcl string object
   holding the same text is created with its reference count incremented. */
216 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
220 p = (struct regxCode *) xmalloc (sizeof(*p));
221 p->str = (char *) xmalloc (len+1);
222 memcpy (p->str, buf, len);
225 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
227 Tcl_IncrRefCount (p->tcl_obj);
/* Create a DFA whose parser character map is adjusted for spec files:
   the entries for blank and tab are removed and '/' is added with value 0.
   NOTE(review): presumably this makes blanks ordinary characters and '/'
   the pattern terminator - confirm against the dfa parser. */
232 static struct DFA *lexSpecDFA (void)
237 dfa_parse_cmap_del (dfa, ' ');
238 dfa_parse_cmap_del (dfa, '\t');
239 dfa_parse_cmap_add (dfa, '/', 0);
/* Delete every action in the list starting at *rap: pattern actions have
   their DFA deleted, code actions have their regxCode released. */
243 static void actionListDel (struct lexRuleAction **rap)
245 struct lexRuleAction *ra1, *ra;
/* ra1 holds the successor so the current entry can be released */
247 for (ra = *rap; ra; ra = ra1)
253 dfa_delete (&ra->u.pattern.dfa);
256 regxCodeDel (&ra->u.code);
/* Allocate a new lexical context with the given name (duplicated), a fresh
   DFA from lexSpecDFA and empty begin/end/init action lists. */
264 static struct lexContext *lexContextCreate (const char *name)
266 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
268 p->name = xstrdup (name);
271 p->dfa = lexSpecDFA ();
274 p->beginActionList = NULL;
275 p->endActionList = NULL;
276 p->initActionList = NULL;
/* Destroy a context: delete its DFA, the action list of every rule, and
   the begin/end/init action lists. */
281 static void lexContextDestroy (struct lexContext *p)
283 struct lexRule *rp, *rp1;
285 dfa_delete (&p->dfa);
/* rp1 holds the successor while the current rule is torn down */
287 for (rp = p->rules; rp; rp = rp1)
290 actionListDel (&rp->info.actionList);
293 actionListDel (&p->beginActionList);
294 actionListDel (&p->endActionList);
295 actionListDel (&p->initActionList);
/* Allocate a lexSpec named `name`: copies the name, allocates a context
   stack of 100 entries, and allocates maxLevel empty concat buffers and a
   data1 node stack of maxLevel entries. */
300 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
305 p = (struct lexSpec *) xmalloc (sizeof(*p));
306 p->name = (char *) xmalloc (strlen(name)+1);
307 strcpy (p->name, name);
314 p->context_stack_size = 100;
315 p->context_stack = (struct lexContext **)
316 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
320 p->concatBuf = (struct lexConcatBuf *)
321 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
/* concat buffers start empty; execData grows them on demand */
322 for (i = 0; i < p->maxLevel; i++)
324 p->concatBuf[i].max = 0;
325 p->concatBuf[i].buf = 0;
327 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/* Destroy the spec referenced by *pp: frees the concat buffers, destroys
   every context in the list, deletes the Tcl interpreter and frees the
   window buffer and the context stack. */
332 static void lexSpecDestroy (struct lexSpec **pp)
335 struct lexContext *lt;
343 for (i = 0; i < p->maxLevel; i++)
344 xfree (p->concatBuf[i].buf);
345 xfree (p->concatBuf);
/* fetch the successor before the context is destroyed */
350 struct lexContext *lt_next = lt->next;
351 lexContextDestroy (lt);
356 Tcl_DeleteInterp (p->tcl_interp);
359 xfree (p->f_win_buf);
360 xfree (p->context_stack);
/* Read the next token of a spec line starting at *cpp.  Leading white
   space is skipped.  Command words are collected in lower case into `cmd`
   and compared against the keywords begin, end, body, context and init;
   unknown words and unexpected characters are logged as warnings.
   NOTE(review): the returned token codes are not visible here; callers
   treat 0 as "no more tokens" and compare against the REGX_* constants. */
366 static int readParseToken (const char **cpp, int *len)
368 const char *cp = *cpp;
372 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
/* fold upper case letters to lower case while collecting the word */
401 if (*cp >= 'a' && *cp <= 'z')
403 else if (*cp >= 'A' && *cp <= 'Z')
404 cmd[i] = *cp + 'a' - 'A';
/* guard against overrunning cmd */
407 if (i < (int) sizeof(cmd)-2)
414 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
/* skip to the next white space character */
416 while (*cp && *cp != ' ' && *cp != '\t' &&
417 *cp != '\n' && *cp != '\r')
423 if (!strcmp (cmd, "begin"))
425 else if (!strcmp (cmd, "end"))
427 else if (!strcmp (cmd, "body"))
429 else if (!strcmp (cmd, "context"))
431 else if (!strcmp (cmd, "init"))
435 logf (LOG_WARN, "bad command %s", cmd);
/* Parse the action part of a rule from `s` and build an action list at
   *ap.  Code tokens become code actions (regxCodeMk); pattern tokens get
   their own DFA, which is parsed and compiled; BEGIN and INIT keywords are
   rejected with a warning. */
441 static int actionListMk (struct lexSpec *spec, const char *s,
442 struct lexRuleAction **ap)
/* loop until readParseToken reports no more tokens (0) */
448 while ((tok = readParseToken (&s, &len)))
456 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
458 regxCodeMk (&(*ap)->u.code, s, len);
462 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/* remember whether the pattern was preceded by the body keyword */
464 (*ap)->u.pattern.body = bodyMark;
466 (*ap)->u.pattern.dfa = lexSpecDFA ();
468 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
473 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
477 printf ("pattern: %.*s\n", s-s0, s0);
478 dfa_mkstate ((*ap)->u.pattern.dfa);
482 logf (LOG_WARN, "cannot use BEGIN here");
485 logf (LOG_WARN, "cannot use INIT here");
488 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/* Process one logical line `s` of a spec file.
   - CONTEXT <name>: create a new context and link it at the head of the
     spec's context list.
   - BEGIN / END / INIT: replace the corresponding action list of the
     current context.
   - otherwise: parse a rule pattern into the current context's DFA, create
     a rule record with the next rule number and parse its actions.
   A context called "main" is created via lexContextCreate.
   NOTE(review): the condition guarding that creation is not visible here,
   presumably when no context exists yet - confirm. */
498 int readOneSpec (struct lexSpec *spec, const char *s)
502 struct lexContext *lc;
504 tok = readParseToken (&s, &len);
505 if (tok == REGX_CONTEXT)
507 char context_name[32];
508 tok = readParseToken (&s, &len);
509 if (tok != REGX_CODE)
511 logf (LOG_WARN, "missing name after CONTEXT keyword");
/* NOTE(review): context_name holds 32 bytes; confirm len is bounded
   before this copy. */
516 memcpy (context_name, s, len);
517 context_name[len] = '\0';
518 lc = lexContextCreate (context_name);
519 lc->next = spec->context;
524 spec->context = lexContextCreate ("main");
/* each of the following replaces any previously defined list */
529 actionListDel (&spec->context->beginActionList);
530 actionListMk (spec, s, &spec->context->beginActionList);
533 actionListDel (&spec->context->endActionList);
534 actionListMk (spec, s, &spec->context->endActionList);
537 actionListDel (&spec->context->initActionList);
538 actionListMk (spec, s, &spec->context->initActionList);
542 logf (LOG_LOG, "rule %d %s", spec->context->ruleNo, s);
544 r = dfa_parse (spec->context->dfa, &s);
547 logf (LOG_WARN, "regular expression error. r=%d", r);
552 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* new rule is pushed on the front of the context's rule list */
556 rp = (struct lexRule *) xmalloc (sizeof(*rp));
557 rp->info.no = spec->context->ruleNo++;
558 rp->next = spec->context->rules;
559 spec->context->rules = rp;
560 actionListMk (spec, s, &rp->info.actionList);
/* Locate and read the spec file for `spec`.  The file name is built from
   spec->name with suffix ".tflt" (tried when a Tcl interpreter is attached)
   or ".flt", and opened through data1_path_fopen.  Lines are collected in
   a WRBUF and handed to readOneSpec.  Afterwards, for every context, the
   fastRule table (indexed by rule number) is built and the context DFA is
   compiled with dfa_mkstate. */
565 int readFileSpec (struct lexSpec *spec)
567 struct lexContext *lc;
568 int c, i, errors = 0;
574 if (spec->tcl_interp)
/* NOTE(review): unbounded sprintf - confirm fname is large enough for
   spec->name plus the suffix. */
576 sprintf (fname, "%s.tflt", spec->name);
577 spec_inf = data1_path_fopen (spec->dh, fname, "r");
582 sprintf (fname, "%s.flt", spec->name);
583 spec_inf = data1_path_fopen (spec->dh, fname, "r");
587 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
590 logf (LOG_LOG, "reading regx filter %s", fname);
592 if (spec->tcl_interp)
593 logf (LOG_LOG, "Tcl enabled");
599 debug_dfa_followpos = 0;
603 lineBuf = wrbuf_alloc();
608 wrbuf_rewind (lineBuf);
/* lines starting with '#' or white space are consumed up to end of line */
609 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
611 while (c != '\n' && c != EOF)
624 wrbuf_putc(lineBuf, c);
632 if (c != ' ' && c != '\t')
/* NUL-terminate the collected text before parsing it */
637 wrbuf_putc(lineBuf, '\0');
638 readOneSpec (spec, wrbuf_buf(lineBuf));
639 spec->lineNo += addLine;
643 wrbuf_free(lineBuf, 1);
645 for (lc = spec->context; lc; lc = lc->next)
/* table of rule info pointers, indexed by rule number */
648 lc->fastRule = (struct lexRuleInfo **)
649 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
650 for (i = 0; i < lc->ruleNo; i++)
651 lc->fastRule[i] = NULL;
652 for (rp = lc->rules; rp; rp = rp->next)
653 lc->fastRule[rp->info.no] = &rp->info;
654 dfa_mkstate (lc->dfa);
/* File-scope spec pointer.  NOTE(review): grs_read_regx and grs_read_tcl
   declare a local of the same name that shadows this variable; confirm
   whether this definition is still used. */
663 static struct lexSpec *curLexSpec = NULL;
/* Add elen bytes of text at ebuf to the record being built.
   Attribute handling: the attribute list of a tag node is searched for a
   name equal to attribute_str/attribute_len; if none is found a new
   attribute with that name and the text as value is created, otherwise
   the text is appended to the existing value.
   Data handling: the text is appended to the concat buffer of the current
   level; a new DATA1N_data node is created and linked in unless the node
   on top of the stack is already a data node.
   formatted_text is stored in the new data node. */
666 static void execData (struct lexSpec *spec,
667 const char *ebuf, int elen, int formatted_text,
668 const char *attribute_str, int attribute_len)
670 struct data1_node *res, *parent;
673 if (elen == 0) /* shouldn't happen, but it does! */
/* debug logging of the data, abbreviated for long strings */
677 logf (LOG_LOG, "data(%d bytes) %.40s ... %.*s", elen,
678 ebuf, 40, ebuf + elen-40);
679 else if (elen == 1 && ebuf[0] == '\n')
681 logf (LOG_LOG, "data(new line)");
684 logf (LOG_LOG, "data(%d bytes) %.*s", elen, elen, ebuf);
686 logf (LOG_LOG, "data(%d bytes)", elen);
689 if (spec->d1_level <= 1)
692 parent = spec->d1_stack[spec->d1_level -1];
699 if (res->which != DATA1N_tag)
/* sweep through existing attributes.. */
702 for (ap = &res->u.tag.attributes; *ap; ap = &(*ap)->next)
703 if (strlen((*ap)->name) == attribute_len &&
704 !memcmp((*ap)->name, attribute_str, attribute_len))
708 /* new attribute. Create it with name + value */
709 *ap = nmem_malloc(spec->m, sizeof(**ap));
711 (*ap)->name = nmem_malloc(spec->m, attribute_len+1);
712 memcpy((*ap)->name, attribute_str, attribute_len);
713 (*ap)->name[attribute_len] = '\0';
715 (*ap)->value = nmem_malloc(spec->m, elen+1);
716 memcpy((*ap)->value, ebuf, elen);
717 (*ap)->value[elen] = '\0';
722 /* append to value if attribute already exists */
723 char *nv = nmem_malloc(spec->m, elen + 1 + strlen((*ap)->value));
724 strcpy(nv, (*ap)->value);
725 memcpy (nv + strlen(nv), ebuf, elen);
726 nv[strlen(nv)+elen] = '\0';
/* continue an existing data node: remember how much it already holds */
732 if ((res = spec->d1_stack[spec->d1_level]) &&
733 res->which == DATA1N_data)
734 org_len = res->u.data.len;
/* otherwise start a new text data node below parent */
739 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_data, parent);
740 res->u.data.what = DATA1I_text;
742 res->u.data.formatted_text = formatted_text;
743 res->u.data.data = 0;
/* link the new node after the previous sibling and make it current */
745 if (spec->d1_stack[spec->d1_level])
746 spec->d1_stack[spec->d1_level]->next = res;
747 spec->d1_stack[spec->d1_level] = res;
/* grow the concat buffer (with 256 bytes of slack) when it is too small */
749 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
751 char *old_buf, *new_buf;
753 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
754 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
755 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
757 memcpy (new_buf, old_buf, org_len);
760 spec->concatBuf[spec->d1_level].buf = new_buf;
762 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
763 res->u.data.len += elen;
/* Convenience wrapper: add plain data (no attribute) via execData. */
767 static void execDataP (struct lexSpec *spec,
768 const char *ebuf, int elen, int formatted_text)
770 execData (spec, ebuf, elen, formatted_text, 0, 0);
/* Finalize the text data node on top of the stack, if any: its text is
   copied from the concat buffer of the current level into node storage -
   memory from the spec's NMEM when longer than DATA1_LOCALDATA, else the
   node's own lbuf. */
773 static void tagDataRelease (struct lexSpec *spec)
777 if ((res = spec->d1_stack[spec->d1_level]) &&
778 res->which == DATA1N_data &&
779 res->u.data.what == DATA1I_text)
/* the node must not have been finalized already and must hold data */
781 assert (!res->u.data.data);
782 assert (res->u.data.len > 0);
783 if (res->u.data.len > DATA1_LOCALDATA)
784 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
786 res->u.data.data = res->lbuf;
787 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
/* Begin a variant with the given class, type and value strings (each
   passed as pointer + length).  Class and type are truncated to fit
   DATA1_MAX_SYMBOL, the value to fit DATA1_LOCALDATA.  The variant type is
   looked up in the varset of the record's abstract syntax.  If the parent
   is not already a variant node a variant node is pushed first; then a
   variant node carrying the type and value is created and pushed. */
792 static void variantBegin (struct lexSpec *spec,
793 const char *class_str, int class_len,
794 const char *type_str, int type_len,
795 const char *value_str, int value_len)
/* NOTE(review): this reads d1_stack[d1_level-1] before the d1_level == 0
   check below - confirm it cannot index below the stack. */
797 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
798 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
803 if (spec->d1_level == 0)
805 logf (LOG_WARN, "in variant begin. No record type defined");
/* copy class and type into NUL-terminated local buffers, truncating */
808 if (class_len >= DATA1_MAX_SYMBOL)
809 class_len = DATA1_MAX_SYMBOL-1;
810 memcpy (tclass, class_str, class_len);
811 tclass[class_len] = '\0';
813 if (type_len >= DATA1_MAX_SYMBOL)
814 type_len = DATA1_MAX_SYMBOL-1;
815 memcpy (ttype, type_str, type_len);
816 ttype[type_len] = '\0';
819 logf (LOG_LOG, "variant begin(%s,%s,%d)", tclass, ttype,
824 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
828 if (parent->which != DATA1N_variant)
830 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
831 if (spec->d1_stack[spec->d1_level])
832 tagDataRelease (spec);
833 spec->d1_stack[spec->d1_level] = res;
834 spec->d1_stack[++(spec->d1_level)] = NULL;
/* look among the enclosing variant nodes for one of the same type */
836 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
837 if (spec->d1_stack[i]->u.variant.type == tp)
844 logf (LOG_LOG, "variant node(%d)", spec->d1_level);
846 parent = spec->d1_stack[spec->d1_level-1];
847 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
848 res->u.variant.type = tp;
/* the value is stored in the node's local buffer, truncated if needed */
850 if (value_len >= DATA1_LOCALDATA)
851 value_len =DATA1_LOCALDATA-1;
852 memcpy (res->lbuf, value_str, value_len);
853 res->lbuf[value_len] = '\0';
855 res->u.variant.value = res->lbuf;
/* finalize pending data, then push the new node and open a new level */
857 if (spec->d1_stack[spec->d1_level])
858 tagDataRelease (spec);
859 spec->d1_stack[spec->d1_level] = res;
860 spec->d1_stack[++(spec->d1_level)] = NULL;
/* Strip white space around a tag name given as pointer + length: the
   first loop scans backwards over trailing white space, the second scans
   forwards over leading white space.  *tag and *len are in/out. */
863 static void tagStrip (const char **tag, int *len)
867 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
870 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/* Begin an element named by tag/len.  Warns when no record has been
   started (d1_level == 0).  The name is stripped of surrounding white
   space, pending text data is finalized, a tag node is created below the
   node one level up and a new (empty) level is opened. */
876 static void tagBegin (struct lexSpec *spec,
877 const char *tag, int len)
879 if (spec->d1_level == 0)
881 logf (LOG_WARN, "in element begin. No record type defined");
884 tagStrip (&tag, &len);
885 if (spec->d1_stack[spec->d1_level])
886 tagDataRelease (spec);
889 logf (LOG_LOG, "begin tag(%.*s, %d)", len, tag, spec->d1_level);
892 spec->d1_stack[spec->d1_level] = data1_mk_tag_n (
893 spec->dh, spec->m, tag, len, 0, spec->d1_stack[spec->d1_level -1]);
894 spec->d1_stack[++(spec->d1_level)] = NULL;
/* End elements: while the current level is above min_level, pending data
   is finalized and levels are popped.  The visible checks test for level 0
   and for a tag node whose name equals tag/len (after stripping).
   NOTE(review): the action taken on those checks is not visible here,
   presumably leaving the loop - confirm. */
897 static void tagEnd (struct lexSpec *spec, int min_level,
898 const char *tag, int len)
900 tagStrip (&tag, &len);
901 while (spec->d1_level > min_level)
903 tagDataRelease (spec);
905 if (spec->d1_level == 0)
907 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
909 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
911 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
915 logf (LOG_LOG, "end tag(%d)", spec->d1_level);
/* Try to match `dfa` against the input starting at *pptr.  On a match
   *mptr receives the start of the match and *pptr the position just after
   it.  Accepting states may carry two rule numbers: rule_no is used only
   when the previous character was a newline, rule_nno otherwise.
   NOTE(review): the exact effect of `greedy` and the return values are not
   visible here; callers treat 0 as "no match". */
920 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
921 struct DFA *dfa, int greedy)
923 struct DFA_state *state = dfa->states[0];
926 unsigned char c_prev = 0;
927 int ptr = *pptr; /* current pointer */
928 int start_ptr = *pptr; /* first char of match */
929 int last_ptr = 0; /* last char of match */
930 int last_rule = 0; /* rule number of current match */
937 c = f_win_advance (spec, &ptr);
/* still in the initial state: no match in progress */
941 if (dfa->states[0] == state)
946 c = f_win_advance (spec, &ptr);
948 if (ptr == F_WIN_EOF)
962 if (--i < 0) /* no transition for character c */
966 *mptr = start_ptr; /* match starts here */
967 *pptr = last_ptr; /* match end here (+1) */
/* restart the automaton from its initial state */
970 state = dfa->states[0];
973 c = f_win_advance (spec, &ptr);
979 else if (c >= t->ch[0] && c <= t->ch[1])
981 state = dfa->states[t->to];
982 if (state->rule_no && c_prev == '\n')
984 last_rule = state->rule_no;
987 else if (state->rule_nno)
989 last_rule = state->rule_nno;
/* Fetch the next token of an action code string at *src.  Blanks and tabs
   are skipped.  Recognized forms:
   - $N  : the text of match argument N, fetched through f_win_get
   - "…" : a quoted string
   - newline or ';' : statement separator
   - anything else : a word delimited by white space
   The token text is returned through *tokBuf / *tokLen.
   NOTE(review): the return codes are not visible here; callers compare
   the result against 3 when looking for option tokens. */
1000 static int execTok (struct lexSpec *spec, const char **src,
1001 const char **tokBuf, int *tokLen)
1003 const char *s = *src;
1005 while (*s == ' ' || *s == '\t')
1009 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
/* parse the decimal argument number */
1013 while (*s >= '0' && *s <= '9')
1014 n = n*10 + (*s++ -'0');
1015 if (spec->arg_no == 0)
1022 if (n >= spec->arg_no)
1024 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1028 else if (*s == '\"')
1031 while (*s && *s != '\"')
1033 *tokLen = s - *tokBuf;
1038 else if (*s == '\n' || *s == ';')
1046 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1049 *tokLen = s - *tokBuf;
1056 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1059 *tokLen = s - *tokBuf;
/* Copy len bytes from src into the caller-supplied buffer str.
   NOTE(review): callers pass the result to strcmp, so the copy is
   presumably NUL-terminated and length-limited - confirm. */
1065 static char *regxStrz (const char *src, int len, char *str)
1069 memcpy (str, src, len);
/* Tcl command "begin".  clientData is the lexSpec.  Sub-commands:
   - begin record <absyn>   : create the record root and a tag node with
                              the same name, and push both on the stack
   - begin element <tag>    : tagBegin
   - begin variant <class> <type> <value> : variantBegin
   - begin context <name>   : push the named context on the context stack,
                              or warn when it is unknown */
1075 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1076 int argc, const char **argv)
1078 struct lexSpec *spec = (struct lexSpec *) clientData;
1081 if (!strcmp(argv[1], "record") && argc == 3)
1083 const char *absynName = argv[2];
1087 logf (LOG_LOG, "begin record %s", absynName);
1089 res = data1_mk_root (spec->dh, spec->m, absynName);
1093 spec->d1_stack[spec->d1_level++] = res;
1095 res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);
1097 spec->d1_stack[spec->d1_level++] = res;
/* new level starts without a current node */
1099 spec->d1_stack[spec->d1_level] = NULL;
1101 else if (!strcmp(argv[1], "element") && argc == 3)
1103 tagBegin (spec, argv[2], strlen(argv[2]));
1105 else if (!strcmp (argv[1], "variant") && argc == 5)
1107 variantBegin (spec, argv[2], strlen(argv[2]),
1108 argv[3], strlen(argv[3]),
1109 argv[4], strlen(argv[4]));
1111 else if (!strcmp (argv[1], "context") && argc == 3)
1113 struct lexContext *lc = spec->context;
1115 logf (LOG_LOG, "begin context %s",argv[2]);
/* search the context list by name */
1117 while (lc && strcmp (argv[2], lc->name))
1121 spec->context_stack[++(spec->context_stack_top)] = lc;
1124 logf (LOG_WARN, "unknown context %s", argv[2]);
/* Tcl command "end".  clientData is the lexSpec.  Sub-commands:
   - end record  : finalize data on all open levels and set stop_flag
   - end element [-record] : close element(s) via tagEnd; stop_flag is set
                   in the branch taken when d1_level <= 1
   - end context : pop the context stack unless it is at the bottom */
1131 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1132 int argc, const char **argv)
1134 struct lexSpec *spec = (struct lexSpec *) clientData;
1138 if (!strcmp (argv[1], "record"))
1140 while (spec->d1_level)
1142 tagDataRelease (spec);
1146 logf (LOG_LOG, "end record");
1148 spec->stop_flag = 1;
1150 else if (!strcmp (argv[1], "element"))
1153 const char *element = 0;
1154 if (argc >= 3 && !strcmp(argv[2], "-record"))
1163 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1164 if (spec->d1_level <= 1)
1167 logf (LOG_LOG, "end element end records");
1169 spec->stop_flag = 1;
1172 else if (!strcmp (argv[1], "context"))
1175 logf (LOG_LOG, "end context");
1177 if (spec->context_stack_top)
1178 (spec->context_stack_top)--;
/* Tcl command "data".  Options -text, -element <name> and
   -attribute <name> are recognized before the data argument.  With an
   element name the data is wrapped in tagBegin / tagEnd.  For Tcl newer
   than 8.0 the argument is converted from UTF-8 to the external encoding
   before it is passed to execData. */
1185 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1186 int argc, const char **argv)
1190 const char *element = 0;
1191 const char *attribute = 0;
1192 struct lexSpec *spec = (struct lexSpec *) clientData;
1196 if (!strcmp("-text", argv[argi]))
1201 else if (!strcmp("-element", argv[argi]))
1205 element = argv[argi++];
1207 else if (!strcmp("-attribute", argv[argi]))
1211 attribute = argv[argi++];
1217 tagBegin (spec, element, strlen(element));
1221 #if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0)
/* the converted string lives in ds until Tcl_DStringFree */
1223 char *native = Tcl_UtfToExternalDString(0, argv[argi], -1, &ds);
1224 execData (spec, native, strlen(native), textFlag, attribute,
1225 attribute ? strlen(attribute) : 0);
1226 Tcl_DStringFree (&ds);
1228 execData (spec, argv[argi], strlen(argv[argi]), textFlag, attribute,
1229 attribute ? strlen(attribute) : 0);
1234 tagEnd (spec, 2, NULL, 0);
/* Tcl command "unread": move the read pointer back to the start of match
   argument <no>, plus an optional offset given with -offset.  An argument
   number beyond the last one is clamped to the last argument. */
1238 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1239 int argc, const char **argv)
1241 struct lexSpec *spec = (struct lexSpec *) clientData;
1248 if (!strcmp("-offset", argv[argi]))
1253 offset = atoi(argv[argi]);
1262 no = atoi(argv[argi]);
1263 if (no >= spec->arg_no)
1264 no = spec->arg_no - 1;
1265 spec->ptr = spec->arg_start[no] + offset;
/* Run a code action in the Tcl interpreter.  Each match argument i is
   first exported as the Tcl variable named "i" (decimal); then the code is
   evaluated at global level, as a Tcl object when HAVE_TCL_OBJECTS is set
   and as a string otherwise.  Errors are logged together with errorInfo. */
1269 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1273 for (i = 0; i < spec->arg_no; i++)
1275 char var_name[10], *var_buf;
1278 sprintf (var_name, "%d", i);
1279 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* temporarily NUL-terminate the text inside the window buffer, then
   restore the overwritten byte */
1283 ch = var_buf[var_len];
1284 var_buf[var_len] = '\0';
1285 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1286 var_buf[var_len] = ch;
1289 #if HAVE_TCL_OBJECTS
1290 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1292 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1296 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1297 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1298 spec->tcl_interp->errorLine,
1299 spec->tcl_interp->result,
1300 err ? err : "[NO ERRORINFO]");
/* Interpret a code action written in the built-in (non-Tcl) command
   language.  Tokens are read with execTok; the commands handled are:
   - begin record|element|variant|context
   - end record|element|context
   - data [-text] [-element <name>] [-attribute <name>] <text>...
   - unread [-offset <n>] <index>
   - context <name>  (replaces the context on top of the stack)
   Unknown commands and stray tokens are logged as warnings. */
1306 static void execCode (struct lexSpec *spec, struct regxCode *code)
1308 const char *s = code->str;
1310 const char *cmd_str;
1312 r = execTok (spec, &s, &cmd_str, &cmd_len);
1319 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* p is the command word as a C string */
1322 p = regxStrz (cmd_str, cmd_len, ptmp);
1323 if (!strcmp (p, "begin"))
1325 r = execTok (spec, &s, &cmd_str, &cmd_len);
1328 logf (LOG_WARN, "missing keyword after 'begin'");
1331 p = regxStrz (cmd_str, cmd_len, ptmp);
1332 if (!strcmp (p, "record"))
1334 r = execTok (spec, &s, &cmd_str, &cmd_len);
1337 if (spec->d1_level <= 1)
1339 static char absynName[64];
/* NOTE(review): absynName holds 64 bytes; confirm cmd_len is bounded
   before this copy. */
1344 memcpy (absynName, cmd_str, cmd_len);
1345 absynName[cmd_len] = '\0';
1347 logf (LOG_LOG, "begin record %s", absynName);
/* create the root and a tag node of the same name; push both */
1349 res = data1_mk_root (spec->dh, spec->m, absynName);
1353 spec->d1_stack[spec->d1_level++] = res;
1355 res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);
1357 spec->d1_stack[spec->d1_level++] = res;
1359 spec->d1_stack[spec->d1_level] = NULL;
1361 r = execTok (spec, &s, &cmd_str, &cmd_len);
1363 else if (!strcmp (p, "element"))
1365 r = execTok (spec, &s, &cmd_str, &cmd_len);
1368 tagBegin (spec, cmd_str, cmd_len);
1369 r = execTok (spec, &s, &cmd_str, &cmd_len);
1371 else if (!strcmp (p, "variant"))
1374 const char *class_str = NULL;
1376 const char *type_str = NULL;
1378 const char *value_str = NULL;
/* the three arguments are class, type and value, in that order */
1379 r = execTok (spec, &s, &cmd_str, &cmd_len);
1382 class_str = cmd_str;
1383 class_len = cmd_len;
1384 r = execTok (spec, &s, &cmd_str, &cmd_len);
1390 r = execTok (spec, &s, &cmd_str, &cmd_len);
1393 value_str = cmd_str;
1394 value_len = cmd_len;
1396 variantBegin (spec, class_str, class_len,
1397 type_str, type_len, value_str, value_len);
1400 r = execTok (spec, &s, &cmd_str, &cmd_len);
1402 else if (!strcmp (p, "context"))
1406 struct lexContext *lc = spec->context;
1407 r = execTok (spec, &s, &cmd_str, &cmd_len);
1408 p = regxStrz (cmd_str, cmd_len, ptmp);
1410 logf (LOG_LOG, "begin context %s", p);
/* search the context list by name and push the context found */
1412 while (lc && strcmp (p, lc->name))
1415 spec->context_stack[++(spec->context_stack_top)] = lc;
1417 logf (LOG_WARN, "unknown context %s", p);
1420 r = execTok (spec, &s, &cmd_str, &cmd_len);
1424 logf (LOG_WARN, "bad keyword '%s' after begin", p);
1427 else if (!strcmp (p, "end"))
1429 r = execTok (spec, &s, &cmd_str, &cmd_len);
1432 logf (LOG_WARN, "missing keyword after 'end'");
1435 p = regxStrz (cmd_str, cmd_len, ptmp);
1436 if (!strcmp (p, "record"))
1438 while (spec->d1_level)
1440 tagDataRelease (spec);
1443 r = execTok (spec, &s, &cmd_str, &cmd_len);
1445 logf (LOG_LOG, "end record");
1447 spec->stop_flag = 1;
1449 else if (!strcmp (p, "element"))
/* consume option tokens (execTok result 3) before the element name */
1452 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1454 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1459 tagEnd (spec, min_level, cmd_str, cmd_len);
1460 r = execTok (spec, &s, &cmd_str, &cmd_len);
1463 tagEnd (spec, min_level, NULL, 0);
1464 if (spec->d1_level <= 1)
1467 logf (LOG_LOG, "end element end records");
1469 spec->stop_flag = 1;
1473 else if (!strcmp (p, "context"))
1476 logf (LOG_LOG, "end context");
/* never pop the bottom entry of the context stack */
1478 if (spec->context_stack_top)
1479 (spec->context_stack_top)--;
1480 r = execTok (spec, &s, &cmd_str, &cmd_len);
1483 logf (LOG_WARN, "bad keyword '%s' after end", p);
1485 else if (!strcmp (p, "data"))
1489 const char *element_str = NULL;
1491 const char *attribute_str = NULL;
/* parse the options that precede the data items */
1493 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1495 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1497 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1499 r = execTok (spec, &s, &element_str, &element_len);
1503 else if (cmd_len==10 && !memcmp ("-attribute", cmd_str,
1506 r = execTok (spec, &s, &attribute_str, &attribute_len);
1511 logf (LOG_WARN, "bad data option: %.*s",
1516 logf (LOG_WARN, "missing data item after data");
1520 tagBegin (spec, element_str, element_len);
1523 execData (spec, cmd_str, cmd_len, textFlag,
1524 attribute_str, attribute_len);
1525 r = execTok (spec, &s, &cmd_str, &cmd_len);
1528 tagEnd (spec, 2, NULL, 0);
1530 else if (!strcmp (p, "unread"))
1533 r = execTok (spec, &s, &cmd_str, &cmd_len);
1534 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1536 r = execTok (spec, &s, &cmd_str, &cmd_len);
1539 logf (LOG_WARN, "missing number after -offset");
1542 p = regxStrz (cmd_str, cmd_len, ptmp);
1544 r = execTok (spec, &s, &cmd_str, &cmd_len);
1550 logf (LOG_WARN, "missing index after unread command");
/* the index must be a single decimal digit */
1553 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1555 logf (LOG_WARN, "bad index after unread command");
1560 no = *cmd_str - '0';
/* clamp to the last available argument */
1561 if (no >= spec->arg_no)
1562 no = spec->arg_no - 1;
1563 spec->ptr = spec->arg_start[no] + offset;
1565 r = execTok (spec, &s, &cmd_str, &cmd_len);
1567 else if (!strcmp (p, "context"))
1571 struct lexContext *lc = spec->context;
1572 r = execTok (spec, &s, &cmd_str, &cmd_len);
1573 p = regxStrz (cmd_str, cmd_len, ptmp);
1575 while (lc && strcmp (p, lc->name))
/* unlike "begin context" this overwrites the top entry, no push */
1578 spec->context_stack[spec->context_stack_top] = lc;
1580 logf (LOG_WARN, "unknown context %s", p);
1583 r = execTok (spec, &s, &cmd_str, &cmd_len);
1587 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1588 r = execTok (spec, &s, &cmd_str, &cmd_len);
1593 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1595 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* Execute the action list `ap`.  start_ptr is the start of the text that
   triggered the rule (argument 0) and *pptr the current read position.
   Pattern actions are matched with tryMatch and record the matched ranges
   in arg_start/arg_end; code actions run through execTcl when a Tcl
   interpreter is attached, else through execCode.  Processing checks
   spec->stop_flag after code has run.
   NOTE(review): the return value is not visible here; lexNode tests the
   result of execRule for zero. */
1602 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1603 int start_ptr, int *pptr)
1612 arg_start[0] = start_ptr;
/* expose the local argument arrays to execTok / execTcl */
1614 spec->arg_start = arg_start;
1615 spec->arg_end = arg_end;
/* body patterns: non-greedy match (greedy argument 0) */
1622 if (ap->u.pattern.body)
1624 arg_start[arg_no] = *pptr;
1625 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 0))
1627 arg_end[arg_no] = F_WIN_EOF;
1629 arg_start[arg_no] = F_WIN_EOF;
1630 arg_end[arg_no] = F_WIN_EOF;
1631 yaz_log(LOG_DEBUG, "Pattern match rest of record");
1636 arg_end[arg_no] = sptr;
1638 arg_start[arg_no] = sptr;
1639 arg_end[arg_no] = *pptr;
/* ordinary patterns: greedy match (greedy argument 1) */
1644 arg_start[arg_no] = *pptr;
1645 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 1))
1647 if (sptr != arg_start[arg_no])
1649 arg_end[arg_no] = *pptr;
1654 spec->arg_no = arg_no;
1657 if (spec->tcl_interp)
1658 execTcl(spec, ap->u.code);
1660 execCode (spec, ap->u.code);
1662 execCode (spec, ap->u.code);
1665 if (spec->stop_flag)
1669 arg_start[arg_no] = *pptr;
1670 arg_end[arg_no] = F_WIN_EOF;
/* Execute the actions of rule number ruleNo of `context`, looked up
   through the context's fastRule table. */
1679 static int execRule (struct lexSpec *spec, struct lexContext *context,
1680 int ruleNo, int start_ptr, int *pptr)
1683 logf (LOG_LOG, "exec rule %d", ruleNo);
1685 return execAction (spec, context->fastRule[ruleNo]->actionList,
/* Main scanning loop.  Characters are read from *ptr and fed to the DFA of
   the current context (top of the context stack).  When a rule has
   matched, the unmatched text before it (skip_ptr .. start_ptr) is emitted
   as data via execDataP and the rule is executed via execRule.  After a
   rule has run, the end-of-record callback f_win_ef is called (when set
   and not at end of input) and the context is re-read from the stack,
   since actions may have changed it. */
1689 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1691 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1692 struct DFA_state *state = context->dfa->states[0];
/* scanning starts as if the previous character were a newline */
1695 unsigned char c_prev = '\n';
1697 int last_rule = 0; /* rule number of current match */
1698 int last_ptr = *ptr; /* last char of match */
1699 int start_ptr = *ptr; /* first char of match */
1700 int skip_ptr = *ptr; /* first char of run */
1704 c = f_win_advance (spec, ptr);
1705 if (*ptr == F_WIN_EOF)
1707 /* end of file met */
1710 /* there was a match */
1711 if (skip_ptr < start_ptr)
1713 /* deal with chars that didn't match */
1716 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1717 execDataP (spec, buf, size, 0);
1719 /* restore pointer */
1722 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1724 /* restore skip pointer */
1728 else if (skip_ptr < *ptr)
1730 /* deal with chars that didn't match */
1733 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1734 execDataP (spec, buf, size, 0);
1736 if (*ptr == F_WIN_EOF)
1743 { /* no transition for character c ... */
1746 if (skip_ptr < start_ptr)
1748 /* deal with chars that didn't match */
1751 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1752 execDataP (spec, buf, size, 0);
1754 /* restore pointer */
1756 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1758 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1761 logf (LOG_LOG, "regx: endf ptr=%d", *ptr);
1763 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* actions may have switched context: reload it */
1767 context = spec->context_stack[spec->context_stack_top];
1770 last_ptr = start_ptr = *ptr;
1774 c_prev = f_win_advance (spec, &start_ptr);
1779 c_prev = f_win_advance (spec, &start_ptr);
/* restart the automaton from its initial state */
1782 state = context->dfa->states[0];
1785 else if (c >= t->ch[0] && c <= t->ch[1])
1786 { /* transition ... */
1787 state = context->dfa->states[t->to];
1792 last_rule = state->rule_no;
1795 else if (state->rule_nno)
1797 last_rule = state->rule_nno;
/* Parse one record.  Resets the stop flag and the context stack, selects
   the context named context_name (warning when it cannot be found), runs
   the context's init and begin action lists, scans the input with
   lexNode, finalizes data on all open levels, runs the end action list
   and returns the node at the bottom of the data1 stack. */
1809 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1810 const char *context_name)
1812 struct lexContext *lt = spec->context;
1815 spec->stop_flag = 0;
1817 spec->context_stack_top = 0;
1820 if (!strcmp (lt->name, context_name))
1826 logf (LOG_WARN, "cannot find context %s", context_name);
1829 spec->context_stack[spec->context_stack_top] = lt;
1830 spec->d1_stack[spec->d1_level] = NULL;
1835 execAction (spec, lt->initActionList, ptr, &ptr);
1838 execAction (spec, lt->beginActionList, ptr, &ptr);
1839 lexNode (spec, &ptr);
/* close whatever is still open */
1840 while (spec->d1_level)
1842 tagDataRelease (spec);
1845 execAction (spec, lt->endActionList, ptr, &ptr);
1846 return spec->d1_stack[0];
/* Handler tear-down: clientData is the struct lexSpecs created by
   grs_init; the spec it holds is destroyed. */
1849 void grs_destroy(void *clientData)
1851 struct lexSpecs *specs = (struct lexSpecs *) clientData;
1854 lexSpecDestroy(&specs->spec);
/* Handler set-up: allocates the struct lexSpecs that is later passed
   around as client data. */
1859 void *grs_init(void)
1861 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/* Record reader entry point for the regx filter.  The spec is cached in
   the client data and (re)loaded from its spec file only when none is
   loaded or the filter name p->type differs from the cached one.  The
   input window is then reset and wired to the read/seek/end callbacks of
   `p`, and the record is parsed starting in context "main". */
1866 data1_node *grs_read_regx (struct grs_read_info *p)
1869 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
/* local name; refers to the spec slot in the client data */
1870 struct lexSpec **curLexSpec = &specs->spec;
1873 logf (LOG_LOG, "grs_read_regx");
1875 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
1878 lexSpecDestroy (curLexSpec);
1879 *curLexSpec = lexSpecCreate (p->type, p->dh);
1880 res = readFileSpec (*curLexSpec);
/* spec is discarded again when reading it failed */
1883 lexSpecDestroy (curLexSpec);
1887 (*curLexSpec)->dh = p->dh;
/* start with an empty window over the new input */
1890 (*curLexSpec)->f_win_start = 0;
1891 (*curLexSpec)->f_win_end = 0;
1892 (*curLexSpec)->f_win_rf = p->readf;
1893 (*curLexSpec)->f_win_sf = p->seekf;
1894 (*curLexSpec)->f_win_fh = p->fh;
1895 (*curLexSpec)->f_win_ef = p->endf;
1896 (*curLexSpec)->f_win_size = 500000;
1898 (*curLexSpec)->m = p->mem;
1899 return lexRoot (*curLexSpec, p->offset, "main");
/* Record type descriptor for the regx filter (exported below through
   recTypeGrs_regx). */
1902 static struct recTypeGrs regx_type = {
/* Exported handle for the regx record type; points at the regx_type
   descriptor defined above. */
1909 RecTypeGrs recTypeGrs_regx = &regx_type;
/* Record reader entry point for the Tcl-enabled filter.  Works like
   grs_read_regx, but when a spec is (re)loaded a Tcl interpreter is
   created first and the commands begin, end, data and unread are
   registered in it, so that the spec file is read with Tcl enabled. */
1912 data1_node *grs_read_tcl (struct grs_read_info *p)
1915 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
1916 struct lexSpec **curLexSpec = &specs->spec;
1919 logf (LOG_LOG, "grs_read_tcl");
1921 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
1923 Tcl_Interp *tcl_interp;
1925 lexSpecDestroy (curLexSpec);
1926 *curLexSpec = lexSpecCreate (p->type, p->dh);
1927 Tcl_FindExecutable("");
1928 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
1929 Tcl_Init(tcl_interp);
/* every command receives the spec as its client data */
1930 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
1931 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
1932 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
1933 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
1935 res = readFileSpec (*curLexSpec);
/* spec is discarded again when reading it failed */
1938 lexSpecDestroy (curLexSpec);
1942 (*curLexSpec)->dh = p->dh;
/* start with an empty window over the new input */
1945 (*curLexSpec)->f_win_start = 0;
1946 (*curLexSpec)->f_win_end = 0;
1947 (*curLexSpec)->f_win_rf = p->readf;
1948 (*curLexSpec)->f_win_sf = p->seekf;
1949 (*curLexSpec)->f_win_fh = p->fh;
1950 (*curLexSpec)->f_win_ef = p->endf;
1951 (*curLexSpec)->f_win_size = 500000;
1953 (*curLexSpec)->m = p->mem;
1954 return lexRoot (*curLexSpec, p->offset, "main");
/* Record type descriptor for the Tcl-enabled filter. */
1957 static struct recTypeGrs tcl_type = {
/* Exported handle for the Tcl record type. */
1964 RecTypeGrs recTypeGrs_tcl = &tcl_type;