2 * Copyright (C) 1994-2002, Index Data
5 * $Id: regxread.c,v 1.43 2002-07-05 12:43:30 adam Exp $
12 #include <yaz/tpath.h>
/* Enables the Tcl object API code paths when the Tcl major version is >= 8.
 * NOTE(review): the version test in cmd_tcl_data further down spells the
 * macro TCL_MAJOR_VERSION; no visible line defines MAJOR_VERSION, so this
 * condition may always evaluate to false -- confirm.
 * NOTE(review): HAVE_TCL_OBJECTS is defined without a value here, but execTcl
 * tests it with "#if HAVE_TCL_OBJECTS", which needs an expression -- confirm
 * whether it should be defined as 1. */
20 #if MAJOR_VERSION >= 8
21 #define HAVE_TCL_OBJECTS
/* Sentinel file position meaning "end of file met": f_win_advance, tryMatch
 * and lexNode compare the current position against it, and execAction stores
 * it in arg_start/arg_end slots. */
27 #define F_WIN_EOF 2000000000
31 #define REGX_PATTERN 1
36 #define REGX_CONTEXT 6
46 struct lexRuleAction {
50 struct DFA *dfa; /* REGX_PATTERN */
53 struct regxCode *code; /* REGX_CODE */
55 struct lexRuleAction *next;
60 struct lexRuleAction *actionList;
64 struct lexRuleInfo info;
71 struct lexRule *rules;
72 struct lexRuleInfo **fastRule;
76 struct lexRuleAction *beginActionList;
77 struct lexRuleAction *endActionList;
78 struct lexRuleAction *initActionList;
79 struct lexContext *next;
89 struct lexContext *context;
91 struct lexContext **context_stack;
92 int context_stack_size;
93 int context_stack_top;
99 Tcl_Interp *tcl_interp;
102 void (*f_win_ef)(void *, off_t);
104 int f_win_start; /* first byte of buffer is this file offset */
105 int f_win_end; /* last byte of buffer is this offset - 1 */
106 int f_win_size; /* size of buffer */
107 char *f_win_buf; /* buffer itself */
108 int (*f_win_rf)(void *, char *, size_t);
109 off_t (*f_win_sf)(void *, off_t);
111 struct lexConcatBuf *concatBuf;
113 data1_node **d1_stack;
124 struct lexSpec *spec;
/* Return a pointer into the read window (spec->f_win_buf) that holds the file
 * range [start_pos, end_pos).  *size receives the number of bytes available
 * at the returned pointer, never more than end_pos - start_pos.  The window
 * is refilled through the f_win_sf / f_win_rf callbacks when the range is not
 * already buffered.
 * NOTE(review): start_pos/end_pos are off_t but off, f_win_start and
 * f_win_end are int, so large offsets would be truncated -- confirm the
 * supported file size. */
127 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
130 int i, r, off = start_pos - spec->f_win_start;
/* Case 1: the whole range is already inside the current window. */
132 if (off >= 0 && end_pos <= spec->f_win_end)
134 *size = end_pos - start_pos;
135 return spec->f_win_buf + off;
/* Case 2: the range starts outside the window; reposition via f_win_sf and
 * read a fresh window that begins at start_pos. */
137 if (off < 0 || start_pos >= spec->f_win_end)
139 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
140 spec->f_win_start = start_pos;
/* The window buffer is allocated on first use. */
142 if (!spec->f_win_buf)
143 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
144 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
146 spec->f_win_end = spec->f_win_start + *size;
/* Clamp to the length the caller asked for. */
148 if (*size > end_pos - start_pos)
149 *size = end_pos - start_pos;
150 return spec->f_win_buf;
/* Case 3: the range starts inside the window but runs past its end; move the
 * bytes from start_pos onward to the front of the buffer, then read more
 * (at most f_win_size - i bytes) through f_win_rf. */
152 for (i = 0; i<spec->f_win_end - start_pos; i++)
153 spec->f_win_buf[i] = spec->f_win_buf[i + off];
154 r = (*spec->f_win_rf)(spec->f_win_fh,
156 spec->f_win_size - i);
157 spec->f_win_start = start_pos;
158 spec->f_win_end += r;
160 if (*size > end_pos - start_pos)
161 *size = end_pos - start_pos;
162 return spec->f_win_buf;
/* Return the byte at file position *pos and advance *pos by one.  A position
 * inside the current window is served directly from f_win_buf; otherwise a
 * one-byte range is requested through f_win_get.
 * NOTE(review): f_win_buf is plain char, so where char is signed the direct
 * path returns negative values for bytes >= 0x80 -- confirm callers expect
 * that. */
165 static int f_win_advance (struct lexSpec *spec, int *pos)
170 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
171 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
/* A position already at the end-of-file sentinel is tested separately. */
172 if (*pos == F_WIN_EOF)
174 buf = f_win_get (spec, *pos, *pos+1, &size);
184 static void regxCodeDel (struct regxCode **pp)
186 struct regxCode *p = *pp;
191 Tcl_DecrRefCount (p->tcl_obj);
199 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
203 p = (struct regxCode *) xmalloc (sizeof(*p));
204 p->str = (char *) xmalloc (len+1);
205 memcpy (p->str, buf, len);
208 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
210 Tcl_IncrRefCount (p->tcl_obj);
215 static struct DFA *lexSpecDFA (void)
220 dfa_parse_cmap_del (dfa, ' ');
221 dfa_parse_cmap_del (dfa, '\t');
222 dfa_parse_cmap_add (dfa, '/', 0);
226 static void actionListDel (struct lexRuleAction **rap)
228 struct lexRuleAction *ra1, *ra;
230 for (ra = *rap; ra; ra = ra1)
236 dfa_delete (&ra->u.pattern.dfa);
239 regxCodeDel (&ra->u.code);
247 static struct lexContext *lexContextCreate (const char *name)
249 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
251 p->name = xstrdup (name);
254 p->dfa = lexSpecDFA ();
257 p->beginActionList = NULL;
258 p->endActionList = NULL;
259 p->initActionList = NULL;
264 static void lexContextDestroy (struct lexContext *p)
266 struct lexRule *rp, *rp1;
268 dfa_delete (&p->dfa);
270 for (rp = p->rules; rp; rp = rp1)
273 actionListDel (&rp->info.actionList);
276 actionListDel (&p->beginActionList);
277 actionListDel (&p->endActionList);
278 actionListDel (&p->initActionList);
283 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
288 p = (struct lexSpec *) xmalloc (sizeof(*p));
289 p->name = (char *) xmalloc (strlen(name)+1);
290 strcpy (p->name, name);
297 p->context_stack_size = 100;
298 p->context_stack = (struct lexContext **)
299 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
303 p->concatBuf = (struct lexConcatBuf *)
304 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
305 for (i = 0; i < p->maxLevel; i++)
307 p->concatBuf[i].max = 0;
308 p->concatBuf[i].buf = 0;
310 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
315 static void lexSpecDestroy (struct lexSpec **pp)
318 struct lexContext *lt;
326 for (i = 0; i < p->maxLevel; i++)
327 xfree (p->concatBuf[i].buf);
328 xfree (p->concatBuf);
333 struct lexContext *lt_next = lt->next;
334 lexContextDestroy (lt);
339 Tcl_DeleteInterp (p->tcl_interp);
342 xfree (p->f_win_buf);
343 xfree (p->context_stack);
349 static int readParseToken (const char **cpp, int *len)
351 const char *cp = *cpp;
355 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
384 if (*cp >= 'a' && *cp <= 'z')
386 else if (*cp >= 'A' && *cp <= 'Z')
387 cmd[i] = *cp + 'a' - 'A';
390 if (i < (int) sizeof(cmd)-2)
397 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
399 while (*cp && *cp != ' ' && *cp != '\t' &&
400 *cp != '\n' && *cp != '\r')
406 if (!strcmp (cmd, "begin"))
408 else if (!strcmp (cmd, "end"))
410 else if (!strcmp (cmd, "body"))
412 else if (!strcmp (cmd, "context"))
414 else if (!strcmp (cmd, "init"))
418 logf (LOG_WARN, "bad command %s", cmd);
424 static int actionListMk (struct lexSpec *spec, const char *s,
425 struct lexRuleAction **ap)
431 while ((tok = readParseToken (&s, &len)))
439 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
441 regxCodeMk (&(*ap)->u.code, s, len);
445 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
447 (*ap)->u.pattern.body = bodyMark;
449 (*ap)->u.pattern.dfa = lexSpecDFA ();
451 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
456 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
459 dfa_mkstate ((*ap)->u.pattern.dfa);
463 logf (LOG_WARN, "cannot use BEGIN here");
466 logf (LOG_WARN, "cannot use INIT here");
469 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/* Parse one logical line (s) of a filter specification and record it in spec.
 * Visible behaviour: a CONTEXT token creates a new lexContext named by the
 * following token; begin/end/init action lists of the current context are
 * rebuilt from s; otherwise s is parsed as a pattern into the current
 * context's DFA and a new rule with its own action list is added. */
479 int readOneSpec (struct lexSpec *spec, const char *s)
483 struct lexContext *lc;
485 tok = readParseToken (&s, &len);
486 if (tok == REGX_CONTEXT)
488 char context_name[32];
489 tok = readParseToken (&s, &len);
490 if (tok != REGX_CODE)
492 logf (LOG_WARN, "missing name after CONTEXT keyword");
/* NOTE(review): len comes from readParseToken and no visible line bounds it
 * against sizeof(context_name); a context name of 32 or more characters
 * would overflow the buffer -- confirm a check exists or add one. */
497 memcpy (context_name, s, len);
498 context_name[len] = '\0';
499 lc = lexContextCreate (context_name);
500 lc->next = spec->context;
/* A context called "main" is created here; presumably this is the default
 * when none exists yet (the guarding condition is not visible). */
505 spec->context = lexContextCreate ("main");
/* For each kind of action list the previous list is deleted before the new
 * one is built, so a later directive of the same kind replaces the earlier. */
510 actionListDel (&spec->context->beginActionList);
511 actionListMk (spec, s, &spec->context->beginActionList);
514 actionListDel (&spec->context->endActionList);
515 actionListMk (spec, s, &spec->context->endActionList);
518 actionListDel (&spec->context->initActionList);
519 actionListMk (spec, s, &spec->context->initActionList);
523 logf (LOG_LOG, "rule %d %s", spec->context->ruleNo, s);
/* Pattern line: add the regular expression to the context's shared DFA. */
525 r = dfa_parse (spec->context->dfa, &s);
528 logf (LOG_WARN, "regular expression error. r=%d", r);
533 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* The new rule takes the next sequential number and is pushed on the front
 * of the context's rule list. */
537 rp = (struct lexRule *) xmalloc (sizeof(*rp));
538 rp->info.no = spec->context->ruleNo++;
539 rp->next = spec->context->rules;
540 spec->context->rules = rp;
541 actionListMk (spec, s, &rp->info.actionList);
546 int readFileSpec (struct lexSpec *spec)
548 struct lexContext *lc;
549 int c, i, errors = 0;
555 if (spec->tcl_interp)
557 sprintf (fname, "%s.tflt", spec->name);
558 spec_inf = data1_path_fopen (spec->dh, fname, "r");
563 sprintf (fname, "%s.flt", spec->name);
564 spec_inf = data1_path_fopen (spec->dh, fname, "r");
568 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
571 logf (LOG_LOG, "reading regx filter %s", fname);
573 if (spec->tcl_interp)
574 logf (LOG_LOG, "Tcl enabled");
576 lineBuf = wrbuf_alloc();
581 wrbuf_rewind (lineBuf);
582 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
584 while (c != '\n' && c != EOF)
597 wrbuf_putc(lineBuf, c);
605 if (c != ' ' && c != '\t')
610 wrbuf_putc(lineBuf, '\0');
611 readOneSpec (spec, wrbuf_buf(lineBuf));
612 spec->lineNo += addLine;
616 wrbuf_free(lineBuf, 1);
621 debug_dfa_followpos = 1;
624 for (lc = spec->context; lc; lc = lc->next)
627 lc->fastRule = (struct lexRuleInfo **)
628 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
629 for (i = 0; i < lc->ruleNo; i++)
630 lc->fastRule[i] = NULL;
631 for (rp = lc->rules; rp; rp = rp->next)
632 lc->fastRule[rp->info.no] = &rp->info;
633 dfa_mkstate (lc->dfa);
/* File-scope pointer to a lexSpec.
 * NOTE(review): grs_read_regx and grs_read_tcl each declare a local
 * "struct lexSpec **curLexSpec" that shadows this variable, and no visible
 * line reads or writes this one -- confirm whether it is still needed. */
642 static struct lexSpec *curLexSpec = NULL;
645 static void execData (struct lexSpec *spec,
646 const char *ebuf, int elen, int formatted_text)
648 struct data1_node *res, *parent;
651 if (elen == 0) /* shouldn't happen, but it does! */
655 logf (LOG_LOG, "data (%d bytes) %.15s ... %.*s", elen,
656 ebuf, 15, ebuf + elen-15);
658 logf (LOG_LOG, "data (%d bytes) %.*s", elen, elen, ebuf);
660 logf (LOG_LOG, "data (%d bytes)", elen);
663 if (spec->d1_level <= 1)
666 parent = spec->d1_stack[spec->d1_level -1];
669 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
670 org_len = res->u.data.len;
675 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_data, parent);
676 res->u.data.what = DATA1I_text;
678 res->u.data.formatted_text = formatted_text;
679 res->u.data.data = 0;
681 if (spec->d1_stack[spec->d1_level])
682 spec->d1_stack[spec->d1_level]->next = res;
683 spec->d1_stack[spec->d1_level] = res;
685 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
687 char *old_buf, *new_buf;
689 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
690 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
691 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
693 memcpy (new_buf, old_buf, org_len);
696 spec->concatBuf[spec->d1_level].buf = new_buf;
698 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
699 res->u.data.len += elen;
702 static void execDataP (struct lexSpec *spec,
703 const char *ebuf, int elen, int formatted_text)
705 execData (spec, ebuf, elen, formatted_text);
708 static void tagDataRelease (struct lexSpec *spec)
712 if ((res = spec->d1_stack[spec->d1_level]) &&
713 res->which == DATA1N_data &&
714 res->u.data.what == DATA1I_text)
716 assert (!res->u.data.data);
717 assert (res->u.data.len > 0);
718 if (res->u.data.len > DATA1_LOCALDATA)
719 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
721 res->u.data.data = res->lbuf;
722 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
/* Open a variant node on the data1 node stack.
 * class_str/type_str/value_str are passed with explicit lengths; class and
 * type are copied into local buffers truncated to DATA1_MAX_SYMBOL-1
 * characters, and the value is copied into the new node's lbuf truncated to
 * DATA1_LOCALDATA-1 characters.  The new node is pushed on spec->d1_stack and
 * the slot above it is cleared. */
727 static void variantBegin (struct lexSpec *spec,
728 const char *class_str, int class_len,
729 const char *type_str, int type_len,
730 const char *value_str, int value_len)
/* NOTE(review): parent is read from d1_stack[d1_level - 1] before the
 * d1_level == 0 check below, so with an empty stack this indexes
 * d1_stack[-1] -- confirm and move the read after the check. */
732 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
733 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
738 if (spec->d1_level == 0)
740 logf (LOG_WARN, "in variant begin. No record type defined");
/* Make NUL-terminated, length-limited copies of class and type. */
743 if (class_len >= DATA1_MAX_SYMBOL)
744 class_len = DATA1_MAX_SYMBOL-1;
745 memcpy (tclass, class_str, class_len);
746 tclass[class_len] = '\0';
748 if (type_len >= DATA1_MAX_SYMBOL)
749 type_len = DATA1_MAX_SYMBOL-1;
750 memcpy (ttype, type_str, type_len);
751 ttype[type_len] = '\0';
754 logf (LOG_LOG, "variant begin %s %s (%d)", tclass, ttype,
759 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
763 if (parent->which != DATA1N_variant)
/* Parent is not a variant: push an additional variant node first. */
765 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
766 if (spec->d1_stack[spec->d1_level])
767 tagDataRelease (spec);
768 spec->d1_stack[spec->d1_level] = res;
769 spec->d1_stack[++(spec->d1_level)] = NULL;
/* Walk down the stack over consecutive variant nodes looking for one of the
 * same type.
 * NOTE(review): no lower bound on i is visible in the loop condition --
 * confirm the stack always holds a non-variant node below. */
771 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
772 if (spec->d1_stack[i]->u.variant.type == tp)
779 logf (LOG_LOG, "variant node (%d)", spec->d1_level);
781 parent = spec->d1_stack[spec->d1_level-1];
782 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
783 res->u.variant.type = tp;
/* The variant value is stored in the node's own lbuf. */
785 if (value_len >= DATA1_LOCALDATA)
786 value_len =DATA1_LOCALDATA-1;
787 memcpy (res->lbuf, value_str, value_len);
788 res->lbuf[value_len] = '\0';
790 res->u.variant.value = res->lbuf;
/* Release any node already occupying this level, then push the new variant
 * and clear the slot above it. */
792 if (spec->d1_stack[spec->d1_level])
793 tagDataRelease (spec);
794 spec->d1_stack[spec->d1_level] = res;
795 spec->d1_stack[++(spec->d1_level)] = NULL;
/* Scan the string (*tag, *len) for trailing whitespace, then for leading
 * whitespace; presumably *tag and *len are adjusted to exclude it (the
 * updating statements are not visible here -- confirm).
 * NOTE(review): isspace is called with a plain char; a negative char value
 * is undefined behaviour for the <ctype.h> functions -- cast to
 * unsigned char. */
798 static void tagStrip (const char **tag, int *len)
802 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
805 for (i = 0; i < *len && isspace((*tag)[i]); i++)
811 static void tagBegin (struct lexSpec *spec,
812 const char *tag, int len)
814 struct data1_node *parent;
815 data1_element *elem = NULL;
818 data1_element *e = NULL;
821 if (spec->d1_level == 0)
823 logf (LOG_WARN, "in element begin. No record type defined");
826 tagStrip (&tag, &len);
828 parent = spec->d1_stack[spec->d1_level -1];
829 partag = get_parent_tag(spec->dh, parent);
831 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_tag, parent);
833 if (len >= DATA1_LOCALDATA)
834 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
836 res->u.tag.tag = res->lbuf;
838 memcpy (res->u.tag.tag, tag, len);
839 res->u.tag.tag[len] = '\0';
842 logf (LOG_LOG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
844 if (parent->which == DATA1N_variant)
847 if (!(e = partag->u.tag.element))
850 elem = data1_getelementbytagname (spec->dh,
851 spec->d1_stack[0]->u.root.absyn,
853 res->u.tag.element = elem;
855 if (spec->d1_stack[spec->d1_level])
856 tagDataRelease (spec);
857 spec->d1_stack[spec->d1_level] = res;
858 spec->d1_stack[++(spec->d1_level)] = NULL;
861 static void tagEnd (struct lexSpec *spec, int min_level,
862 const char *tag, int len)
864 tagStrip (&tag, &len);
865 while (spec->d1_level > min_level)
867 tagDataRelease (spec);
869 if (spec->d1_level == 0)
871 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
873 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
875 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
879 logf (LOG_LOG, "end tag (%d)", spec->d1_level);
884 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
887 struct DFA_state *state = dfa->states[0];
890 unsigned char c_prev = 0;
891 int ptr = *pptr; /* current pointer */
892 int start_ptr = *pptr; /* first char of match */
893 int last_ptr = 0; /* last char of match */
894 int last_rule = 0; /* rule number of current match */
899 c = f_win_advance (spec, &ptr);
900 if (ptr == F_WIN_EOF)
917 *mptr = start_ptr; /* match starts here */
918 *pptr = last_ptr; /* match end here (+1) */
921 state = dfa->states[0];
926 else if (c >= t->ch[0] && c <= t->ch[1])
928 state = dfa->states[t->to];
933 last_rule = state->rule_no;
938 last_rule = state->rule_nno;
950 static int execTok (struct lexSpec *spec, const char **src,
951 const char **tokBuf, int *tokLen)
953 const char *s = *src;
955 while (*s == ' ' || *s == '\t')
959 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
963 while (*s >= '0' && *s <= '9')
964 n = n*10 + (*s++ -'0');
965 if (spec->arg_no == 0)
972 if (n >= spec->arg_no)
974 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
981 while (*s && *s != '\"')
983 *tokLen = s - *tokBuf;
988 else if (*s == '\n' || *s == ';')
996 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
999 *tokLen = s - *tokBuf;
1006 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1009 *tokLen = s - *tokBuf;
1015 static char *regxStrz (const char *src, int len, char *str)
1019 memcpy (str, src, len);
1025 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1026 int argc, char **argv)
1028 struct lexSpec *spec = (struct lexSpec *) clientData;
1031 if (!strcmp(argv[1], "record") && argc == 3)
1033 char *absynName = argv[2];
1037 logf (LOG_LOG, "begin record %s", absynName);
1039 res = data1_mk_root (spec->dh, spec->m, absynName);
1041 spec->d1_stack[spec->d1_level++] = res;
1043 res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);
1045 spec->d1_stack[spec->d1_level++] = res;
1047 spec->d1_stack[spec->d1_level] = NULL;
1049 else if (!strcmp(argv[1], "element") && argc == 3)
1051 tagBegin (spec, argv[2], strlen(argv[2]));
1053 else if (!strcmp (argv[1], "variant") && argc == 5)
1055 variantBegin (spec, argv[2], strlen(argv[2]),
1056 argv[3], strlen(argv[3]),
1057 argv[4], strlen(argv[4]));
1059 else if (!strcmp (argv[1], "context") && argc == 3)
1061 struct lexContext *lc = spec->context;
1063 logf (LOG_LOG, "begin context %s",argv[2]);
1065 while (lc && strcmp (argv[2], lc->name))
1069 spec->context_stack[++(spec->context_stack_top)] = lc;
1072 logf (LOG_WARN, "unknown context %s", argv[2]);
1079 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1080 int argc, char **argv)
1082 struct lexSpec *spec = (struct lexSpec *) clientData;
1086 if (!strcmp (argv[1], "record"))
1088 while (spec->d1_level)
1090 tagDataRelease (spec);
1094 logf (LOG_LOG, "end record");
1096 spec->stop_flag = 1;
1098 else if (!strcmp (argv[1], "element"))
1102 if (argc >= 3 && !strcmp(argv[2], "-record"))
1111 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1112 if (spec->d1_level == 0)
1115 logf (LOG_LOG, "end element end records");
1117 spec->stop_flag = 1;
1120 else if (!strcmp (argv[1], "context"))
1123 logf (LOG_LOG, "end context");
1125 if (spec->context_stack_top)
1126 (spec->context_stack_top)--;
1133 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1134 int argc, char **argv)
1138 const char *element = 0;
1139 struct lexSpec *spec = (struct lexSpec *) clientData;
1143 if (!strcmp("-text", argv[argi]))
1148 else if (!strcmp("-element", argv[argi]))
1152 element = argv[argi++];
1158 tagBegin (spec, element, strlen(element));
1162 #if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0)
1164 char *native = Tcl_UtfToExternalDString(0, argv[argi], -1, &ds);
1165 execData (spec, native, strlen(native), textFlag);
1166 Tcl_DStringFree (&ds);
1168 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1173 tagEnd (spec, 1, NULL, 0);
/* Tcl command "unread" (registered in grs_read_tcl): set the read position
 * spec->ptr to the start of match argument <no>, plus an optional offset
 * given with -offset.  clientData is the owning lexSpec. */
1177 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1178 int argc, char **argv)
1180 struct lexSpec *spec = (struct lexSpec *) clientData;
1187 if (!strcmp("-offset", argv[argi]))
1192 offset = atoi(argv[argi]);
/* NOTE(review): only the upper bound of no is clamped; a negative number
 * from atoi, or arg_no == 0 (giving no == -1), would index before
 * arg_start -- confirm a lower-bound check exists. */
1201 no = atoi(argv[argi]);
1202 if (no >= spec->arg_no)
1203 no = spec->arg_no - 1;
1204 spec->ptr = spec->arg_start[no] + offset;
1208 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1212 for (i = 0; i < spec->arg_no; i++)
1214 char var_name[10], *var_buf;
1217 sprintf (var_name, "%d", i);
1218 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
1222 ch = var_buf[var_len];
1223 var_buf[var_len] = '\0';
1224 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1225 var_buf[var_len] = ch;
1228 #if HAVE_TCL_OBJECTS
1229 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1231 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1235 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1236 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1237 spec->tcl_interp->errorLine,
1238 spec->tcl_interp->result,
1239 err ? err : "[NO ERRORINFO]");
1245 static void execCode (struct lexSpec *spec, struct regxCode *code)
1247 const char *s = code->str;
1249 const char *cmd_str;
1251 r = execTok (spec, &s, &cmd_str, &cmd_len);
1258 r = execTok (spec, &s, &cmd_str, &cmd_len);
1261 p = regxStrz (cmd_str, cmd_len, ptmp);
1262 if (!strcmp (p, "begin"))
1264 r = execTok (spec, &s, &cmd_str, &cmd_len);
1267 logf (LOG_WARN, "missing keyword after 'begin'");
1270 p = regxStrz (cmd_str, cmd_len, ptmp);
1271 if (!strcmp (p, "record"))
1273 r = execTok (spec, &s, &cmd_str, &cmd_len);
1276 if (spec->d1_level == 0)
1278 static char absynName[64];
1283 memcpy (absynName, cmd_str, cmd_len);
1284 absynName[cmd_len] = '\0';
1286 logf (LOG_LOG, "begin record %s", absynName);
1288 res = data1_mk_root (spec->dh, spec->m, absynName);
1290 spec->d1_stack[spec->d1_level++] = res;
1292 res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);
1294 spec->d1_stack[spec->d1_level++] = res;
1296 spec->d1_stack[spec->d1_level] = NULL;
1298 r = execTok (spec, &s, &cmd_str, &cmd_len);
1300 else if (!strcmp (p, "element"))
1302 r = execTok (spec, &s, &cmd_str, &cmd_len);
1305 tagBegin (spec, cmd_str, cmd_len);
1306 r = execTok (spec, &s, &cmd_str, &cmd_len);
1308 else if (!strcmp (p, "variant"))
1311 const char *class_str = NULL;
1313 const char *type_str = NULL;
1315 const char *value_str = NULL;
1316 r = execTok (spec, &s, &cmd_str, &cmd_len);
1319 class_str = cmd_str;
1320 class_len = cmd_len;
1321 r = execTok (spec, &s, &cmd_str, &cmd_len);
1327 r = execTok (spec, &s, &cmd_str, &cmd_len);
1330 value_str = cmd_str;
1331 value_len = cmd_len;
1333 variantBegin (spec, class_str, class_len,
1334 type_str, type_len, value_str, value_len);
1337 r = execTok (spec, &s, &cmd_str, &cmd_len);
1339 else if (!strcmp (p, "context"))
1343 struct lexContext *lc = spec->context;
1344 r = execTok (spec, &s, &cmd_str, &cmd_len);
1345 p = regxStrz (cmd_str, cmd_len, ptmp);
1347 logf (LOG_LOG, "begin context %s", p);
1349 while (lc && strcmp (p, lc->name))
1352 spec->context_stack[++(spec->context_stack_top)] = lc;
1354 logf (LOG_WARN, "unknown context %s", p);
1357 r = execTok (spec, &s, &cmd_str, &cmd_len);
1361 logf (LOG_WARN, "bad keyword '%s' after begin", p);
1364 else if (!strcmp (p, "end"))
1366 r = execTok (spec, &s, &cmd_str, &cmd_len);
1369 logf (LOG_WARN, "missing keyword after 'end'");
1372 p = regxStrz (cmd_str, cmd_len, ptmp);
1373 if (!strcmp (p, "record"))
1375 while (spec->d1_level)
1377 tagDataRelease (spec);
1380 r = execTok (spec, &s, &cmd_str, &cmd_len);
1382 logf (LOG_LOG, "end record");
1384 spec->stop_flag = 1;
1386 else if (!strcmp (p, "element"))
1389 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1391 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1396 tagEnd (spec, min_level, cmd_str, cmd_len);
1397 r = execTok (spec, &s, &cmd_str, &cmd_len);
1400 tagEnd (spec, min_level, NULL, 0);
1401 if (spec->d1_level == 0)
1404 logf (LOG_LOG, "end element end records");
1406 spec->stop_flag = 1;
1410 else if (!strcmp (p, "context"))
1413 logf (LOG_LOG, "end context");
1415 if (spec->context_stack_top)
1416 (spec->context_stack_top)--;
1417 r = execTok (spec, &s, &cmd_str, &cmd_len);
1420 logf (LOG_WARN, "bad keyword '%s' after end", p);
1422 else if (!strcmp (p, "data"))
1426 const char *element_str = NULL;
1428 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1430 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1432 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1434 r = execTok (spec, &s, &element_str, &element_len);
1439 logf (LOG_WARN, "bad data option: %.*s",
1444 logf (LOG_WARN, "missing data item after data");
1448 tagBegin (spec, element_str, element_len);
1451 execData (spec, cmd_str, cmd_len,textFlag);
1452 r = execTok (spec, &s, &cmd_str, &cmd_len);
1455 tagEnd (spec, 1, NULL, 0);
1457 else if (!strcmp (p, "unread"))
1460 r = execTok (spec, &s, &cmd_str, &cmd_len);
1461 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1463 r = execTok (spec, &s, &cmd_str, &cmd_len);
1466 logf (LOG_WARN, "missing number after -offset");
1469 p = regxStrz (cmd_str, cmd_len, ptmp);
1471 r = execTok (spec, &s, &cmd_str, &cmd_len);
1477 logf (LOG_WARN, "missing index after unread command");
1480 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1482 logf (LOG_WARN, "bad index after unread command");
1487 no = *cmd_str - '0';
1488 if (no >= spec->arg_no)
1489 no = spec->arg_no - 1;
1490 spec->ptr = spec->arg_start[no] + offset;
1492 r = execTok (spec, &s, &cmd_str, &cmd_len);
1494 else if (!strcmp (p, "context"))
1498 struct lexContext *lc = spec->context;
1499 r = execTok (spec, &s, &cmd_str, &cmd_len);
1500 p = regxStrz (cmd_str, cmd_len, ptmp);
1502 while (lc && strcmp (p, lc->name))
1505 spec->context_stack[spec->context_stack_top] = lc;
1507 logf (LOG_WARN, "unknown context %s", p);
1510 r = execTok (spec, &s, &cmd_str, &cmd_len);
1514 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1515 r = execTok (spec, &s, &cmd_str, &cmd_len);
1520 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1522 r = execTok (spec, &s, &cmd_str, &cmd_len);
1529 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1530 int start_ptr, int *pptr)
1539 arg_start[0] = start_ptr;
1541 spec->arg_start = arg_start;
1542 spec->arg_end = arg_end;
1549 if (ap->u.pattern.body)
1551 arg_start[arg_no] = *pptr;
1552 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1554 arg_end[arg_no] = F_WIN_EOF;
1556 arg_start[arg_no] = F_WIN_EOF;
1557 arg_end[arg_no] = F_WIN_EOF;
1562 arg_end[arg_no] = sptr;
1564 arg_start[arg_no] = sptr;
1565 arg_end[arg_no] = *pptr;
1570 arg_start[arg_no] = *pptr;
1571 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1573 if (sptr != arg_start[arg_no])
1575 arg_end[arg_no] = *pptr;
1580 spec->arg_no = arg_no;
1583 if (spec->tcl_interp)
1584 execTcl(spec, ap->u.code);
1586 execCode (spec, ap->u.code);
1588 execCode (spec, ap->u.code);
1591 if (spec->stop_flag)
1595 arg_start[arg_no] = *pptr;
1596 arg_end[arg_no] = F_WIN_EOF;
1605 static int execRule (struct lexSpec *spec, struct lexContext *context,
1606 int ruleNo, int start_ptr, int *pptr)
1609 logf (LOG_LOG, "exec rule %d", ruleNo);
1611 return execAction (spec, context->fastRule[ruleNo]->actionList,
1615 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1617 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1618 struct DFA_state *state = context->dfa->states[0];
1621 unsigned char c_prev = '\n';
1623 int last_rule = 0; /* rule number of current match */
1624 int last_ptr = *ptr; /* last char of match */
1625 int start_ptr = *ptr; /* first char of match */
1626 int skip_ptr = *ptr; /* first char of run */
1630 c = f_win_advance (spec, ptr);
1631 if (*ptr == F_WIN_EOF)
1633 /* end of file met */
1636 /* there was a match */
1637 if (skip_ptr < start_ptr)
1639 /* deal with chars that didn't match */
1642 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1643 execDataP (spec, buf, size, 0);
1645 /* restore pointer */
1648 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1650 /* restore skip pointer */
1654 else if (skip_ptr < *ptr)
1656 /* deal with chars that didn't match */
1659 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1660 execDataP (spec, buf, size, 0);
1662 if (*ptr == F_WIN_EOF)
1669 { /* no transition for character c ... */
1672 if (skip_ptr < start_ptr)
1674 /* deal with chars that didn't match */
1677 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1678 execDataP (spec, buf, size, 0);
1680 /* restore pointer */
1682 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1684 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1687 logf (LOG_LOG, "regx: endf ptr=%d", *ptr);
1689 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
1693 context = spec->context_stack[spec->context_stack_top];
1696 last_ptr = start_ptr = *ptr;
1700 c_prev = f_win_advance (spec, &start_ptr);
1705 c_prev = f_win_advance (spec, &start_ptr);
1708 state = context->dfa->states[0];
1711 else if (c >= t->ch[0] && c <= t->ch[1])
1712 { /* transition ... */
1713 state = context->dfa->states[t->to];
1718 last_rule = state->rule_no;
1721 else if (state->rule_nno)
1723 last_rule = state->rule_nno;
1735 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1736 const char *context_name)
1738 struct lexContext *lt = spec->context;
1741 spec->stop_flag = 0;
1743 spec->context_stack_top = 0;
1746 if (!strcmp (lt->name, context_name))
1752 logf (LOG_WARN, "cannot find context %s", context_name);
1755 spec->context_stack[spec->context_stack_top] = lt;
1756 spec->d1_stack[spec->d1_level] = NULL;
1761 execAction (spec, lt->initActionList, ptr, &ptr);
1764 execAction (spec, lt->beginActionList, ptr, &ptr);
1765 lexNode (spec, &ptr);
1766 while (spec->d1_level)
1768 tagDataRelease (spec);
1771 execAction (spec, lt->endActionList, ptr, &ptr);
1772 return spec->d1_stack[0];
1775 void grs_destroy(void *clientData)
1777 struct lexSpecs *specs = (struct lexSpecs *) clientData;
1780 lexSpecDestroy(&specs->spec);
1785 void *grs_init(void)
1787 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
1792 data1_node *grs_read_regx (struct grs_read_info *p)
1795 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
1796 struct lexSpec **curLexSpec = &specs->spec;
1799 logf (LOG_LOG, "grs_read_regx");
1801 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
1804 lexSpecDestroy (curLexSpec);
1805 *curLexSpec = lexSpecCreate (p->type, p->dh);
1806 res = readFileSpec (*curLexSpec);
1809 lexSpecDestroy (curLexSpec);
1813 (*curLexSpec)->dh = p->dh;
1816 (*curLexSpec)->f_win_start = 0;
1817 (*curLexSpec)->f_win_end = 0;
1818 (*curLexSpec)->f_win_rf = p->readf;
1819 (*curLexSpec)->f_win_sf = p->seekf;
1820 (*curLexSpec)->f_win_fh = p->fh;
1821 (*curLexSpec)->f_win_ef = p->endf;
1822 (*curLexSpec)->f_win_size = 500000;
1824 (*curLexSpec)->m = p->mem;
1825 return lexRoot (*curLexSpec, p->offset, "main");
1828 static struct recTypeGrs regx_type = {
/* Non-static handle for the regx record type; points at regx_type above. */
1835 RecTypeGrs recTypeGrs_regx = &regx_type;
1838 data1_node *grs_read_tcl (struct grs_read_info *p)
1841 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
1842 struct lexSpec **curLexSpec = &specs->spec;
1845 logf (LOG_LOG, "grs_read_tcl");
1847 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
1849 Tcl_Interp *tcl_interp;
1851 lexSpecDestroy (curLexSpec);
1852 *curLexSpec = lexSpecCreate (p->type, p->dh);
1853 Tcl_FindExecutable("");
1854 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
1855 Tcl_Init(tcl_interp);
1856 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
1857 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
1858 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
1859 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
1861 res = readFileSpec (*curLexSpec);
1864 lexSpecDestroy (curLexSpec);
1868 (*curLexSpec)->dh = p->dh;
1871 (*curLexSpec)->f_win_start = 0;
1872 (*curLexSpec)->f_win_end = 0;
1873 (*curLexSpec)->f_win_rf = p->readf;
1874 (*curLexSpec)->f_win_sf = p->seekf;
1875 (*curLexSpec)->f_win_fh = p->fh;
1876 (*curLexSpec)->f_win_ef = p->endf;
1877 (*curLexSpec)->f_win_size = 500000;
1879 (*curLexSpec)->m = p->mem;
1880 return lexRoot (*curLexSpec, p->offset, "main");
1883 static struct recTypeGrs tcl_type = {
/* Non-static handle for the tcl record type; points at tcl_type above. */
1890 RecTypeGrs recTypeGrs_tcl = &tcl_type;