2 * Copyright (C) 1994-1998, Index Data I/S
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.17 1998-07-01 10:13:51 adam
10 * Revision 1.16 1998/06/30 15:15:09 adam
11 * Tags are trimmed: white space is removed before and after the tag.
13 * Revision 1.15 1998/06/30 12:55:45 adam
16 * Revision 1.14 1998/03/05 08:41:00 adam
17 * Implemented rule contexts.
19 * Revision 1.13 1997/12/12 06:33:58 adam
20 * Fixed bug that showed up when multiple filters were used.
21 * Made one routine thread-safe.
23 * Revision 1.12 1997/11/18 10:03:24 adam
24 * Member num_children removed from data1_node.
26 * Revision 1.11 1997/11/06 11:41:01 adam
27 * Implemented "begin variant" for the sgml.regx filter.
29 * Revision 1.10 1997/10/31 12:36:12 adam
30 * Minor change that avoids compiler warning.
32 * Revision 1.9 1997/09/29 09:02:49 adam
33 * Fixed small bug (introduced by previous commit).
35 * Revision 1.8 1997/09/17 12:19:22 adam
36 * Zebra version corresponds to YAZ version 1.4.
37 * Changed Zebra server so that it doesn't depend on global common_resource.
39 * Revision 1.7 1997/07/15 16:33:07 adam
40 * Check for zero length in execData.
42 * Revision 1.6 1997/02/24 10:41:51 adam
43 * Cleanup of code and commented out the "end element-end-record" code.
45 * Revision 1.5 1997/02/19 16:22:33 adam
46 * Fixed "end element" to terminate record in outer-most level.
48 * Revision 1.4 1997/02/12 20:42:58 adam
49 * Changed some log messages.
51 * Revision 1.3 1996/11/08 14:05:33 adam
52 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
54 * Revision 1.2 1996/10/29 14:02:09 adam
55 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
56 * data1_get_tabpath is used.
58 * Revision 1.1 1996/10/11 10:57:30 adam
59 * New module recctrl. Used to manage records (extract/retrieval).
61 * Revision 1.24 1996/06/17 14:25:31 adam
62 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
64 * Revision 1.23 1996/06/04 10:19:00 adam
65 * Minor changes - removed include of ctype.h.
67 * Revision 1.22 1996/06/03 15:23:13 adam
68 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
70 * Revision 1.21 1996/05/14 16:58:38 adam
73 * Revision 1.20 1996/05/01 13:46:36 adam
74 * First work on multiple records in one file.
75 * New option, -offset, to the "unread" command in the filter module.
77 * Revision 1.19 1996/02/12 16:18:20 adam
78 * Yet another bug fix in implementation of unread command.
80 * Revision 1.18 1996/02/12 16:07:54 adam
81 * Bug fix in new unread command.
83 * Revision 1.17 1996/02/12 15:56:11 adam
84 * New code command: unread.
86 * Revision 1.16 1996/01/17 14:57:51 adam
87 * Prototype changed for reader functions in extract/retrieve. File
88 * is identified by 'void *' instead of 'int'.
90 * Revision 1.15 1996/01/08 19:15:47 adam
91 * New input filter that works!
93 * Revision 1.14 1996/01/08 09:10:38 adam
94 * Yet another complete rework on this module.
96 * Revision 1.13 1995/12/15 17:21:50 adam
97 * This version is able to set data.formatted_text in data1-nodes.
99 * Revision 1.12 1995/12/15 16:20:10 adam
100 * The filter files (*.flt) are read from the path given by data1_tabpath.
102 * Revision 1.11 1995/12/15 12:35:16 adam
105 * Revision 1.10 1995/12/15 10:35:36 adam
108 * Revision 1.9 1995/12/14 16:38:48 adam
109 * Completely new attempt to make regular expression parsing.
111 * Revision 1.8 1995/12/13 17:16:59 adam
114 * Revision 1.7 1995/12/13 16:51:58 adam
115 * Modified to set last_child in data1_nodes.
116 * Uses destroy handler to free up data text nodes.
118 * Revision 1.6 1995/12/13 13:45:37 quinn
119 * Changed data1 to use nmem.
121 * Revision 1.5 1995/12/11 09:12:52 adam
122 * The rec_get function returns NULL if record doesn't exist - will
123 * happen in the server if the result set records have been deleted since
124 * the creation of the set (i.e. the search).
125 * The server saves a result temporarily if it is 'volatile', i.e. the
126 * set is register dependent.
128 * Revision 1.4 1995/12/05 16:57:40 adam
129 * More work on regular patterns.
131 * Revision 1.3 1995/12/05 09:37:09 adam
132 * One malloc was renamed to xmalloc.
134 * Revision 1.2 1995/12/04 17:59:24 adam
135 * More work on regular expression conversion.
137 * Revision 1.1 1995/12/04 14:25:30 adam
138 * Started work on regular expression parsed input to structured records.
147 #include <zebrautl.h>
/* Sentinel position: the scanning code (f_win_advance, tryMatch, lexNode)
 * compares read positions against this value to detect end of input. */
153 #define F_WIN_EOF 2000000000
/* Token/action kind codes. REGX_PATTERN is the kind named next to the dfa
 * member of struct lexRuleAction; REGX_CONTEXT is the token readOneSpec
 * tests for to start a new context. */
157 #define REGX_PATTERN 1
162 #define REGX_CONTEXT 6
/* One step in a rule's action list. Only part of the declaration is visible
 * in this view; per the member comments the dfa member belongs to
 * REGX_PATTERN steps and the code member to REGX_CODE steps. Steps are
 * chained through next. */
168 struct lexRuleAction {
172 struct DFA *dfa; /* REGX_PATTERN */
175 struct regxCode *code; /* REGX_CODE */
177 struct lexRuleAction *next;
/* Head of the action list run for a rule; execRule reads it through
 * fastRule[ruleNo]->actionList. (Enclosing declaration not visible here.) */
182 struct lexRuleAction *actionList;
/* Rule record members: the embedded rule info (number and action list) and
 * the link to the next rule; readOneSpec prepends new rules to this list. */
186 struct lexRuleInfo info;
187 struct lexRule *next;
/* Members of a lexical context (the declaration header is not visible in
 * this view). */
193 struct lexRule *rules; /* rule list; readOneSpec inserts at the head */
194 struct lexRuleInfo **fastRule; /* rule info indexed by rule number; filled in readFileSpec */
197 struct lexRuleAction *beginActionList; /* run by lexRoot before scanning */
198 struct lexRuleAction *endActionList; /* run by lexRoot after scanning */
199 struct lexContext *next; /* next context in the spec's list */
/* Members of the filter specification / scanner state (the declaration
 * header is not visible in this view). */
204 struct lexContext *context; /* head of the list of contexts */
/* Stack of active contexts; lexNode uses the entry at context_stack_top. */
206 struct lexContext **context_stack;
207 int context_stack_size; /* number of entries allocated (see lexSpecCreate) */
208 int context_stack_top; /* index of the current context */
/* Optional callback; lexNode calls it with the file handle and the current
 * position after a rule has executed. */
214 void (*f_win_ef)(void *, off_t);
216 int f_win_start; /* first byte of buffer is this file offset */
217 int f_win_end; /* last byte of buffer is this offset - 1 */
218 int f_win_size; /* size of buffer */
219 char *f_win_buf; /* buffer itself */
220 int (*f_win_rf)(void *, char *, size_t); /* read callback used to fill the buffer */
221 off_t (*f_win_sf)(void *, off_t); /* seek callback used before a full refill */
/*
 * f_win_get: return a pointer to the buffered bytes for the file range
 * [start_pos, end_pos) and store the number of usable bytes in *size.
 * (Several lines of this function are not visible in this view.)
 *   - Range already inside the window: pointer into the existing buffer.
 *   - start_pos outside the window: seek with f_win_sf and refill the
 *     buffer from its beginning with f_win_rf.
 *   - Otherwise: slide the tail of the window to the front of the buffer
 *     and read more data with f_win_rf.
 * In the refill paths *size is clipped to end_pos - start_pos.
 * NOTE(review): off and the window offsets are int while the positions are
 * off_t -- presumably files beyond INT_MAX are not expected; confirm.
 */
226 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
229 int i, r, off = start_pos - spec->f_win_start;
/* fast path: requested range lies entirely within the current window */
231 if (off >= 0 && end_pos <= spec->f_win_end)
233 *size = end_pos - start_pos;
234 return spec->f_win_buf + off;
/* start is before or past the window: reposition and reload from scratch */
236 if (off < 0 || start_pos >= spec->f_win_end)
238 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
239 spec->f_win_start = start_pos;
/* the buffer is allocated on first use */
241 if (!spec->f_win_buf)
242 spec->f_win_buf = xmalloc (spec->f_win_size);
243 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
245 spec->f_win_end = spec->f_win_start + *size;
247 if (*size > end_pos - start_pos)
248 *size = end_pos - start_pos;
249 return spec->f_win_buf;
/* partial overlap: move the bytes from start_pos up to the window end down
 * to the start of the buffer, then read more data */
251 for (i = 0; i<spec->f_win_end - start_pos; i++)
252 spec->f_win_buf[i] = spec->f_win_buf[i + off];
253 r = (*spec->f_win_rf)(spec->f_win_fh,
255 spec->f_win_size - i);
256 spec->f_win_start = start_pos;
257 spec->f_win_end += r;
259 if (*size > end_pos - start_pos)
260 *size = end_pos - start_pos;
261 return spec->f_win_buf;
/*
 * f_win_advance: read the byte at *pos and advance *pos by one.
 * When *pos is inside the current window the byte comes straight from the
 * buffer; otherwise *pos is checked against F_WIN_EOF and a one-byte range
 * is requested through f_win_get. The handling after those two steps is not
 * visible in this view.
 */
264 static int f_win_advance (struct lexSpec *spec, int *pos)
269 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
270 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
271 if (*pos == F_WIN_EOF)
273 buf = f_win_get (spec, *pos, *pos+1, &size);
/* regxCodeDel: takes the address of a regxCode pointer and starts from the
 * object it refers to. The rest of the body is not visible in this view --
 * presumably it releases the object; confirm against the full source. */
283 static void regxCodeDel (struct regxCode **pp)
285 struct regxCode *p = *pp;
/* regxCodeMk: allocate a regxCode object and give it a private copy of the
 * len bytes at buf (len+1 bytes are allocated for the string). The
 * terminator and the store through pp are not visible in this view. */
294 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
298 p = xmalloc (sizeof(*p));
299 p->str = xmalloc (len+1);
300 memcpy (p->str, buf, len);
/* lexSpecDFA: set up a DFA for parsing filter patterns. The visible part
 * adjusts the parser's character map: the entries for space and tab are
 * deleted and an entry for '/' with value 0 is added. Creation and return
 * of the DFA are not visible in this view. */
305 static struct DFA *lexSpecDFA (void)
310 dfa_parse_cmap_del (dfa, ' ');
311 dfa_parse_cmap_del (dfa, '\t');
312 dfa_parse_cmap_add (dfa, '/', 0);
/* actionListDel: walk the action list starting at *rap and release what
 * each step owns: the DFA of a pattern step (dfa_delete) or the code of a
 * code step (regxCodeDel). ra1 is the loop's successor pointer. The
 * per-kind dispatch and the freeing of the nodes themselves are not visible
 * in this view. */
316 static void actionListDel (struct lexRuleAction **rap)
318 struct lexRuleAction *ra1, *ra;
320 for (ra = *rap; ra; ra = ra1)
326 dfa_delete (&ra->u.pattern.dfa);
329 regxCodeDel (&ra->u.code);
/* lexContextCreate: allocate a context with a private copy of name, a fresh
 * pattern DFA from lexSpecDFA and empty begin/end action lists. Other
 * initialisation and the return statement are not visible in this view. */
337 static struct lexContext *lexContextCreate (const char *name)
339 struct lexContext *p = xmalloc (sizeof(*p));
341 p->name = xstrdup (name);
343 p->dfa = lexSpecDFA ();
346 p->beginActionList = NULL;
347 p->endActionList = NULL;
/* lexContextDestroy: release a context. Visible steps: delete the action
 * list of every rule in p->rules, then delete the begin and end action
 * lists. Freeing of the rules, the DFA, the name and p itself is not
 * visible in this view. */
352 static void lexContextDestroy (struct lexContext *p)
354 struct lexRule *rp, *rp1;
357 for (rp = p->rules; rp; rp = rp1)
360 actionListDel (&rp->info.actionList);
363 actionListDel (&p->beginActionList);
364 actionListDel (&p->endActionList);
/* lexSpecCreate: allocate a filter specification, copy name into it and
 * allocate a context stack with room for 100 context pointers. Remaining
 * initialisation and the return are not visible in this view. */
369 static struct lexSpec *lexSpecCreate (const char *name)
373 p = xmalloc (sizeof(*p));
374 p->name = xmalloc (strlen(name)+1);
375 strcpy (p->name, name);
378 p->context_stack_size = 100;
379 p->context_stack = xmalloc (sizeof(*p->context_stack) *
380 p->context_stack_size);
/* lexSpecDestroy: tear down the specification referenced by *pp. Visible
 * steps: destroy each context in the list (the next pointer is saved before
 * the context is destroyed), then free the window buffer and the context
 * stack. Handling of a NULL *pp and the final free are not visible in this
 * view. */
385 static void lexSpecDestroy (struct lexSpec **pp)
388 struct lexContext *lt;
397 struct lexContext *lt_next = lt->next;
398 lexContextDestroy (lt);
402 xfree (p->f_win_buf);
403 xfree (p->context_stack);
/* readParseToken: read the next token of a filter specification starting at
 * *cpp. Leading spaces, tabs and newlines are skipped. For keyword tokens
 * the letters are collected into cmd with upper case folded to lower case
 * (bounded by sizeof(cmd)); the recognised keywords are "begin", "end",
 * "body" and "context", anything else is logged as a bad command. The
 * returned token codes and the updates of *cpp and *len are not visible in
 * this view. */
408 static int readParseToken (const char **cpp, int *len)
410 const char *cp = *cpp;
414 while (*cp == ' ' || *cp == '\t' || *cp == '\n')
443 if (*cp >= 'a' && *cp <= 'z')
445 else if (*cp >= 'A' && *cp <= 'Z')
446 cmd[i] = *cp + 'a' - 'A';
449 if (i < sizeof(cmd)-2)
456 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
/* skip the remainder of the offending word */
458 while (*cp && *cp != ' ' && *cp != '\t' && *cp != '\n')
464 if (!strcmp (cmd, "begin"))
466 else if (!strcmp (cmd, "end"))
468 else if (!strcmp (cmd, "body"))
470 else if (!strcmp (cmd, "context"))
474 logf (LOG_WARN, "bad command %s", cmd);
/* actionListMk: parse the action part s of a rule into a list of steps
 * stored through ap. Tokens come from readParseToken until it returns 0.
 * Visible cases: a code step (text stored with regxCodeMk), a pattern step
 * (own DFA from lexSpecDFA, parsed with dfa_parse and finished with
 * dfa_mkstate, with the body flag taken from bodyMark) and a warning when
 * "begin" appears here. The token dispatch, list linking and return value
 * are not visible in this view. */
480 static int actionListMk (struct lexSpec *spec, const char *s,
481 struct lexRuleAction **ap)
487 while ((tok = readParseToken (&s, &len)))
495 *ap = xmalloc (sizeof(**ap));
497 regxCodeMk (&(*ap)->u.code, s, len);
501 *ap = xmalloc (sizeof(**ap));
503 (*ap)->u.pattern.body = bodyMark;
505 (*ap)->u.pattern.dfa = lexSpecDFA ();
507 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
512 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
515 dfa_mkstate ((*ap)->u.pattern.dfa);
519 logf (LOG_WARN, "cannot use begin here");
522 *ap = xmalloc (sizeof(**ap));
/* readOneSpec: parse one logical specification line s and add it to spec.
 * Visible cases:
 *   - CONTEXT <name>: create a context of that name and put it at the head
 *     of spec's context list;
 *   - a "main" context is created via lexContextCreate (the condition is
 *     not visible in this view);
 *   - begin / end: replace the current context's begin or end action list;
 *   - otherwise: parse a pattern into the current context's DFA and prepend
 *     a new rule (numbered from ruleNo) whose actions are parsed by
 *     actionListMk.
 * The return value is not visible in this view. */
532 int readOneSpec (struct lexSpec *spec, const char *s)
536 struct lexContext *lc;
538 tok = readParseToken (&s, &len);
539 if (tok == REGX_CONTEXT)
541 char context_name[32];
542 tok = readParseToken (&s, &len);
543 if (tok != REGX_CODE)
545 logf (LOG_WARN, "missing name after CONTEXT keyword");
/* NOTE(review): len bytes are copied into a 32-byte buffer; a length clamp
 * is not visible in this view -- confirm one precedes this copy. */
550 memcpy (context_name, s, len);
551 context_name[len] = '\0';
552 lc = lexContextCreate (context_name);
553 lc->next = spec->context;
558 spec->context = lexContextCreate ("main");
/* begin: drop any previous list before parsing the new one */
563 actionListDel (&spec->context->beginActionList);
564 actionListMk (spec, s, &spec->context->beginActionList);
/* end: same treatment for the end action list */
567 actionListDel (&spec->context->endActionList);
568 actionListMk (spec, s, &spec->context->endActionList);
572 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
574 r = dfa_parse (spec->context->dfa, &s);
577 logf (LOG_WARN, "regular expression error. r=%d", r);
582 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* register the rule at the head of the context's rule list */
586 rp = xmalloc (sizeof(*rp));
587 rp->info.no = spec->context->ruleNo++;
588 rp->next = spec->context->rules;
589 spec->context->rules = rp;
590 actionListMk (spec, s, &rp->info.actionList);
/* readFileSpec: load the filter file "<spec->name>.flt", located through
 * yaz_path_fopen with the path from data1_get_tabpath, and feed each
 * assembled specification to readOneSpec. Afterwards, for every context, a
 * fastRule table with ruleNo entries is built (rule number -> rule info)
 * and the context's DFA is finished with dfa_mkstate.
 * The line-assembly loop is only partly visible in this view: it tests the
 * first character of a line for '#', newline, space or tab, skips to end
 * of line, and distinguishes lines that start with space/tab. The return
 * value is not visible either. */
595 int readFileSpec (struct lexSpec *spec)
597 struct lexContext *lc;
600 int c, i, errors = 0;
603 lineBuf = xmalloc (1+lineSize);
604 logf (LOG_LOG, "reading regx filter %s.flt", spec->name);
/* lineBuf doubles as scratch space for the file name.
 * NOTE(review): unbounded sprintf -- assumes the name fits in lineSize. */
605 sprintf (lineBuf, "%s.flt", spec->name);
606 if (!(spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh),
609 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
618 if (c == '#' || c == '\n' || c == ' ' || c == '\t')
620 while (c != '\n' && c != EOF)
639 if (c != ' ' && c != '\t')
648 readOneSpec (spec, lineBuf);
649 spec->lineNo += addLine;
658 debug_dfa_followpos = 1;
/* build the rule-number lookup table and finish the DFA of each context */
661 for (lc = spec->context; lc; lc = lc->next)
664 lc->fastRule = xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
665 for (i = 0; i < lc->ruleNo; i++)
666 lc->fastRule[i] = NULL;
667 for (rp = lc->rules; rp; rp = rp->next)
668 lc->fastRule[rp->info.no] = &rp->info;
669 dfa_mkstate (lc->dfa);
/* Specification kept between calls of grs_read_regx; it is reused while the
 * requested type matches its name and rebuilt otherwise. */
676 static struct lexSpec *curLexSpec = NULL;
/* destroy_data: destroy handler installed by execData on data nodes whose
 * text buffer was allocated with xmalloc; it asserts that n is a data node
 * and frees that buffer. */
678 static void destroy_data (struct data1_node *n)
680 assert (n->which == DATA1N_data);
681 xfree (n->u.data.data);
/* execData: add elen bytes of text at ebuf to the record tree under the
 * node at d1_stack[*d1_level - 1].
 * If the node currently at this level is already a data node the text is
 * appended to it; otherwise a new text data node is created, linked as the
 * parent's last child (and as the next sibling of the previous node at
 * this level) and stored in d1_stack[*d1_level].
 * Empty input is special-cased first; some conditions and branches are not
 * visible in this view. */
684 static void execData (struct lexSpec *spec,
685 data1_node **d1_stack, int *d1_level,
686 const char *ebuf, int elen, int formatted_text)
688 struct data1_node *res, *parent;
690 if (elen == 0) /* shouldn't happen, but it does! */
694 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
695 ebuf, 15, ebuf + elen-15);
697 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
699 logf (LOG_DEBUG, "data (%d bytes)", elen);
705 parent = d1_stack[*d1_level -1];
/* append case: the previous node at this level is a data node */
707 if ((res=d1_stack[*d1_level]) && res->which == DATA1N_data)
/* combined text still fits within DATA1_LOCALDATA: extend in place */
709 if (elen + res->u.data.len <= DATA1_LOCALDATA)
710 memcpy (res->u.data.data + res->u.data.len, ebuf, elen);
/* otherwise build a larger xmalloc'ed buffer and register destroy_data.
 * NOTE(review): no release of a previously xmalloc'ed buffer is visible
 * here -- possible leak on repeated growth; confirm. */
713 char *nb = xmalloc (elen + res->u.data.len);
714 memcpy (nb, res->u.data.data, res->u.data.len);
715 memcpy (nb + res->u.data.len, ebuf, elen);
716 res->u.data.data = nb;
717 res->destroy = destroy_data;
719 res->u.data.len += elen;
/* new-node case */
723 res = data1_mk_node (spec->dh, spec->m);
724 res->parent = parent;
725 res->which = DATA1N_data;
726 res->u.data.what = DATA1I_text;
727 res->u.data.len = elen;
728 res->u.data.formatted_text = formatted_text;
/* large text goes to nmem memory, small text into the node's own lbuf */
729 if (elen > DATA1_LOCALDATA)
730 res->u.data.data = nmem_malloc (spec->m, elen);
732 res->u.data.data = res->lbuf;
733 memcpy (res->u.data.data, ebuf, elen);
734 res->root = parent->root;
736 parent->last_child = res;
737 if (d1_stack[*d1_level])
738 d1_stack[*d1_level]->next = res;
741 d1_stack[*d1_level] = res;
/* execDataP: entry point used by lexNode for unmatched text; the visible
 * body forwards all arguments unchanged to execData. */
745 static void execDataP (struct lexSpec *spec,
746 data1_node **d1_stack, int *d1_level,
747 const char *ebuf, int elen, int formatted_text)
749 execData (spec, d1_stack, d1_level, ebuf, elen, formatted_text);
/* variantBegin: open a variant in the record tree.
 * class_str/type_str (with lengths) name the variant class and type and are
 * copied, truncated to DATA1_MAX_SYMBOL-1 characters, into local buffers
 * for the lookup with data1_getvartypebyct; value_str is the variant value,
 * stored truncated to DATA1_LOCALDATA-1 characters in the node's lbuf.
 * If the parent is not already a variant node, an empty variant node
 * (type 0, value 0) is inserted first. A new variant node with the looked-up
 * type is then appended and pushed on d1_stack. The lookup result handling
 * and the body of the search loop over enclosing variant nodes are not
 * visible in this view. */
752 static void variantBegin (struct lexSpec *spec,
753 data1_node **d1_stack, int *d1_level,
754 const char *class_str, int class_len,
755 const char *type_str, int type_len,
756 const char *value_str, int value_len)
758 struct data1_node *parent = d1_stack[*d1_level -1];
759 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
766 logf (LOG_WARN, "in variant begin. No record type defined");
/* NUL-terminated, length-limited copies of class and type */
769 if (class_len >= DATA1_MAX_SYMBOL)
770 class_len = DATA1_MAX_SYMBOL-1;
771 memcpy (tclass, class_str, class_len);
772 tclass[class_len] = '\0';
774 if (type_len >= DATA1_MAX_SYMBOL)
775 type_len = DATA1_MAX_SYMBOL-1;
776 memcpy (ttype, type_str, type_len);
777 ttype[type_len] = '\0';
780 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype, *d1_level);
784 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* first variant below a non-variant parent: insert an empty variant node */
788 if (parent->which != DATA1N_variant)
790 res = data1_mk_node (spec->dh, spec->m);
791 res->parent = parent;
792 res->which = DATA1N_variant;
793 res->u.variant.type = 0;
794 res->u.variant.value = 0;
795 res->root = parent->root;
797 parent->last_child = res;
798 if (d1_stack[*d1_level])
799 d1_stack[*d1_level]->next = res;
802 d1_stack[*d1_level] = res;
803 d1_stack[++(*d1_level)] = NULL;
/* look upwards through the enclosing variant nodes for the same type */
805 for (i = *d1_level-1; d1_stack[i]->which == DATA1N_variant; i--)
806 if (d1_stack[i]->u.variant.type == tp)
813 logf (LOG_DEBUG, "variant node (%d)", *d1_level);
/* create the variant node proper */
815 parent = d1_stack[*d1_level-1];
816 res = data1_mk_node (spec->dh, spec->m);
817 res->parent = parent;
818 res->which = DATA1N_variant;
819 res->root = parent->root;
820 res->u.variant.type = tp;
822 if (value_len >= DATA1_LOCALDATA)
823 value_len =DATA1_LOCALDATA-1;
824 memcpy (res->lbuf, value_str, value_len);
825 res->lbuf[value_len] = '\0';
827 res->u.variant.value = res->lbuf;
829 parent->last_child = res;
830 if (d1_stack[*d1_level])
831 d1_stack[*d1_level]->next = res;
834 d1_stack[*d1_level] = res;
835 d1_stack[++(*d1_level)] = NULL;
/* tagStrip: trim white space around a tag given as pointer + length. The
 * first loop scans backwards over trailing white space, the second forwards
 * over leading white space; the updates of *tag and *len are not visible in
 * this view.
 * NOTE(review): isspace is called with a plain char; consider a cast to
 * unsigned char for bytes >= 0x80 -- confirm against the ctype.h contract. */
838 static void tagStrip (const char **tag, int *len)
842 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
845 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/* tagBegin: open an element named by tag/len in the record tree.
 * The tag is trimmed with tagStrip, truncated to DATA1_LOCALDATA-1
 * characters and stored in the new node's lbuf. The element definition is
 * looked up with data1_getelementbytagname against the abstract syntax of
 * the root node (d1_stack[0]). The node is linked as the parent's last
 * child and pushed on d1_stack, and the next level is cleared.
 * A warning is logged when no record type is defined; the condition and the
 * special handling for variant parents are only partly visible in this
 * view. */
851 static void tagBegin (struct lexSpec *spec,
852 data1_node **d1_stack, int *d1_level,
853 const char *tag, int len)
855 struct data1_node *parent = d1_stack[*d1_level -1];
856 data1_element *elem = NULL;
857 data1_node *partag = get_parent_tag(spec->dh, parent);
859 data1_element *e = NULL;
864 logf (LOG_WARN, "in element begin. No record type defined");
867 tagStrip (&tag, &len);
869 res = data1_mk_node (spec->dh, spec->m);
870 res->parent = parent;
871 res->which = DATA1N_tag;
872 res->u.tag.get_bytes = -1;
/* keep a NUL-terminated, length-limited copy of the tag name in the node */
874 if (len >= DATA1_LOCALDATA)
875 len = DATA1_LOCALDATA-1;
876 memcpy (res->lbuf, tag, len);
877 res->lbuf[len] = '\0';
878 res->u.tag.tag = res->lbuf;
881 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, *d1_level);
883 if (parent->which == DATA1N_variant)
886 if (!(e = partag->u.tag.element))
889 elem = data1_getelementbytagname (spec->dh, d1_stack[0]->u.root.absyn,
891 res->u.tag.element = elem;
892 res->u.tag.node_selected = 0;
893 res->u.tag.make_variantlist = 0;
894 res->u.tag.no_data_requested = 0;
895 res->root = parent->root;
897 parent->last_child = res;
898 if (d1_stack[*d1_level])
899 d1_stack[*d1_level]->next = res;
902 d1_stack[*d1_level] = res;
903 d1_stack[++(*d1_level)] = NULL;
/* tagEnd: close elements. The tag is trimmed with tagStrip and the stack
 * is examined while *d1_level is above 1, looking for a tag node whose
 * name has exactly len characters equal to tag. Callers also pass
 * tag == NULL with len 0. The level adjustment, the loop exit and the
 * remaining part of the match condition are not visible in this view. */
906 static void tagEnd (struct lexSpec *spec,
907 data1_node **d1_stack, int *d1_level,
908 const char *tag, int len)
910 tagStrip (&tag, &len);
911 while (*d1_level > 1)
914 if ((d1_stack[*d1_level]->which == DATA1N_tag) &&
916 (strlen(d1_stack[*d1_level]->u.tag.tag) == (size_t) len &&
917 !memcmp (d1_stack[*d1_level]->u.tag.tag, tag, len))))
921 logf (LOG_DEBUG, "end tag (%d)", *d1_level);
/* tryMatch: run dfa over the input starting at *pptr.
 * Characters are fetched with f_win_advance; a transition is taken when the
 * character lies in [t->ch[0], t->ch[1]], and the rule number of the state
 * reached (rule_no or rule_nno) is remembered as the latest match. On
 * completion *mptr receives the start of the match and *pptr the position
 * just past its end. End-of-input handling, the restart at state 0 and the
 * return value are only partly visible in this view. */
926 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
929 struct DFA_state *state = dfa->states[0];
932 unsigned char c_prev = 0;
933 int ptr = *pptr; /* current pointer */
934 int start_ptr = *pptr; /* first char of match */
935 int last_ptr = 0; /* last char of match */
936 int last_rule = 0; /* rule number of current match */
941 c = f_win_advance (spec, &ptr);
942 if (ptr == F_WIN_EOF)
959 *mptr = start_ptr; /* match starts here */
960 *pptr = last_ptr; /* match end here (+1) */
963 state = dfa->states[0];
968 else if (c >= t->ch[0] && c <= t->ch[1])
970 state = dfa->states[t->to];
975 last_rule = state->rule_no;
980 last_rule = state->rule_nno;
/* execTok: fetch the next token of an action code string at *src and
 * report it through *tokBuf / *tokLen.
 * After skipping spaces and tabs the visible cases are:
 *   - "$N": the text of argument N, read from the input window via
 *     f_win_get using arg_start[N] and arg_end[N];
 *   - a double-quoted string;
 *   - newline or ';', which ends a command;
 *   - any other word, running up to white space, newline or ';'.
 * The return codes are not visible in this view; callers in execCode treat
 * the value 3 as an option token. */
992 static int execTok (struct lexSpec *spec, const char **src,
993 int arg_no, int *arg_start, int *arg_end,
994 const char **tokBuf, int *tokLen)
996 const char *s = *src;
998 while (*s == ' ' || *s == '\t')
1002 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1006 while (*s >= '0' && *s <= '9')
1007 n = n*10 + (*s++ -'0');
1017 *tokBuf = f_win_get (spec, arg_start[n], arg_end[n], tokLen);
1020 else if (*s == '\"')
1023 while (*s && *s != '\"')
1025 *tokLen = s - *tokBuf;
1030 else if (*s == '\n' || *s == ';')
1038 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1040 *tokLen = s - *tokBuf;
1047 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1049 *tokLen = s - *tokBuf;
/* regxStrz: copy len bytes from src into the caller-supplied buffer str.
 * Callers compare the returned pointer with strcmp, so the result is
 * presumably NUL-terminated; termination, any length limit and the return
 * are not visible in this view. */
1055 static char *regxStrz (const char *src, int len, char *str)
1059 memcpy (str, src, len);
/* execCode: interpret the action code attached to a rule.
 * The code string is tokenised with execTok; "$N" tokens refer to the
 * matched pieces described by arg_start/arg_end (arg_no of them). Commands
 * visible in this view:
 *   begin record|element|variant|context
 *   end record|element|context
 *   data [-text] [-element <name>] <items...>
 *   unread [-offset <n>] <index>   (moves *pptr back into the input)
 *   context <name>                 (replaces the current context)
 * Unknown commands and stray tokens are logged as warnings. The loop
 * structure, several conditions and the return value are not visible in
 * this view. */
1064 static int execCode (struct lexSpec *spec,
1065 int arg_no, int *arg_start, int *arg_end, int *pptr,
1066 struct regxCode *code,
1067 data1_node **d1_stack, int *d1_level)
1069 const char *s = code->str;
1072 const char *cmd_str;
1074 r = execTok (spec, &s, arg_no, arg_start, arg_end, &cmd_str, &cmd_len);
1081 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1082 &cmd_str, &cmd_len);
1085 p = regxStrz (cmd_str, cmd_len, ptmp);
/* ---- begin ... ---- */
1086 if (!strcmp (p, "begin"))
1088 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1089 &cmd_str, &cmd_len);
1092 logf (LOG_WARN, "missing keyword after 'begin'");
1095 p = regxStrz (cmd_str, cmd_len, ptmp);
/* begin record <absyn>: create the root node of a new record */
1096 if (!strcmp (p, "record"))
1098 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1099 &cmd_str, &cmd_len);
/* static storage: the root node keeps a pointer to this name.
 * NOTE(review): a clamp of cmd_len to the 64-byte buffer is not visible in
 * this view -- confirm one precedes the copy. */
1104 static char absynName[64];
1109 memcpy (absynName, cmd_str, cmd_len);
1110 absynName[cmd_len] = '\0';
1113 logf (LOG_DEBUG, "begin record %s", absynName);
1115 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1116 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1121 res = data1_mk_node (spec->dh, spec->m);
1122 res->which = DATA1N_root;
1123 res->u.root.type = absynName;
1124 res->u.root.absyn = absyn;
1127 d1_stack[*d1_level] = res;
1128 d1_stack[++(*d1_level)] = NULL;
1131 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1132 &cmd_str, &cmd_len);
/* begin element <tag> */
1134 else if (!strcmp (p, "element"))
1136 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1137 &cmd_str, &cmd_len);
1140 tagBegin (spec, d1_stack, d1_level, cmd_str, cmd_len);
1141 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1142 &cmd_str, &cmd_len);
/* begin variant <class> <type> <value> */
1144 else if (!strcmp (p, "variant"))
1147 const char *class_str = NULL;
1149 const char *type_str = NULL;
1151 const char *value_str = NULL;
1152 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1153 &cmd_str, &cmd_len);
1156 class_str = cmd_str;
1157 class_len = cmd_len;
1158 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1159 &cmd_str, &cmd_len);
1165 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1166 &cmd_str, &cmd_len);
1169 value_str = cmd_str;
1170 value_len = cmd_len;
1172 variantBegin (spec, d1_stack, d1_level, class_str, class_len,
1173 type_str, type_len, value_str, value_len);
1176 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1177 &cmd_str, &cmd_len);
/* begin context <name>: push the named context on the context stack.
 * NOTE(review): no check against context_stack_size is visible here. */
1179 else if (!strcmp (p, "context"))
1183 struct lexContext *lc = spec->context;
1184 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1185 &cmd_str, &cmd_len);
1186 p = regxStrz (cmd_str, cmd_len, ptmp);
1188 logf (LOG_DEBUG, "begin context %s", p);
1190 while (lc && strcmp (p, lc->name))
1193 spec->context_stack[++(spec->context_stack_top)] = lc;
1195 logf (LOG_WARN, "unknown context %s", p);
1198 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1199 &cmd_str, &cmd_len);
1203 logf (LOG_WARN, "bad keyword '%s' after begin", p);
/* ---- end ... ---- */
1206 else if (!strcmp (p, "end"))
1208 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1209 &cmd_str, &cmd_len);
1212 logf (LOG_WARN, "missing keyword after 'end'");
1215 p = regxStrz (cmd_str, cmd_len, ptmp);
1216 if (!strcmp (p, "record"))
1219 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1220 &cmd_str, &cmd_len);
1222 logf (LOG_DEBUG, "end record");
/* end element [<tag>]: both a named and an unnamed (NULL, 0) tagEnd call
 * appear below; the condition selecting between them is not visible */
1226 else if (!strcmp (p, "element"))
1228 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1229 &cmd_str, &cmd_len);
1239 tagEnd (spec, d1_stack, d1_level, cmd_str, cmd_len);
1240 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1241 &cmd_str, &cmd_len);
1244 tagEnd (spec, d1_stack, d1_level, NULL, 0);
/* end context: pop the context stack, never below entry 0 */
1246 else if (!strcmp (p, "context"))
1249 logf (LOG_DEBUG, "end context");
1251 if (spec->context_stack_top)
1252 (spec->context_stack_top)--;
1253 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1254 &cmd_str, &cmd_len);
1257 logf (LOG_WARN, "bad keyword '%s' after end", p);
/* ---- data: options first (tokens with code 3), then the data items ---- */
1259 else if (!strcmp (p, "data"))
1263 const char *element_str = NULL;
1265 while ((r = execTok (spec, &s, arg_no, arg_start, arg_end,
1266 &cmd_str, &cmd_len)) == 3)
1268 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1270 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1272 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1273 &element_str, &element_len);
1278 logf (LOG_WARN, "bad data option: %.*s",
1283 logf (LOG_WARN, "missing data item after data");
/* with -element the data is wrapped in tagBegin ... tagEnd */
1287 tagBegin (spec, d1_stack, d1_level, element_str, element_len);
1290 execData (spec, d1_stack, d1_level, cmd_str, cmd_len,
1292 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1293 &cmd_str, &cmd_len);
1296 tagEnd (spec, d1_stack, d1_level, NULL, 0);
/* ---- unread [-offset n] <index>: reposition the input pointer ---- */
1298 else if (!strcmp (p, "unread"))
1301 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1302 &cmd_str, &cmd_len);
1303 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1305 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1306 &cmd_str, &cmd_len);
1309 logf (LOG_WARN, "missing number after -offset");
1312 p = regxStrz (cmd_str, cmd_len, ptmp);
1314 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1315 &cmd_str, &cmd_len);
1321 logf (LOG_WARN, "missing index after unread command");
/* the index must be a single decimal digit */
1324 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1326 logf (LOG_WARN, "bad index after unread command");
1331 no = *cmd_str - '0';
/* continue scanning from the start of argument 'no' plus the offset */
1334 *pptr = arg_start[no] + offset;
1336 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1337 &cmd_str, &cmd_len);
/* ---- context <name>: replace the top of the context stack ---- */
1339 else if (!strcmp (p, "context"))
1343 struct lexContext *lc = spec->context;
1344 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1345 &cmd_str, &cmd_len);
1346 p = regxStrz (cmd_str, cmd_len, ptmp);
1348 while (lc && strcmp (p, lc->name))
1351 spec->context_stack[spec->context_stack_top] = lc;
1353 logf (LOG_WARN, "unknown context %s", p);
1356 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1357 &cmd_str, &cmd_len);
1361 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1362 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1363 &cmd_str, &cmd_len);
1368 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1370 r = execTok (spec, &s, arg_no, arg_start, arg_end, &cmd_str,
1380 * execAction: Execute action specified by 'ap'. Returns 0 if
1381 * the pattern(s) associated by rule and code could be executed
1382 * ok; returns 1 if code couldn't be executed.
/* Walks the action list ap for a rule whose match started at start_ptr;
 * *pptr is the current input position and is advanced as patterns match.
 * arg_start/arg_end collect the input range of each matched piece (entry 0
 * starts at start_ptr) for later "$N" references in code steps.
 *   - body pattern: tryMatch is run from *pptr; the visible assignments
 *     record both the skipped text and the match as arguments, and
 *     F_WIN_EOF positions are stored when the pattern does not match;
 *   - plain pattern: tryMatch must match at the current position
 *     (sptr is compared with arg_start[arg_no]);
 *   - code step: execCode is run with the arguments gathered so far.
 * Loop structure and return statements are not visible in this view. */
1384 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1385 data1_node **d1_stack, int *d1_level,
1386 int start_ptr, int *pptr)
1393 arg_start[0] = start_ptr;
1401 if (ap->u.pattern.body)
1403 arg_start[arg_no] = *pptr;
1404 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1406 arg_end[arg_no] = F_WIN_EOF;
1408 arg_start[arg_no] = F_WIN_EOF;
1409 arg_end[arg_no] = F_WIN_EOF;
1414 arg_end[arg_no] = sptr;
1416 arg_start[arg_no] = sptr;
1417 arg_end[arg_no] = *pptr;
1422 arg_start[arg_no] = *pptr;
1423 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1425 if (sptr != arg_start[arg_no])
1427 arg_end[arg_no] = *pptr;
1432 if (!execCode (spec, arg_no, arg_start, arg_end, pptr,
1433 ap->u.code, d1_stack, d1_level))
1437 arg_start[arg_no] = *pptr;
1438 arg_end[arg_no] = F_WIN_EOF;
/* execRule: execute rule number ruleNo of context. The rule's action list
 * is looked up through context->fastRule and handed to execAction, whose
 * result is returned. */
1447 static int execRule (struct lexSpec *spec, struct lexContext *context,
1448 data1_node **d1_stack, int *d1_level,
1449 int ruleNo, int start_ptr, int *pptr)
1452 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1454 return execAction (spec, context->fastRule[ruleNo]->actionList,
1455 d1_stack, d1_level, start_ptr, pptr);
/* lexNode: main scanning loop. Starting at *ptr, input characters are run
 * through the DFA of the context on top of the context stack. Text between
 * skip_ptr and the start of a match is emitted as data via execDataP; when
 * a rule matches it is executed with execRule, the optional f_win_ef
 * callback is invoked with the current position, and the current context is
 * re-read from the context stack, since rule code can change it.
 * Scanning stops at end of input (F_WIN_EOF). Much of the loop structure
 * and the return value are not visible in this view. */
1458 data1_node *lexNode (struct lexSpec *spec,
1459 data1_node **d1_stack, int *d1_level, int *ptr)
1461 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1462 struct DFA_state *state = context->dfa->states[0];
1465 unsigned char c_prev = '\n';
1467 int last_rule = 0; /* rule number of current match */
1468 int last_ptr = *ptr; /* last char of match */
1469 int start_ptr = *ptr; /* first char of match */
1470 int skip_ptr = *ptr; /* first char of run */
1474 c = f_win_advance (spec, ptr);
1475 if (*ptr == F_WIN_EOF)
1477 /* end of file met */
1480 /* there was a match */
1481 if (skip_ptr < start_ptr)
1483 /* deal with chars that didn't match */
1486 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1487 execDataP (spec, d1_stack, d1_level, buf, size, 0);
1489 /* restore pointer */
1492 if (!execRule (spec, context, d1_stack, d1_level,
1493 last_rule, start_ptr, ptr))
1495 /* restore skip pointer */
1499 else if (skip_ptr < *ptr)
1501 /* deal with chars that didn't match */
1504 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1505 execDataP (spec, d1_stack, d1_level, buf, size, 0);
1507 if (*ptr == F_WIN_EOF)
1514 { /* no transition for character c ... */
1517 if (skip_ptr < start_ptr)
1519 /* deal with chars that didn't match */
1522 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1523 execDataP (spec, d1_stack, d1_level, buf, size, 0);
1525 /* restore pointer */
1527 if (!execRule (spec, context, d1_stack, d1_level,
1528 last_rule, start_ptr, ptr))
/* report the current position through the optional callback */
1530 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1533 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1535 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* the rule's code may have switched context */
1539 context = spec->context_stack[spec->context_stack_top];
1542 last_ptr = start_ptr = *ptr;
1546 c_prev = f_win_advance (spec, &start_ptr);
1551 c_prev = f_win_advance (spec, &start_ptr);
1554 state = context->dfa->states[0];
1557 else if (c >= t->ch[0] && c <= t->ch[1])
1558 { /* transition ... */
1559 state = context->dfa->states[t->to];
1564 last_rule = state->rule_no;
1567 else if (state->rule_nno)
1569 last_rule = state->rule_nno;
/* lexRoot: parse one record from the input using the context named
 * context_name. The context is searched for by name in spec's context list
 * (a warning is logged if it cannot be found), installed as entry 0 of the
 * context stack, and then the context's begin actions, the main scanner
 * (lexNode) and the end actions are run in that order. The node stack is a
 * local array of 512 entries. How offset is applied and what is returned
 * are not visible in this view. */
1581 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1582 const char *context_name)
1584 struct lexContext *lt = spec->context;
1585 data1_node *d1_stack[512];
1589 spec->context_stack_top = 0;
1592 if (!strcmp (lt->name, context_name))
1598 logf (LOG_WARN, "cannot find context %s", context_name);
1601 spec->context_stack[spec->context_stack_top] = lt;
1602 d1_stack[d1_level] = NULL;
1603 if (lt->beginActionList)
1604 execAction (spec, lt->beginActionList, d1_stack, &d1_level, 0, &ptr);
1605 lexNode (spec, d1_stack, &d1_level, &ptr);
1606 if (lt->endActionList)
1607 execAction (spec, lt->endActionList, d1_stack, &d1_level, ptr, &ptr);
/* grs_read_regx: entry point of the regx record reader.
 * The filter specification is cached in curLexSpec: when none is loaded or
 * its name differs from p->type, the old one is destroyed and a new one is
 * created and loaded with readFileSpec (and destroyed again on what appears
 * to be the failure path; the condition is not visible in this view).
 * The input window is then reset and bound to the caller's read/seek/end
 * callbacks and file handle, with a window size of 500000 bytes, and the
 * record is parsed by lexRoot from p->offset in the "main" context, using
 * p->mem for node memory. */
1611 data1_node *grs_read_regx (struct grs_read_info *p)
1616 logf (LOG_DEBUG, "grs_read_regx");
1618 if (!curLexSpec || strcmp (curLexSpec->name, p->type))
1621 lexSpecDestroy (&curLexSpec);
1622 curLexSpec = lexSpecCreate (p->type);
1623 curLexSpec->dh = p->dh;
1624 res = readFileSpec (curLexSpec);
1627 lexSpecDestroy (&curLexSpec);
1631 curLexSpec->dh = p->dh;
/* start with an empty window bound to this request's I/O callbacks */
1634 curLexSpec->f_win_start = 0;
1635 curLexSpec->f_win_end = 0;
1636 curLexSpec->f_win_rf = p->readf;
1637 curLexSpec->f_win_sf = p->seekf;
1638 curLexSpec->f_win_fh = p->fh;
1639 curLexSpec->f_win_ef = p->endf;
1640 curLexSpec->f_win_size = 500000;
1642 curLexSpec->m = p->mem;
1643 return lexRoot (curLexSpec, p->offset, "main");