2 * Copyright (C) 1994-1998, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.18 1998-10-15 13:11:47 adam
8 * Added support for option -record for "end element". When specified
9 * end element will mark end-of-record when at outer-level.
11 * Revision 1.17 1998/07/01 10:13:51 adam
14 * Revision 1.16 1998/06/30 15:15:09 adam
15 * Tags are trimmed: white space is removed before and after the tag.
17 * Revision 1.15 1998/06/30 12:55:45 adam
20 * Revision 1.14 1998/03/05 08:41:00 adam
21 * Implemented rule contexts.
23 * Revision 1.13 1997/12/12 06:33:58 adam
24 * Fixed bug that showed up when multiple filters were used.
25 * Made one routine thread-safe.
27 * Revision 1.12 1997/11/18 10:03:24 adam
28 * Member num_children removed from data1_node.
30 * Revision 1.11 1997/11/06 11:41:01 adam
31 * Implemented "begin variant" for the sgml.regx filter.
33 * Revision 1.10 1997/10/31 12:36:12 adam
34 * Minor change that avoids compiler warning.
36 * Revision 1.9 1997/09/29 09:02:49 adam
37 * Fixed small bug (introduced by previous commit).
39 * Revision 1.8 1997/09/17 12:19:22 adam
40 * Zebra version corresponds to YAZ version 1.4.
41 * Changed Zebra server so that it doesn't depend on global common_resource.
43 * Revision 1.7 1997/07/15 16:33:07 adam
44 * Check for zero length in execData.
46 * Revision 1.6 1997/02/24 10:41:51 adam
47 * Cleanup of code and commented out the "end element-end-record" code.
49 * Revision 1.5 1997/02/19 16:22:33 adam
50 * Fixed "end element" to terminate record in outer-most level.
52 * Revision 1.4 1997/02/12 20:42:58 adam
53 * Changed some log messages.
55 * Revision 1.3 1996/11/08 14:05:33 adam
56 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
58 * Revision 1.2 1996/10/29 14:02:09 adam
59 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
60 * data1_get_tabpath is used.
62 * Revision 1.1 1996/10/11 10:57:30 adam
63 * New module recctrl. Used to manage records (extract/retrieval).
65 * Revision 1.24 1996/06/17 14:25:31 adam
66 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
68 * Revision 1.23 1996/06/04 10:19:00 adam
69 * Minor changes - removed include of ctype.h.
71 * Revision 1.22 1996/06/03 15:23:13 adam
72 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
74 * Revision 1.21 1996/05/14 16:58:38 adam
77 * Revision 1.20 1996/05/01 13:46:36 adam
78 * First work on multiple records in one file.
79 * New option, -offset, to the "unread" command in the filter module.
81 * Revision 1.19 1996/02/12 16:18:20 adam
82 * Yet another bug fix in implementation of unread command.
84 * Revision 1.18 1996/02/12 16:07:54 adam
85 * Bug fix in new unread command.
87 * Revision 1.17 1996/02/12 15:56:11 adam
88 * New code command: unread.
90 * Revision 1.16 1996/01/17 14:57:51 adam
91 * Prototype changed for reader functions in extract/retrieve. File
92 * is identified by 'void *' instead of 'int'.
94 * Revision 1.15 1996/01/08 19:15:47 adam
95 * New input filter that works!
97 * Revision 1.14 1996/01/08 09:10:38 adam
98 * Yet another complete rework on this module.
100 * Revision 1.13 1995/12/15 17:21:50 adam
101 * This version is able to set data.formatted_text in data1-nodes.
103 * Revision 1.12 1995/12/15 16:20:10 adam
104 * The filter files (*.flt) are read from the path given by data1_tabpath.
106 * Revision 1.11 1995/12/15 12:35:16 adam
109 * Revision 1.10 1995/12/15 10:35:36 adam
112 * Revision 1.9 1995/12/14 16:38:48 adam
113 * Completely new attempt to make regular expression parsing.
115 * Revision 1.8 1995/12/13 17:16:59 adam
118 * Revision 1.7 1995/12/13 16:51:58 adam
119 * Modified to set last_child in data1_nodes.
120 * Uses destroy handler to free up data text nodes.
122 * Revision 1.6 1995/12/13 13:45:37 quinn
123 * Changed data1 to use nmem.
125 * Revision 1.5 1995/12/11 09:12:52 adam
126 * The rec_get function returns NULL if record doesn't exist - will
127 * happen in the server if the result set records have been deleted since
128 * the creation of the set (i.e. the search).
129 * The server saves a result temporarily if it is 'volatile', i.e. the
130 * set is register dependent.
132 * Revision 1.4 1995/12/05 16:57:40 adam
133 * More work on regular patterns.
135 * Revision 1.3 1995/12/05 09:37:09 adam
136 * One malloc was renamed to xmalloc.
138 * Revision 1.2 1995/12/04 17:59:24 adam
139 * More work on regular expression conversion.
141 * Revision 1.1 1995/12/04 14:25:30 adam
142 * Started work on regular expression parsed input to structured records.
151 #include <zebrautl.h>
/* Sentinel position: the window and matching code compare file positions
   against this value to detect end of input. */
157 #define F_WIN_EOF 2000000000
/* Kind tag for a regular-expression pattern (the action's pattern/DFA data). */
161 #define REGX_PATTERN 1
/* Token kind that readOneSpec tests for to recognise a context declaration. */
166 #define REGX_CONTEXT 6
/*
 * Data structures of the regx filter.
 * NOTE(review): only part of these declarations is visible in this extract;
 * the opening lines of the structs that follow lexRuleAction are not shown.
 */
/* One step of an action list; steps are chained through 'next'. */
172 struct lexRuleAction {
176 struct DFA *dfa; /* REGX_PATTERN */
179 struct regxCode *code; /* REGX_CODE */
181 struct lexRuleAction *next;
/* head of the action list that execRule runs for a rule */
186 struct lexRuleAction *actionList;
190 struct lexRuleInfo info;
191 struct lexRule *next;
/* rules of a context; readOneSpec links each new rule at the head */
197 struct lexRule *rules;
/* table indexed by rule number; filled in by readFileSpec */
198 struct lexRuleInfo **fastRule;
/* action lists that lexRoot runs before and after scanning */
201 struct lexRuleAction *beginActionList;
202 struct lexRuleAction *endActionList;
203 struct lexContext *next;
/* linked list of all contexts known to the spec */
208 struct lexContext *context;
/* stack of active contexts; context_stack_top indexes the current entry */
210 struct lexContext **context_stack;
211 int context_stack_size;
212 int context_stack_top;
/* callback that lexNode invokes with the file handle and a position */
218 void (*f_win_ef)(void *, off_t);
220 int f_win_start; /* first byte of buffer is this file offset */
221 int f_win_end; /* last byte of buffer is this offset - 1 */
222 int f_win_size; /* size of buffer */
223 char *f_win_buf; /* buffer itself */
/* read callback: (handle, buffer, byte count); its result is used as the
   number of bytes placed in the buffer */
224 int (*f_win_rf)(void *, char *, size_t);
/* seek callback: (handle, file position) */
225 off_t (*f_win_sf)(void *, off_t);
/*
 * f_win_get: give access to the file bytes from start_pos up to (not
 * including) end_pos through the window buffer. Returns a pointer to the
 * byte at start_pos and stores the number of usable bytes in *size.
 * NOTE(review): some lines of this function are not visible in this extract.
 */
230 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
233 int i, r, off = start_pos - spec->f_win_start;
/* case 1: the whole range is already buffered */
235 if (off >= 0 && end_pos <= spec->f_win_end)
237 *size = end_pos - start_pos;
238 return spec->f_win_buf + off;
/* case 2: start_pos lies outside the window - seek there and refill */
240 if (off < 0 || start_pos >= spec->f_win_end)
242 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
243 spec->f_win_start = start_pos;
/* the buffer is allocated on first use */
245 if (!spec->f_win_buf)
246 spec->f_win_buf = xmalloc (spec->f_win_size);
247 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
249 spec->f_win_end = spec->f_win_start + *size;
/* never report more bytes than were asked for */
251 if (*size > end_pos - start_pos)
252 *size = end_pos - start_pos;
253 return spec->f_win_buf;
/* case 3: start_pos is inside the window but the range runs past its end -
   move the still valid tail to the front of the buffer, then read more */
255 for (i = 0; i<spec->f_win_end - start_pos; i++)
256 spec->f_win_buf[i] = spec->f_win_buf[i + off];
257 r = (*spec->f_win_rf)(spec->f_win_fh,
259 spec->f_win_size - i);
260 spec->f_win_start = start_pos;
261 spec->f_win_end += r;
263 if (*size > end_pos - start_pos)
264 *size = end_pos - start_pos;
265 return spec->f_win_buf;
/*
 * f_win_advance: read the byte at file position *pos. When the position is
 * inside the current window the byte is returned directly and *pos is
 * advanced by one; otherwise the byte is requested through f_win_get.
 * NOTE(review): the handling of *pos == F_WIN_EOF and the tail of the
 * function are not visible in this extract.
 */
268 static int f_win_advance (struct lexSpec *spec, int *pos)
273 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
274 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
275 if (*pos == F_WIN_EOF)
/* ask for exactly one byte starting at *pos */
277 buf = f_win_get (spec, *pos, *pos+1, &size);
/*
 * regxCodeDel: works on the code object referenced through *pp.
 * NOTE(review): presumably releases it (going by the name and by its use in
 * actionListDel); the body is not visible in this extract - confirm.
 */
287 static void regxCodeDel (struct regxCode **pp)
289 struct regxCode *p = *pp;
/*
 * regxCodeMk: allocate a regxCode and give it a private copy of the len
 * bytes at buf (len+1 bytes are allocated for the copy).
 * NOTE(review): the terminating-byte write and the store to *pp are not
 * visible in this extract.
 */
298 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
302 p = xmalloc (sizeof(*p));
303 p->str = xmalloc (len+1);
304 memcpy (p->str, buf, len);
/*
 * lexSpecDFA: prepare a DFA object for filter patterns. The parser's
 * character map is adjusted: the entries for ' ' and '\t' are deleted and an
 * entry for '/' with value 0 is added.
 * NOTE(review): creation and return of the DFA are not visible here.
 */
309 static struct DFA *lexSpecDFA (void)
314 dfa_parse_cmap_del (dfa, ' ');
315 dfa_parse_cmap_del (dfa, '\t');
316 dfa_parse_cmap_add (dfa, '/', 0);
/*
 * actionListDel: walk the action list starting at *rap and release what each
 * entry owns: the DFA of a pattern entry (dfa_delete) or the code of a code
 * entry (regxCodeDel).
 * NOTE(review): the per-kind dispatch and the freeing of the entries
 * themselves are not visible in this extract.
 */
320 static void actionListDel (struct lexRuleAction **rap)
322 struct lexRuleAction *ra1, *ra;
/* ra1 carries the successor so the loop can step past the current entry */
324 for (ra = *rap; ra; ra = ra1)
330 dfa_delete (&ra->u.pattern.dfa);
333 regxCodeDel (&ra->u.code);
/*
 * lexContextCreate: allocate a context with its own copy of name, a fresh
 * DFA from lexSpecDFA and empty begin/end action lists.
 * NOTE(review): initialisation of the remaining members is not visible here.
 */
341 static struct lexContext *lexContextCreate (const char *name)
343 struct lexContext *p = xmalloc (sizeof(*p));
345 p->name = xstrdup (name);
347 p->dfa = lexSpecDFA ();
350 p->beginActionList = NULL;
351 p->endActionList = NULL;
/*
 * lexContextDestroy: release a context: the action list of every rule, then
 * the context's begin and end action lists.
 * NOTE(review): freeing of the rule nodes, the DFA, the name and the context
 * itself is not visible in this extract.
 */
356 static void lexContextDestroy (struct lexContext *p)
358 struct lexRule *rp, *rp1;
/* rp1 carries the successor so the loop can step past the current rule */
361 for (rp = p->rules; rp; rp = rp1)
364 actionListDel (&rp->info.actionList);
367 actionListDel (&p->beginActionList);
368 actionListDel (&p->endActionList);
/*
 * lexSpecCreate: allocate a filter specification, store a private copy of
 * name and allocate a context stack with room for 100 entries.
 * NOTE(review): initialisation of the other members is not visible here.
 */
373 static struct lexSpec *lexSpecCreate (const char *name)
377 p = xmalloc (sizeof(*p));
378 p->name = xmalloc (strlen(name)+1);
379 strcpy (p->name, name);
382 p->context_stack_size = 100;
383 p->context_stack = xmalloc (sizeof(*p->context_stack) *
384 p->context_stack_size);
/*
 * lexSpecDestroy: tear down the specification referenced by *pp: every
 * context is destroyed, then the window buffer and the context stack are
 * freed.
 * NOTE(review): the NULL check on *pp, the freeing of the spec itself and
 * the reset of *pp are not visible in this extract.
 */
389 static void lexSpecDestroy (struct lexSpec **pp)
392 struct lexContext *lt;
/* remember the successor before the context is destroyed */
401 struct lexContext *lt_next = lt->next;
402 lexContextDestroy (lt);
406 xfree (p->f_win_buf);
407 xfree (p->context_stack);
/*
 * readParseToken: scan the next token of a filter specification starting at
 * *cpp. Leading blanks, tabs and newlines are skipped. Command words are
 * collected in lower case in a local buffer and matched against the keywords
 * "begin", "end", "body" and "context"; anything else is reported as a bad
 * command.
 * NOTE(review): the returned token codes, the update of *cpp / *len and the
 * other token kinds are not visible in this extract.
 */
412 static int readParseToken (const char **cpp, int *len)
414 const char *cp = *cpp;
418 while (*cp == ' ' || *cp == '\t' || *cp == '\n')
/* fold upper-case command letters to lower case */
447 if (*cp >= 'a' && *cp <= 'z')
449 else if (*cp >= 'A' && *cp <= 'Z')
450 cmd[i] = *cp + 'a' - 'A';
/* guard on the index into the command buffer */
453 if (i < sizeof(cmd)-2)
460 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
/* skip to the next blank, tab, newline or end of string */
462 while (*cp && *cp != ' ' && *cp != '\t' && *cp != '\n')
468 if (!strcmp (cmd, "begin"))
470 else if (!strcmp (cmd, "end"))
472 else if (!strcmp (cmd, "body"))
474 else if (!strcmp (cmd, "context"))
478 logf (LOG_WARN, "bad command %s", cmd);
/*
 * actionListMk: parse the action part of a rule from s and build the action
 * list through *ap. Tokens are taken from readParseToken until it returns 0.
 * Code tokens get an entry holding a copy of the code text; pattern tokens
 * get an entry with their own DFA, which is parsed from s and then finalised
 * with dfa_mkstate. A "begin" token is rejected with a warning.
 * NOTE(review): the switch on the token kind, the list linking and the
 * return value are not visible in this extract.
 */
484 static int actionListMk (struct lexSpec *spec, const char *s,
485 struct lexRuleAction **ap)
491 while ((tok = readParseToken (&s, &len)))
/* code entry */
499 *ap = xmalloc (sizeof(**ap));
501 regxCodeMk (&(*ap)->u.code, s, len);
/* pattern entry; bodyMark records whether a body keyword preceded it -
   presumably, its assignment is not visible here */
505 *ap = xmalloc (sizeof(**ap));
507 (*ap)->u.pattern.body = bodyMark;
509 (*ap)->u.pattern.dfa = lexSpecDFA ();
511 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
516 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
519 dfa_mkstate ((*ap)->u.pattern.dfa);
523 logf (LOG_WARN, "cannot use begin here");
526 *ap = xmalloc (sizeof(**ap));
/*
 * readOneSpec: process one logical line s of a filter file. The visible
 * lines handle:
 *  - a context declaration: the following token must be a REGX_CODE token
 *    carrying the name; a new context is created and linked at the head of
 *    spec->context;
 *  - begin / end lines: the context's begin or end action list is replaced;
 *  - a rule: the pattern is parsed into the context DFA, must be closed by
 *    '/', and a new numbered rule with its action list is linked at the head
 *    of the context's rule list.
 * NOTE(review): the branching between these cases and the return value are
 * not visible in this extract.
 */
536 int readOneSpec (struct lexSpec *spec, const char *s)
540 struct lexContext *lc;
542 tok = readParseToken (&s, &len);
543 if (tok == REGX_CONTEXT)
545 char context_name[32];
546 tok = readParseToken (&s, &len);
547 if (tok != REGX_CODE)
549 logf (LOG_WARN, "missing name after CONTEXT keyword");
/* NOTE(review): no check of len against sizeof(context_name) is visible
   here - confirm that a long name cannot overflow the buffer */
554 memcpy (context_name, s, len);
555 context_name[len] = '\0';
556 lc = lexContextCreate (context_name);
557 lc->next = spec->context;
/* default context; presumably created only when none exists yet */
562 spec->context = lexContextCreate ("main");
/* drop a previously defined list before building the new one */
567 actionListDel (&spec->context->beginActionList);
568 actionListMk (spec, s, &spec->context->beginActionList);
571 actionListDel (&spec->context->endActionList);
572 actionListMk (spec, s, &spec->context->endActionList);
576 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
578 r = dfa_parse (spec->context->dfa, &s);
581 logf (LOG_WARN, "regular expression error. r=%d", r);
586 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* new rule: takes the next rule number and goes to the head of the list */
590 rp = xmalloc (sizeof(*rp));
591 rp->info.no = spec->context->ruleNo++;
592 rp->next = spec->context->rules;
593 spec->context->rules = rp;
594 actionListMk (spec, s, &rp->info.actionList);
/*
 * readFileSpec: load the filter file "<spec->name>.flt", opened through
 * yaz_path_fopen with the data1 table path, and feed each assembled line to
 * readOneSpec. Afterwards every context gets its fastRule table (indexed by
 * rule number) and its DFA is finalised with dfa_mkstate.
 * NOTE(review): the line-assembly loop, error accounting and return value
 * are only partly visible in this extract.
 */
599 int readFileSpec (struct lexSpec *spec)
601 struct lexContext *lc;
604 int c, i, errors = 0;
607 lineBuf = xmalloc (1+lineSize);
608 logf (LOG_LOG, "reading regx filter %s.flt", spec->name);
/* NOTE(review): sprintf is unbounded - confirm that the name always fits
   in lineBuf */
609 sprintf (lineBuf, "%s.flt", spec->name);
610 if (!(spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh),
613 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
/* lines starting with '#', newline, blank or tab: the rest of the line is
   consumed */
622 if (c == '#' || c == '\n' || c == ' ' || c == '\t')
624 while (c != '\n' && c != EOF)
/* presumably: a following line that starts with a blank or tab continues
   the current one - confirm */
643 if (c != ' ' && c != '\t')
652 readOneSpec (spec, lineBuf);
653 spec->lineNo += addLine;
662 debug_dfa_followpos = 1;
/* build the rule-number lookup table of every context */
665 for (lc = spec->context; lc; lc = lc->next)
668 lc->fastRule = xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
669 for (i = 0; i < lc->ruleNo; i++)
670 lc->fastRule[i] = NULL;
671 for (rp = lc->rules; rp; rp = rp->next)
672 lc->fastRule[rp->info.no] = &rp->info;
673 dfa_mkstate (lc->dfa);
/* Specification loaded by the latest grs_read_regx call; it is reused as
   long as the requested type name equals its name. */
680 static struct lexSpec *curLexSpec = NULL;
/*
 * destroy_data: destroy handler for data nodes; frees the node's data
 * buffer. execData installs it on nodes whose text was moved to an
 * xmalloc'ed buffer.
 */
682 static void destroy_data (struct data1_node *n)
684 assert (n->which == DATA1N_data);
685 xfree (n->u.data.data);
/*
 * execData: add elen bytes of text at ebuf to the record tree at the current
 * stack level. If the node in the current slot is already a data node the
 * text is appended to it; otherwise a new data node is created below the
 * parent (d1_stack[*d1_level - 1]) and becomes the current node of the
 * level.
 * NOTE(review): some lines of this function are not visible in this extract.
 */
688 static void execData (struct lexSpec *spec,
689 data1_node **d1_stack, int *d1_level,
690 const char *ebuf, int elen, int formatted_text)
692 struct data1_node *res, *parent;
694 if (elen == 0) /* shouldn't happen, but it does! */
/* debug output: long data is shown as its first and last 15 bytes */
698 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
699 ebuf, 15, ebuf + elen-15);
701 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
703 logf (LOG_DEBUG, "data (%d bytes)", elen);
709 parent = d1_stack[*d1_level -1];
/* append to an existing data node */
711 if ((res=d1_stack[*d1_level]) && res->which == DATA1N_data)
/* combined text still fits within DATA1_LOCALDATA bytes: copy in place */
713 if (elen + res->u.data.len <= DATA1_LOCALDATA)
714 memcpy (res->u.data.data + res->u.data.len, ebuf, elen);
/* otherwise build a new xmalloc'ed buffer holding old and new text;
   destroy_data is installed to free it later */
717 char *nb = xmalloc (elen + res->u.data.len);
718 memcpy (nb, res->u.data.data, res->u.data.len);
719 memcpy (nb + res->u.data.len, ebuf, elen);
720 res->u.data.data = nb;
721 res->destroy = destroy_data;
723 res->u.data.len += elen;
/* create a new text data node */
727 res = data1_mk_node (spec->dh, spec->m);
728 res->parent = parent;
729 res->which = DATA1N_data;
730 res->u.data.what = DATA1I_text;
731 res->u.data.len = elen;
732 res->u.data.formatted_text = formatted_text;
/* text longer than DATA1_LOCALDATA goes to nmem memory, shorter text into
   the node's own lbuf */
733 if (elen > DATA1_LOCALDATA)
734 res->u.data.data = nmem_malloc (spec->m, elen);
736 res->u.data.data = res->lbuf;
737 memcpy (res->u.data.data, ebuf, elen);
738 res->root = parent->root;
/* link the node as the parent's last child and after its previous sibling */
740 parent->last_child = res;
741 if (d1_stack[*d1_level])
742 d1_stack[*d1_level]->next = res;
745 d1_stack[*d1_level] = res;
/*
 * execDataP: entry point used by lexNode for unmatched text; in the visible
 * lines it passes all arguments unchanged to execData.
 */
749 static void execDataP (struct lexSpec *spec,
750 data1_node **d1_stack, int *d1_level,
751 const char *ebuf, int elen, int formatted_text)
753 execData (spec, d1_stack, d1_level, ebuf, elen, formatted_text);
/*
 * variantBegin: open a variant in the record tree. The class and type names
 * are copied to local buffers and used to look up the variant type with
 * data1_getvartypebyct. If the parent is not a variant node, an empty
 * variant node (type and value 0) is inserted first. Then a variant node
 * carrying the type and a copy of the value is created, linked and pushed on
 * the stack.
 * NOTE(review): several lines (error handling, the result of the backward
 * scan over variant nodes) are not visible in this extract.
 */
756 static void variantBegin (struct lexSpec *spec,
757 data1_node **d1_stack, int *d1_level,
758 const char *class_str, int class_len,
759 const char *type_str, int type_len,
760 const char *value_str, int value_len)
762 struct data1_node *parent = d1_stack[*d1_level -1];
763 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
770 logf (LOG_WARN, "in variant begin. No record type defined");
/* names are truncated to fit the local buffers */
773 if (class_len >= DATA1_MAX_SYMBOL)
774 class_len = DATA1_MAX_SYMBOL-1;
775 memcpy (tclass, class_str, class_len);
776 tclass[class_len] = '\0';
778 if (type_len >= DATA1_MAX_SYMBOL)
779 type_len = DATA1_MAX_SYMBOL-1;
780 memcpy (ttype, type_str, type_len);
781 ttype[type_len] = '\0';
784 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype, *d1_level);
788 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* parent is not a variant: insert an empty variant node and descend */
792 if (parent->which != DATA1N_variant)
794 res = data1_mk_node (spec->dh, spec->m);
795 res->parent = parent;
796 res->which = DATA1N_variant;
797 res->u.variant.type = 0;
798 res->u.variant.value = 0;
799 res->root = parent->root;
801 parent->last_child = res;
802 if (d1_stack[*d1_level])
803 d1_stack[*d1_level]->next = res;
806 d1_stack[*d1_level] = res;
807 d1_stack[++(*d1_level)] = NULL;
/* scan back over the enclosing variant nodes for one of the same type */
809 for (i = *d1_level-1; d1_stack[i]->which == DATA1N_variant; i--)
810 if (d1_stack[i]->u.variant.type == tp)
817 logf (LOG_DEBUG, "variant node (%d)", *d1_level);
/* create the variant node proper */
819 parent = d1_stack[*d1_level-1];
820 res = data1_mk_node (spec->dh, spec->m);
821 res->parent = parent;
822 res->which = DATA1N_variant;
823 res->root = parent->root;
824 res->u.variant.type = tp;
/* the value is truncated to fit the node's lbuf */
826 if (value_len >= DATA1_LOCALDATA)
827 value_len =DATA1_LOCALDATA-1;
828 memcpy (res->lbuf, value_str, value_len);
829 res->lbuf[value_len] = '\0';
831 res->u.variant.value = res->lbuf;
833 parent->last_child = res;
834 if (d1_stack[*d1_level])
835 d1_stack[*d1_level]->next = res;
838 d1_stack[*d1_level] = res;
839 d1_stack[++(*d1_level)] = NULL;
/*
 * tagStrip: scan the tag for white space at its end and at its start.
 * NOTE(review): the statements that adjust *tag and *len are not visible in
 * this extract. isspace is called with a plain char here; a cast to unsigned
 * char would be the safe form - confirm against the full source.
 */
842 static void tagStrip (const char **tag, int *len)
/* trailing white space */
846 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
/* leading white space */
849 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/*
 * tagBegin: open an element. The tag name is stripped, copied into the lbuf
 * of a new tag node (truncated to fit), the matching element definition is
 * looked up with data1_getelementbytagname, and the node is linked below
 * the parent and pushed on the stack.
 * NOTE(review): some lines (error return, arguments of the lookup) are not
 * visible in this extract.
 */
855 static void tagBegin (struct lexSpec *spec,
856 data1_node **d1_stack, int *d1_level,
857 const char *tag, int len)
859 struct data1_node *parent = d1_stack[*d1_level -1];
860 data1_element *elem = NULL;
861 data1_node *partag = get_parent_tag(spec->dh, parent);
863 data1_element *e = NULL;
868 logf (LOG_WARN, "in element begin. No record type defined");
871 tagStrip (&tag, &len);
873 res = data1_mk_node (spec->dh, spec->m);
874 res->parent = parent;
875 res->which = DATA1N_tag;
876 res->u.tag.get_bytes = -1;
/* the tag name is truncated to fit the node's lbuf */
878 if (len >= DATA1_LOCALDATA)
879 len = DATA1_LOCALDATA-1;
880 memcpy (res->lbuf, tag, len);
881 res->lbuf[len] = '\0';
882 res->u.tag.tag = res->lbuf;
885 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, *d1_level);
887 if (parent->which == DATA1N_variant)
/* e: element definition of the enclosing tag, if there is one */
890 if (!(e = partag->u.tag.element))
893 elem = data1_getelementbytagname (spec->dh, d1_stack[0]->u.root.absyn,
895 res->u.tag.element = elem;
896 res->u.tag.node_selected = 0;
897 res->u.tag.make_variantlist = 0;
898 res->u.tag.no_data_requested = 0;
899 res->root = parent->root;
/* link as last child / next sibling, then open a new, empty level */
901 parent->last_child = res;
902 if (d1_stack[*d1_level])
903 d1_stack[*d1_level]->next = res;
906 d1_stack[*d1_level] = res;
907 d1_stack[++(*d1_level)] = NULL;
/*
 * tagEnd: close elements. After stripping the tag name the function loops
 * while *d1_level is above min_level and tests whether the node at that
 * level is a tag node whose name has length len and the same bytes as tag.
 * NOTE(review): the statements that change *d1_level, the first half of the
 * match condition and the loop exit are not visible in this extract.
 */
910 static void tagEnd (struct lexSpec *spec,
911 data1_node **d1_stack, int *d1_level, int min_level,
912 const char *tag, int len)
914 tagStrip (&tag, &len);
915 while (*d1_level > min_level)
920 if ((d1_stack[*d1_level]->which == DATA1N_tag) &&
922 (strlen(d1_stack[*d1_level]->u.tag.tag) == (size_t) len &&
923 !memcmp (d1_stack[*d1_level]->u.tag.tag, tag, len))))
927 logf (LOG_DEBUG, "end tag (%d)", *d1_level);
/*
 * tryMatch: run the given DFA over the input starting at *pptr, reading
 * bytes with f_win_advance. When a match is reported, *mptr receives the
 * position where it starts and *pptr the position just after it.
 * NOTE(review): the loop structure, the transition search and the return
 * values are only partly visible in this extract.
 */
932 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
935 struct DFA_state *state = dfa->states[0];
938 unsigned char c_prev = 0;
939 int ptr = *pptr; /* current pointer */
940 int start_ptr = *pptr; /* first char of match */
941 int last_ptr = 0; /* last char of match */
942 int last_rule = 0; /* rule number of current match */
947 c = f_win_advance (spec, &ptr);
948 if (ptr == F_WIN_EOF)
965 *mptr = start_ptr; /* match starts here */
966 *pptr = last_ptr; /* match end here (+1) */
/* restart from the DFA's initial state */
969 state = dfa->states[0];
/* c falls in the character range of transition t: follow it */
974 else if (c >= t->ch[0] && c <= t->ch[1])
976 state = dfa->states[t->to];
/* remember the rule reported by the new state */
981 last_rule = state->rule_no;
986 last_rule = state->rule_nno;
/*
 * execTok: fetch the next token of an action code string at *src and report
 * its text through *tokBuf / *tokLen. The visible token forms are:
 *  - $N : the input text of match argument N, fetched with f_win_get;
 *  - a double-quoted string;
 *  - newline or ';' (statement separator);
 *  - a word that ends at blank, tab, newline or ';'.
 * NOTE(review): the return codes are not visible in this extract; callers in
 * execCode compare the result against 3.
 */
998 static int execTok (struct lexSpec *spec, const char **src,
999 int arg_no, int *arg_start, int *arg_end,
1000 const char **tokBuf, int *tokLen)
1002 const char *s = *src;
/* skip leading blanks and tabs */
1004 while (*s == ' ' || *s == '\t')
1008 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
/* decimal argument number */
1012 while (*s >= '0' && *s <= '9')
1013 n = n*10 + (*s++ -'0');
1023 *tokBuf = f_win_get (spec, arg_start[n], arg_end[n], tokLen);
1026 else if (*s == '\"')
1029 while (*s && *s != '\"')
1031 *tokLen = s - *tokBuf;
1036 else if (*s == '\n' || *s == ';')
1044 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1046 *tokLen = s - *tokBuf;
1053 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1055 *tokLen = s - *tokBuf;
/*
 * regxStrz: copy len bytes from src into the caller's buffer str.
 * NOTE(review): length limiting, the terminating byte and the return value
 * are not visible in this extract - presumably returns str as a C string.
 */
1061 static char *regxStrz (const char *src, int len, char *str)
1065 memcpy (str, src, len);
/*
 * execCode: interpret the code string of an action. Tokens are read with
 * execTok and the command word selects the operation. Commands in the
 * visible lines:
 *   begin record <name> | begin element <tag> |
 *   begin variant <class> <type> <value> | begin context <name>
 *   end record | end element [-record] [<tag>] | end context
 *   data [-text] [-element <tag>] <item>
 *   unread [-offset <n>] <index>
 *   context <name>
 * Unknown commands and stray tokens are reported with a warning.
 * NOTE(review): many lines of this function (loop structure, checks on the
 * execTok results, return values) are not visible in this extract.
 */
1070 static int execCode (struct lexSpec *spec,
1071 int arg_no, int *arg_start, int *arg_end, int *pptr,
1072 struct regxCode *code,
1073 data1_node **d1_stack, int *d1_level)
1075 const char *s = code->str;
1078 const char *cmd_str;
1080 r = execTok (spec, &s, arg_no, arg_start, arg_end, &cmd_str, &cmd_len);
1087 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1088 &cmd_str, &cmd_len);
1091 p = regxStrz (cmd_str, cmd_len, ptmp);
/* ---- begin ... ---- */
1092 if (!strcmp (p, "begin"))
1094 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1095 &cmd_str, &cmd_len);
1098 logf (LOG_WARN, "missing keyword after 'begin'");
1101 p = regxStrz (cmd_str, cmd_len, ptmp);
/* begin record: create the root node for the named abstract syntax */
1102 if (!strcmp (p, "record"))
1104 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1105 &cmd_str, &cmd_len);
/* static: the root node's type member points at this buffer.
   NOTE(review): no check of cmd_len against sizeof(absynName) is visible
   here - confirm */
1110 static char absynName[64];
1115 memcpy (absynName, cmd_str, cmd_len);
1116 absynName[cmd_len] = '\0';
1119 logf (LOG_DEBUG, "begin record %s", absynName);
1121 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1122 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1127 res = data1_mk_node (spec->dh, spec->m);
1128 res->which = DATA1N_root;
1129 res->u.root.type = absynName;
1130 res->u.root.absyn = absyn;
1133 d1_stack[*d1_level] = res;
1134 d1_stack[++(*d1_level)] = NULL;
1137 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1138 &cmd_str, &cmd_len);
/* begin element <tag> */
1140 else if (!strcmp (p, "element"))
1142 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1143 &cmd_str, &cmd_len);
1146 tagBegin (spec, d1_stack, d1_level, cmd_str, cmd_len);
1147 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1148 &cmd_str, &cmd_len);
/* begin variant <class> <type> <value> */
1150 else if (!strcmp (p, "variant"))
1153 const char *class_str = NULL;
1155 const char *type_str = NULL;
1157 const char *value_str = NULL;
1158 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1159 &cmd_str, &cmd_len);
1162 class_str = cmd_str;
1163 class_len = cmd_len;
1164 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1165 &cmd_str, &cmd_len);
1171 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1172 &cmd_str, &cmd_len);
1175 value_str = cmd_str;
1176 value_len = cmd_len;
1178 variantBegin (spec, d1_stack, d1_level, class_str, class_len,
1179 type_str, type_len, value_str, value_len);
1182 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1183 &cmd_str, &cmd_len);
/* begin context <name>: push the named context on the context stack */
1185 else if (!strcmp (p, "context"))
1189 struct lexContext *lc = spec->context;
1190 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1191 &cmd_str, &cmd_len);
1192 p = regxStrz (cmd_str, cmd_len, ptmp);
1194 logf (LOG_DEBUG, "begin context %s", p);
/* search the context list by name */
1196 while (lc && strcmp (p, lc->name))
/* NOTE(review): no check of context_stack_top against context_stack_size
   is visible here - confirm */
1199 spec->context_stack[++(spec->context_stack_top)] = lc;
1201 logf (LOG_WARN, "unknown context %s", p);
1204 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1205 &cmd_str, &cmd_len);
1209 logf (LOG_WARN, "bad keyword '%s' after begin", p);
/* ---- end ... ---- */
1212 else if (!strcmp (p, "end"))
1214 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1215 &cmd_str, &cmd_len);
1218 logf (LOG_WARN, "missing keyword after 'end'");
1221 p = regxStrz (cmd_str, cmd_len, ptmp);
1222 if (!strcmp (p, "record"))
1225 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1226 &cmd_str, &cmd_len);
1228 logf (LOG_DEBUG, "end record");
/* end element: options first (while execTok yields 3), then an optional tag */
1232 else if (!strcmp (p, "element"))
1235 while ((r = execTok (spec, &s, arg_no, arg_start, arg_end,
1236 &cmd_str, &cmd_len)) == 3)
1238 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1243 tagEnd (spec, d1_stack, d1_level, min_level,
1245 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1246 &cmd_str, &cmd_len);
/* no tag given */
1249 tagEnd (spec, d1_stack, d1_level, min_level, NULL, 0);
1253 logf (LOG_DEBUG, "end element end records");
/* end context: pop the context stack unless it is at its bottom entry */
1259 else if (!strcmp (p, "context"))
1262 logf (LOG_DEBUG, "end context");
1264 if (spec->context_stack_top)
1265 (spec->context_stack_top)--;
1266 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1267 &cmd_str, &cmd_len);
1270 logf (LOG_WARN, "bad keyword '%s' after end", p);
/* ---- data [-text] [-element <tag>] <item> ---- */
1272 else if (!strcmp (p, "data"))
1276 const char *element_str = NULL;
1278 while ((r = execTok (spec, &s, arg_no, arg_start, arg_end,
1279 &cmd_str, &cmd_len)) == 3)
1281 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1283 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1285 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1286 &element_str, &element_len);
1291 logf (LOG_WARN, "bad data option: %.*s",
1296 logf (LOG_WARN, "missing data item after data");
/* with -element the data is wrapped in an element of that name -
   presumably only when element_str is set; the test is not visible here */
1300 tagBegin (spec, d1_stack, d1_level, element_str, element_len);
1303 execData (spec, d1_stack, d1_level, cmd_str, cmd_len,
1305 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1306 &cmd_str, &cmd_len);
1309 tagEnd (spec, d1_stack, d1_level, 1, NULL, 0);
/* ---- unread [-offset <n>] <index>: move the input position back to the
   start of match argument <index>, plus the offset ---- */
1311 else if (!strcmp (p, "unread"))
1314 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1315 &cmd_str, &cmd_len);
1316 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1318 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1319 &cmd_str, &cmd_len);
1322 logf (LOG_WARN, "missing number after -offset");
1325 p = regxStrz (cmd_str, cmd_len, ptmp);
1327 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1328 &cmd_str, &cmd_len);
1334 logf (LOG_WARN, "missing index after unread command");
/* the index must be a single decimal digit */
1337 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1339 logf (LOG_WARN, "bad index after unread command");
1344 no = *cmd_str - '0';
1347 *pptr = arg_start[no] + offset;
1349 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1350 &cmd_str, &cmd_len);
/* ---- context <name>: replace the current top of the context stack ---- */
1352 else if (!strcmp (p, "context"))
1356 struct lexContext *lc = spec->context;
1357 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1358 &cmd_str, &cmd_len);
1359 p = regxStrz (cmd_str, cmd_len, ptmp);
1361 while (lc && strcmp (p, lc->name))
1364 spec->context_stack[spec->context_stack_top] = lc;
1366 logf (LOG_WARN, "unknown context %s", p);
1369 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1370 &cmd_str, &cmd_len);
1374 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1375 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1376 &cmd_str, &cmd_len);
1381 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1383 r = execTok (spec, &s, arg_no, arg_start, arg_end, &cmd_str,
1393 * execAction: Execute action specified by 'ap'. Returns 0 if
1394 * the pattern(s) associated by rule and code could be executed
1395 * ok; returns 1 if code couldn't be executed.
/*
 * Walks an action list. Argument 0 starts at start_ptr; each pattern entry
 * is matched against the input with tryMatch and records the position range
 * of a further argument; each code entry is run with execCode, which gets
 * the ranges collected so far.
 * NOTE(review): the loop, the switch on the entry kind and the return
 * statements are not visible in this extract.
 */
1397 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1398 data1_node **d1_stack, int *d1_level,
1399 int start_ptr, int *pptr)
1406 arg_start[0] = start_ptr;
/* body pattern: the text before the match is an argument of its own */
1414 if (ap->u.pattern.body)
1416 arg_start[arg_no] = *pptr;
1417 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
/* no match: the body argument extends to end of input and the argument
   for the pattern itself is empty at end of input */
1419 arg_end[arg_no] = F_WIN_EOF;
1421 arg_start[arg_no] = F_WIN_EOF;
1422 arg_end[arg_no] = F_WIN_EOF;
/* match: the body ends where the match starts; the match is the next
   argument */
1427 arg_end[arg_no] = sptr;
1429 arg_start[arg_no] = sptr;
1430 arg_end[arg_no] = *pptr;
/* plain pattern: checked against a match start that differs from the
   current position */
1435 arg_start[arg_no] = *pptr;
1436 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1438 if (sptr != arg_start[arg_no])
1440 arg_end[arg_no] = *pptr;
1445 if (!execCode (spec, arg_no, arg_start, arg_end, pptr,
1446 ap->u.code, d1_stack, d1_level))
/* argument that runs from the current position to end of input */
1450 arg_start[arg_no] = *pptr;
1451 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: run the action list of rule ruleNo of the given context; the
 * rule is found through the context's fastRule table. Returns the result of
 * execAction.
 */
1460 static int execRule (struct lexSpec *spec, struct lexContext *context,
1461 data1_node **d1_stack, int *d1_level,
1462 int ruleNo, int start_ptr, int *pptr)
1465 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1467 return execAction (spec, context->fastRule[ruleNo]->actionList,
1468 d1_stack, d1_level, start_ptr, pptr);
/*
 * lexNode: main scanning loop. Input bytes are read with f_win_advance and
 * run through the DFA of the context on top of the context stack. Text that
 * no rule matched is passed on with execDataP; when a rule matched, its
 * actions are run with execRule. After a rule has run, the current context
 * is re-read from the context stack and scanning restarts from the DFA's
 * initial state.
 * NOTE(review): loop structure, several conditions and the return values
 * are not visible in this extract.
 */
1471 data1_node *lexNode (struct lexSpec *spec,
1472 data1_node **d1_stack, int *d1_level, int *ptr)
1474 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1475 struct DFA_state *state = context->dfa->states[0];
1478 unsigned char c_prev = '\n';
1480 int last_rule = 0; /* rule number of current match */
1481 int last_ptr = *ptr; /* last char of match */
1482 int start_ptr = *ptr; /* first char of match */
1483 int skip_ptr = *ptr; /* first char of run */
1487 c = f_win_advance (spec, ptr);
1488 if (*ptr == F_WIN_EOF)
1490 /* end of file met */
1493 /* there was a match */
1494 if (skip_ptr < start_ptr)
1496 /* deal with chars that didn't match */
1499 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1500 execDataP (spec, d1_stack, d1_level, buf, size, 0);
1502 /* restore pointer */
1505 if (!execRule (spec, context, d1_stack, d1_level,
1506 last_rule, start_ptr, ptr))
1508 /* restore skip pointer */
1512 else if (skip_ptr < *ptr)
1514 /* deal with chars that didn't match */
1517 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1518 execDataP (spec, d1_stack, d1_level, buf, size, 0);
1520 if (*ptr == F_WIN_EOF)
1527 { /* no transition for character c ... */
1530 if (skip_ptr < start_ptr)
1532 /* deal with chars that didn't match */
1535 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1536 execDataP (spec, d1_stack, d1_level, buf, size, 0);
1538 /* restore pointer */
1540 if (!execRule (spec, context, d1_stack, d1_level,
1541 last_rule, start_ptr, ptr))
/* call the end-of-record callback, if set, with the current position */
1543 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1546 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1548 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* the rule's code may have switched context */
1552 context = spec->context_stack[spec->context_stack_top];
1555 last_ptr = start_ptr = *ptr;
1559 c_prev = f_win_advance (spec, &start_ptr);
1564 c_prev = f_win_advance (spec, &start_ptr);
1567 state = context->dfa->states[0];
1570 else if (c >= t->ch[0] && c <= t->ch[1])
1571 { /* transition ... */
1572 state = context->dfa->states[t->to];
1577 last_rule = state->rule_no;
1580 else if (state->rule_nno)
1582 last_rule = state->rule_nno;
/*
 * lexRoot: parse one record. The context named context_name is looked up in
 * the spec's context list (a warning is logged when it cannot be found) and
 * made the only entry of the context stack. Then the context's begin
 * actions, the scanner (lexNode) and the context's end actions are run in
 * that order on a local node stack of 512 entries.
 * NOTE(review): initialisation of ptr / d1_level from offset and the return
 * value are not visible in this extract.
 */
1594 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1595 const char *context_name)
1597 struct lexContext *lt = spec->context;
1598 data1_node *d1_stack[512];
1602 spec->context_stack_top = 0;
1605 if (!strcmp (lt->name, context_name))
1611 logf (LOG_WARN, "cannot find context %s", context_name);
1614 spec->context_stack[spec->context_stack_top] = lt;
1615 d1_stack[d1_level] = NULL;
1616 if (lt->beginActionList)
1617 execAction (spec, lt->beginActionList, d1_stack, &d1_level, 0, &ptr);
1618 lexNode (spec, d1_stack, &d1_level, &ptr);
1619 if (lt->endActionList)
1620 execAction (spec, lt->endActionList, d1_stack, &d1_level, ptr, &ptr);
/*
 * grs_read_regx: record reader entry point of the regx filter. The filter
 * specification is (re)loaded when none is cached or when the cached one
 * has a different name than p->type. The input window is then reset and
 * bound to the caller's read/seek/end callbacks and file handle, and the
 * record is parsed with lexRoot starting in context "main" at p->offset.
 * NOTE(review): the error path after readFileSpec is only partly visible in
 * this extract.
 */
1624 data1_node *grs_read_regx (struct grs_read_info *p)
1629 logf (LOG_DEBUG, "grs_read_regx");
1631 if (!curLexSpec || strcmp (curLexSpec->name, p->type))
1634 lexSpecDestroy (&curLexSpec);
1635 curLexSpec = lexSpecCreate (p->type);
1636 curLexSpec->dh = p->dh;
1637 res = readFileSpec (curLexSpec);
1640 lexSpecDestroy (&curLexSpec);
1644 curLexSpec->dh = p->dh;
/* start with an empty window */
1647 curLexSpec->f_win_start = 0;
1648 curLexSpec->f_win_end = 0;
1649 curLexSpec->f_win_rf = p->readf;
1650 curLexSpec->f_win_sf = p->seekf;
1651 curLexSpec->f_win_fh = p->fh;
1652 curLexSpec->f_win_ef = p->endf;
1653 curLexSpec->f_win_size = 500000;
1655 curLexSpec->m = p->mem;
1656 return lexRoot (curLexSpec, p->offset, "main");