2 * Copyright (C) 1994-1998, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.22 1998-11-03 16:07:13 adam
10 * Revision 1.21 1998/11/03 15:43:39 adam
11 * Fixed bug introduced by previous commit.
13 * Revision 1.20 1998/11/03 14:51:28 adam
14 * Changed code so that it creates as few data1 nodes as possible.
16 * Revision 1.19 1998/11/03 10:22:39 adam
17 * Fixed memory leak that could occur when large data1 nodes were
18 * concatenated. Data-type data1_nodes may have multiple nodes.
20 * Revision 1.18 1998/10/15 13:11:47 adam
21 * Added support for option -record for "end element". When specified
22 * end element will mark end-of-record when at outer-level.
24 * Revision 1.17 1998/07/01 10:13:51 adam
27 * Revision 1.16 1998/06/30 15:15:09 adam
28 * Tags are trimmed: white space is removed before and after the tag.
30 * Revision 1.15 1998/06/30 12:55:45 adam
33 * Revision 1.14 1998/03/05 08:41:00 adam
34 * Implemented rule contexts.
36 * Revision 1.13 1997/12/12 06:33:58 adam
37 * Fixed bug that showed up when multiple filters were used.
38 * Made one routine thread-safe.
40 * Revision 1.12 1997/11/18 10:03:24 adam
41 * Member num_children removed from data1_node.
43 * Revision 1.11 1997/11/06 11:41:01 adam
44 * Implemented "begin variant" for the sgml.regx filter.
46 * Revision 1.10 1997/10/31 12:36:12 adam
47 * Minor change that avoids compiler warning.
49 * Revision 1.9 1997/09/29 09:02:49 adam
50 * Fixed small bug (introduced by previous commit).
52 * Revision 1.8 1997/09/17 12:19:22 adam
53 * Zebra version corresponds to YAZ version 1.4.
54 * Changed Zebra server so that it doesn't depend on global common_resource.
56 * Revision 1.7 1997/07/15 16:33:07 adam
57 * Check for zero length in execData.
59 * Revision 1.6 1997/02/24 10:41:51 adam
60 * Cleanup of code and commented out the "end element-end-record" code.
62 * Revision 1.5 1997/02/19 16:22:33 adam
63 * Fixed "end element" to terminate record in outer-most level.
65 * Revision 1.4 1997/02/12 20:42:58 adam
66 * Changed some log messages.
68 * Revision 1.3 1996/11/08 14:05:33 adam
69 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
71 * Revision 1.2 1996/10/29 14:02:09 adam
72 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
73 * data1_get_tabpath is used.
75 * Revision 1.1 1996/10/11 10:57:30 adam
76 * New module recctrl. Used to manage records (extract/retrieval).
78 * Revision 1.24 1996/06/17 14:25:31 adam
79 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
81 * Revision 1.23 1996/06/04 10:19:00 adam
82 * Minor changes - removed include of ctype.h.
84 * Revision 1.22 1996/06/03 15:23:13 adam
85 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
87 * Revision 1.21 1996/05/14 16:58:38 adam
90 * Revision 1.20 1996/05/01 13:46:36 adam
91 * First work on multiple records in one file.
92 * New option, -offset, to the "unread" command in the filter module.
94 * Revision 1.19 1996/02/12 16:18:20 adam
95 * Yet another bug fix in implementation of unread command.
97 * Revision 1.18 1996/02/12 16:07:54 adam
98 * Bug fix in new unread command.
100 * Revision 1.17 1996/02/12 15:56:11 adam
101 * New code command: unread.
103 * Revision 1.16 1996/01/17 14:57:51 adam
104 * Prototype changed for reader functions in extract/retrieve. File
105 * is identified by 'void *' instead of 'int'.
107 * Revision 1.15 1996/01/08 19:15:47 adam
108 * New input filter that works!
110 * Revision 1.14 1996/01/08 09:10:38 adam
111 * Yet another complete rework on this module.
113 * Revision 1.13 1995/12/15 17:21:50 adam
114 * This version is able to set data.formatted_text in data1-nodes.
116 * Revision 1.12 1995/12/15 16:20:10 adam
117 * The filter files (*.flt) are read from the path given by data1_tabpath.
119 * Revision 1.11 1995/12/15 12:35:16 adam
122 * Revision 1.10 1995/12/15 10:35:36 adam
125 * Revision 1.9 1995/12/14 16:38:48 adam
126 * Completely new attempt to make regular expression parsing.
128 * Revision 1.8 1995/12/13 17:16:59 adam
131 * Revision 1.7 1995/12/13 16:51:58 adam
132 * Modified to set last_child in data1_nodes.
133 * Uses destroy handler to free up data text nodes.
135 * Revision 1.6 1995/12/13 13:45:37 quinn
136 * Changed data1 to use nmem.
138 * Revision 1.5 1995/12/11 09:12:52 adam
139 * The rec_get function returns NULL if record doesn't exist - will
140 * happen in the server if the result set records have been deleted since
141 * the creation of the set (i.e. the search).
142 * The server saves a result temporarily if it is 'volatile', i.e. the
143 * set is register dependent.
145 * Revision 1.4 1995/12/05 16:57:40 adam
146 * More work on regular patterns.
148 * Revision 1.3 1995/12/05 09:37:09 adam
149 * One malloc was renamed to xmalloc.
151 * Revision 1.2 1995/12/04 17:59:24 adam
152 * More work on regular expression conversion.
154 * Revision 1.1 1995/12/04 14:25:30 adam
155 * Started work on regular expression parsed input to structured records.
164 #include <zebrautl.h>
170 #define F_WIN_EOF 2000000000
174 #define REGX_PATTERN 1
179 #define REGX_CONTEXT 6
185 struct lexRuleAction {
189 struct DFA *dfa; /* REGX_PATTERN */
192 struct regxCode *code; /* REGX_CODE */
194 struct lexRuleAction *next;
199 struct lexRuleAction *actionList;
203 struct lexRuleInfo info;
204 struct lexRule *next;
210 struct lexRule *rules;
211 struct lexRuleInfo **fastRule;
214 struct lexRuleAction *beginActionList;
215 struct lexRuleAction *endActionList;
216 struct lexContext *next;
219 struct lexConcatBuf {
228 struct lexContext *context;
230 struct lexContext **context_stack;
231 int context_stack_size;
232 int context_stack_top;
238 void (*f_win_ef)(void *, off_t);
240 int f_win_start; /* first byte of buffer is this file offset */
241 int f_win_end; /* last byte of buffer is this offset - 1 */
242 int f_win_size; /* size of buffer */
243 char *f_win_buf; /* buffer itself */
244 int (*f_win_rf)(void *, char *, size_t);
245 off_t (*f_win_sf)(void *, off_t);
247 struct lexConcatBuf **concatBuf;
252 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
255 int i, r, off = start_pos - spec->f_win_start;
257 if (off >= 0 && end_pos <= spec->f_win_end)
259 *size = end_pos - start_pos;
260 return spec->f_win_buf + off;
262 if (off < 0 || start_pos >= spec->f_win_end)
264 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
265 spec->f_win_start = start_pos;
267 if (!spec->f_win_buf)
268 spec->f_win_buf = xmalloc (spec->f_win_size);
269 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
271 spec->f_win_end = spec->f_win_start + *size;
273 if (*size > end_pos - start_pos)
274 *size = end_pos - start_pos;
275 return spec->f_win_buf;
277 for (i = 0; i<spec->f_win_end - start_pos; i++)
278 spec->f_win_buf[i] = spec->f_win_buf[i + off];
279 r = (*spec->f_win_rf)(spec->f_win_fh,
281 spec->f_win_size - i);
282 spec->f_win_start = start_pos;
283 spec->f_win_end += r;
285 if (*size > end_pos - start_pos)
286 *size = end_pos - start_pos;
287 return spec->f_win_buf;
290 static int f_win_advance (struct lexSpec *spec, int *pos)
295 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
296 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
297 if (*pos == F_WIN_EOF)
299 buf = f_win_get (spec, *pos, *pos+1, &size);
309 static void regxCodeDel (struct regxCode **pp)
311 struct regxCode *p = *pp;
320 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
324 p = xmalloc (sizeof(*p));
325 p->str = xmalloc (len+1);
326 memcpy (p->str, buf, len);
331 static struct DFA *lexSpecDFA (void)
336 dfa_parse_cmap_del (dfa, ' ');
337 dfa_parse_cmap_del (dfa, '\t');
338 dfa_parse_cmap_add (dfa, '/', 0);
342 static void actionListDel (struct lexRuleAction **rap)
344 struct lexRuleAction *ra1, *ra;
346 for (ra = *rap; ra; ra = ra1)
352 dfa_delete (&ra->u.pattern.dfa);
355 regxCodeDel (&ra->u.code);
363 static struct lexContext *lexContextCreate (const char *name)
365 struct lexContext *p = xmalloc (sizeof(*p));
367 p->name = xstrdup (name);
369 p->dfa = lexSpecDFA ();
372 p->beginActionList = NULL;
373 p->endActionList = NULL;
378 static void lexContextDestroy (struct lexContext *p)
380 struct lexRule *rp, *rp1;
383 for (rp = p->rules; rp; rp = rp1)
386 actionListDel (&rp->info.actionList);
389 actionListDel (&p->beginActionList);
390 actionListDel (&p->endActionList);
395 static struct lexSpec *lexSpecCreate (const char *name)
400 p = xmalloc (sizeof(*p));
401 p->name = xmalloc (strlen(name)+1);
402 strcpy (p->name, name);
405 p->context_stack_size = 100;
406 p->context_stack = xmalloc (sizeof(*p->context_stack) *
407 p->context_stack_size);
411 p->concatBuf = xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
412 for (i = 0; i < p->maxLevel; i++)
414 p->concatBuf[i] = xmalloc (sizeof(**p->concatBuf));
415 p->concatBuf[i]->len = p->concatBuf[i]->max = 0;
416 p->concatBuf[i]->buf = 0;
421 static void lexSpecDestroy (struct lexSpec **pp)
424 struct lexContext *lt;
432 for (i = 0; i < p->maxLevel; i++)
433 xfree (p->concatBuf[i]);
434 xfree (p->concatBuf);
439 struct lexContext *lt_next = lt->next;
440 lexContextDestroy (lt);
444 xfree (p->f_win_buf);
445 xfree (p->context_stack);
450 static int readParseToken (const char **cpp, int *len)
452 const char *cp = *cpp;
456 while (*cp == ' ' || *cp == '\t' || *cp == '\n')
485 if (*cp >= 'a' && *cp <= 'z')
487 else if (*cp >= 'A' && *cp <= 'Z')
488 cmd[i] = *cp + 'a' - 'A';
491 if (i < sizeof(cmd)-2)
498 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
500 while (*cp && *cp != ' ' && *cp != '\t' && *cp != '\n')
506 if (!strcmp (cmd, "begin"))
508 else if (!strcmp (cmd, "end"))
510 else if (!strcmp (cmd, "body"))
512 else if (!strcmp (cmd, "context"))
516 logf (LOG_WARN, "bad command %s", cmd);
522 static int actionListMk (struct lexSpec *spec, const char *s,
523 struct lexRuleAction **ap)
529 while ((tok = readParseToken (&s, &len)))
537 *ap = xmalloc (sizeof(**ap));
539 regxCodeMk (&(*ap)->u.code, s, len);
543 *ap = xmalloc (sizeof(**ap));
545 (*ap)->u.pattern.body = bodyMark;
547 (*ap)->u.pattern.dfa = lexSpecDFA ();
549 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
554 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
557 dfa_mkstate ((*ap)->u.pattern.dfa);
561 logf (LOG_WARN, "cannot use begin here");
564 *ap = xmalloc (sizeof(**ap));
574 int readOneSpec (struct lexSpec *spec, const char *s)
578 struct lexContext *lc;
580 tok = readParseToken (&s, &len);
581 if (tok == REGX_CONTEXT)
583 char context_name[32];
584 tok = readParseToken (&s, &len);
585 if (tok != REGX_CODE)
587 logf (LOG_WARN, "missing name after CONTEXT keyword");
592 memcpy (context_name, s, len);
593 context_name[len] = '\0';
594 lc = lexContextCreate (context_name);
595 lc->next = spec->context;
600 spec->context = lexContextCreate ("main");
605 actionListDel (&spec->context->beginActionList);
606 actionListMk (spec, s, &spec->context->beginActionList);
609 actionListDel (&spec->context->endActionList);
610 actionListMk (spec, s, &spec->context->endActionList);
614 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
616 r = dfa_parse (spec->context->dfa, &s);
619 logf (LOG_WARN, "regular expression error. r=%d", r);
624 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
628 rp = xmalloc (sizeof(*rp));
629 rp->info.no = spec->context->ruleNo++;
630 rp->next = spec->context->rules;
631 spec->context->rules = rp;
632 actionListMk (spec, s, &rp->info.actionList);
637 int readFileSpec (struct lexSpec *spec)
639 struct lexContext *lc;
642 int c, i, errors = 0;
645 lineBuf = xmalloc (1+lineSize);
646 logf (LOG_LOG, "reading regx filter %s.flt", spec->name);
647 sprintf (lineBuf, "%s.flt", spec->name);
648 if (!(spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh),
651 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
660 if (c == '#' || c == '\n' || c == ' ' || c == '\t')
662 while (c != '\n' && c != EOF)
681 if (c != ' ' && c != '\t')
690 readOneSpec (spec, lineBuf);
691 spec->lineNo += addLine;
700 debug_dfa_followpos = 1;
703 for (lc = spec->context; lc; lc = lc->next)
706 lc->fastRule = xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
707 for (i = 0; i < lc->ruleNo; i++)
708 lc->fastRule[i] = NULL;
709 for (rp = lc->rules; rp; rp = rp->next)
710 lc->fastRule[rp->info.no] = &rp->info;
711 dfa_mkstate (lc->dfa);
718 static struct lexSpec *curLexSpec = NULL;
720 static void execData (struct lexSpec *spec,
721 data1_node **d1_stack, int *d1_level,
722 const char *ebuf, int elen, int formatted_text)
724 struct data1_node *res, *parent;
727 if (elen == 0) /* shouldn't happen, but it does! */
731 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
732 ebuf, 15, ebuf + elen-15);
734 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
736 logf (LOG_DEBUG, "data (%d bytes)", elen);
742 parent = d1_stack[*d1_level -1];
745 if ((res = d1_stack[*d1_level]) && res->which == DATA1N_data)
746 org_len = res->u.data.len;
751 res = data1_mk_node (spec->dh, spec->m);
752 res->parent = parent;
753 res->which = DATA1N_data;
754 res->u.data.what = DATA1I_text;
756 res->u.data.formatted_text = formatted_text;
758 if (elen > DATA1_LOCALDATA)
759 res->u.data.data = nmem_malloc (spec->m, elen);
761 res->u.data.data = res->lbuf;
762 memcpy (res->u.data.data, ebuf, elen);
764 res->u.data.data = 0;
766 res->root = parent->root;
768 parent->last_child = res;
769 if (d1_stack[*d1_level])
770 d1_stack[*d1_level]->next = res;
773 d1_stack[*d1_level] = res;
775 if (org_len + elen >= spec->concatBuf[*d1_level]->max)
777 char *old_buf, *new_buf;
779 spec->concatBuf[*d1_level]->max = org_len + elen + 256;
780 new_buf = xmalloc (spec->concatBuf[*d1_level]->max);
781 if ((old_buf = spec->concatBuf[*d1_level]->buf))
783 memcpy (new_buf, old_buf, org_len);
786 spec->concatBuf[*d1_level]->buf = new_buf;
788 assert (spec->concatBuf[*d1_level]);
789 memcpy (spec->concatBuf[*d1_level]->buf + org_len, ebuf, elen);
790 res->u.data.len += elen;
793 static void execDataP (struct lexSpec *spec,
794 data1_node **d1_stack, int *d1_level,
795 const char *ebuf, int elen, int formatted_text)
797 execData (spec, d1_stack, d1_level, ebuf, elen, formatted_text);
800 static void tagDataRelease (struct lexSpec *spec,
801 data1_node **d1_stack, int d1_level)
805 if ((res = d1_stack[d1_level]) &&
806 res->which == DATA1N_data &&
807 res->u.data.what == DATA1I_text)
809 assert (!res->u.data.data);
810 assert (res->u.data.len > 0);
811 if (res->u.data.len > DATA1_LOCALDATA)
812 res->u.data.data = nmem_malloc (spec->m, res->u.data.len);
814 res->u.data.data = res->lbuf;
815 memcpy (res->u.data.data, spec->concatBuf[d1_level]->buf,
820 static void variantBegin (struct lexSpec *spec,
821 data1_node **d1_stack, int *d1_level,
822 const char *class_str, int class_len,
823 const char *type_str, int type_len,
824 const char *value_str, int value_len)
826 struct data1_node *parent = d1_stack[*d1_level -1];
827 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
834 logf (LOG_WARN, "in variant begin. No record type defined");
837 if (class_len >= DATA1_MAX_SYMBOL)
838 class_len = DATA1_MAX_SYMBOL-1;
839 memcpy (tclass, class_str, class_len);
840 tclass[class_len] = '\0';
842 if (type_len >= DATA1_MAX_SYMBOL)
843 type_len = DATA1_MAX_SYMBOL-1;
844 memcpy (ttype, type_str, type_len);
845 ttype[type_len] = '\0';
848 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype, *d1_level);
852 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
856 if (parent->which != DATA1N_variant)
858 res = data1_mk_node (spec->dh, spec->m);
859 res->parent = parent;
860 res->which = DATA1N_variant;
861 res->u.variant.type = 0;
862 res->u.variant.value = 0;
863 res->root = parent->root;
865 parent->last_child = res;
866 if (d1_stack[*d1_level])
868 tagDataRelease (spec, d1_stack, *d1_level);
869 d1_stack[*d1_level]->next = res;
873 d1_stack[*d1_level] = res;
874 d1_stack[++(*d1_level)] = NULL;
876 for (i = *d1_level-1; d1_stack[i]->which == DATA1N_variant; i--)
877 if (d1_stack[i]->u.variant.type == tp)
884 logf (LOG_DEBUG, "variant node (%d)", *d1_level);
886 parent = d1_stack[*d1_level-1];
887 res = data1_mk_node (spec->dh, spec->m);
888 res->parent = parent;
889 res->which = DATA1N_variant;
890 res->root = parent->root;
891 res->u.variant.type = tp;
893 if (value_len >= DATA1_LOCALDATA)
894 value_len =DATA1_LOCALDATA-1;
895 memcpy (res->lbuf, value_str, value_len);
896 res->lbuf[value_len] = '\0';
898 res->u.variant.value = res->lbuf;
900 parent->last_child = res;
901 if (d1_stack[*d1_level])
903 tagDataRelease (spec, d1_stack, *d1_level);
904 d1_stack[*d1_level]->next = res;
908 d1_stack[*d1_level] = res;
909 d1_stack[++(*d1_level)] = NULL;
912 static void tagStrip (const char **tag, int *len)
916 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
919 for (i = 0; i < *len && isspace((*tag)[i]); i++)
925 static void tagBegin (struct lexSpec *spec,
926 data1_node **d1_stack, int *d1_level,
927 const char *tag, int len)
929 struct data1_node *parent = d1_stack[*d1_level -1];
930 data1_element *elem = NULL;
931 data1_node *partag = get_parent_tag(spec->dh, parent);
933 data1_element *e = NULL;
938 logf (LOG_WARN, "in element begin. No record type defined");
941 tagStrip (&tag, &len);
943 res = data1_mk_node (spec->dh, spec->m);
944 res->parent = parent;
945 res->which = DATA1N_tag;
946 res->u.tag.get_bytes = -1;
948 if (len >= DATA1_LOCALDATA)
949 res->u.tag.tag = nmem_malloc (spec->m, len+1);
951 res->u.tag.tag = res->lbuf;
953 memcpy (res->u.tag.tag, tag, len);
954 res->u.tag.tag[len] = '\0';
957 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, *d1_level);
959 if (parent->which == DATA1N_variant)
962 if (!(e = partag->u.tag.element))
965 elem = data1_getelementbytagname (spec->dh, d1_stack[0]->u.root.absyn,
967 res->u.tag.element = elem;
968 res->u.tag.node_selected = 0;
969 res->u.tag.make_variantlist = 0;
970 res->u.tag.no_data_requested = 0;
971 res->root = parent->root;
973 parent->last_child = res;
974 if (d1_stack[*d1_level])
976 tagDataRelease (spec, d1_stack, *d1_level);
977 d1_stack[*d1_level]->next = res;
981 d1_stack[*d1_level] = res;
982 d1_stack[++(*d1_level)] = NULL;
985 static void tagEnd (struct lexSpec *spec,
986 data1_node **d1_stack, int *d1_level, int min_level,
987 const char *tag, int len)
989 tagStrip (&tag, &len);
990 while (*d1_level > min_level)
992 tagDataRelease (spec, d1_stack, *d1_level);
996 if ((d1_stack[*d1_level]->which == DATA1N_tag) &&
998 (strlen(d1_stack[*d1_level]->u.tag.tag) == (size_t) len &&
999 !memcmp (d1_stack[*d1_level]->u.tag.tag, tag, len))))
1003 logf (LOG_DEBUG, "end tag (%d)", *d1_level);
1008 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1011 struct DFA_state *state = dfa->states[0];
1014 unsigned char c_prev = 0;
1015 int ptr = *pptr; /* current pointer */
1016 int start_ptr = *pptr; /* first char of match */
1017 int last_ptr = 0; /* last char of match */
1018 int last_rule = 0; /* rule number of current match */
1023 c = f_win_advance (spec, &ptr);
1024 if (ptr == F_WIN_EOF)
1041 *mptr = start_ptr; /* match starts here */
1042 *pptr = last_ptr; /* match end here (+1) */
1045 state = dfa->states[0];
1050 else if (c >= t->ch[0] && c <= t->ch[1])
1052 state = dfa->states[t->to];
1057 last_rule = state->rule_no;
1062 last_rule = state->rule_nno;
1074 static int execTok (struct lexSpec *spec, const char **src,
1075 int arg_no, int *arg_start, int *arg_end,
1076 const char **tokBuf, int *tokLen)
1078 const char *s = *src;
1080 while (*s == ' ' || *s == '\t')
1084 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1088 while (*s >= '0' && *s <= '9')
1089 n = n*10 + (*s++ -'0');
1099 *tokBuf = f_win_get (spec, arg_start[n], arg_end[n], tokLen);
1102 else if (*s == '\"')
1105 while (*s && *s != '\"')
1107 *tokLen = s - *tokBuf;
1112 else if (*s == '\n' || *s == ';')
1120 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1122 *tokLen = s - *tokBuf;
1129 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1131 *tokLen = s - *tokBuf;
1137 static char *regxStrz (const char *src, int len, char *str)
1141 memcpy (str, src, len);
1146 static int execCode (struct lexSpec *spec,
1147 int arg_no, int *arg_start, int *arg_end, int *pptr,
1148 struct regxCode *code,
1149 data1_node **d1_stack, int *d1_level)
1151 const char *s = code->str;
1154 const char *cmd_str;
1156 r = execTok (spec, &s, arg_no, arg_start, arg_end, &cmd_str, &cmd_len);
1163 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1164 &cmd_str, &cmd_len);
1167 p = regxStrz (cmd_str, cmd_len, ptmp);
1168 if (!strcmp (p, "begin"))
1170 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1171 &cmd_str, &cmd_len);
1174 logf (LOG_WARN, "missing keyword after 'begin'");
1177 p = regxStrz (cmd_str, cmd_len, ptmp);
1178 if (!strcmp (p, "record"))
1180 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1181 &cmd_str, &cmd_len);
1186 static char absynName[64];
1191 memcpy (absynName, cmd_str, cmd_len);
1192 absynName[cmd_len] = '\0';
1195 logf (LOG_DEBUG, "begin record %s", absynName);
1197 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1198 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1203 res = data1_mk_node (spec->dh, spec->m);
1204 res->which = DATA1N_root;
1205 res->u.root.type = absynName;
1206 res->u.root.absyn = absyn;
1209 d1_stack[*d1_level] = res;
1210 d1_stack[++(*d1_level)] = NULL;
1213 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1214 &cmd_str, &cmd_len);
1216 else if (!strcmp (p, "element"))
1218 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1219 &cmd_str, &cmd_len);
1222 tagBegin (spec, d1_stack, d1_level, cmd_str, cmd_len);
1223 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1224 &cmd_str, &cmd_len);
1226 else if (!strcmp (p, "variant"))
1229 const char *class_str = NULL;
1231 const char *type_str = NULL;
1233 const char *value_str = NULL;
1234 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1235 &cmd_str, &cmd_len);
1238 class_str = cmd_str;
1239 class_len = cmd_len;
1240 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1241 &cmd_str, &cmd_len);
1247 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1248 &cmd_str, &cmd_len);
1251 value_str = cmd_str;
1252 value_len = cmd_len;
1254 variantBegin (spec, d1_stack, d1_level, class_str, class_len,
1255 type_str, type_len, value_str, value_len);
1258 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1259 &cmd_str, &cmd_len);
1261 else if (!strcmp (p, "context"))
1265 struct lexContext *lc = spec->context;
1266 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1267 &cmd_str, &cmd_len);
1268 p = regxStrz (cmd_str, cmd_len, ptmp);
1270 logf (LOG_DEBUG, "begin context %s", p);
1272 while (lc && strcmp (p, lc->name))
1275 spec->context_stack[++(spec->context_stack_top)] = lc;
1277 logf (LOG_WARN, "unknown context %s", p);
1280 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1281 &cmd_str, &cmd_len);
1285 logf (LOG_WARN, "bad keyword '%s' after begin", p);
1288 else if (!strcmp (p, "end"))
1290 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1291 &cmd_str, &cmd_len);
1294 logf (LOG_WARN, "missing keyword after 'end'");
1297 p = regxStrz (cmd_str, cmd_len, ptmp);
1298 if (!strcmp (p, "record"))
1301 for (i = *d1_level; i; --i)
1302 tagDataRelease (spec, d1_stack, i);
1304 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1305 &cmd_str, &cmd_len);
1307 logf (LOG_DEBUG, "end record");
1311 else if (!strcmp (p, "element"))
1314 while ((r = execTok (spec, &s, arg_no, arg_start, arg_end,
1315 &cmd_str, &cmd_len)) == 3)
1317 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1322 tagEnd (spec, d1_stack, d1_level, min_level,
1324 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1325 &cmd_str, &cmd_len);
1328 tagEnd (spec, d1_stack, d1_level, min_level, NULL, 0);
1332 logf (LOG_DEBUG, "end element end records");
1338 else if (!strcmp (p, "context"))
1341 logf (LOG_DEBUG, "end context");
1343 if (spec->context_stack_top)
1344 (spec->context_stack_top)--;
1345 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1346 &cmd_str, &cmd_len);
1349 logf (LOG_WARN, "bad keyword '%s' after end", p);
1351 else if (!strcmp (p, "data"))
1355 const char *element_str = NULL;
1357 while ((r = execTok (spec, &s, arg_no, arg_start, arg_end,
1358 &cmd_str, &cmd_len)) == 3)
1360 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1362 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1364 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1365 &element_str, &element_len);
1370 logf (LOG_WARN, "bad data option: %.*s",
1375 logf (LOG_WARN, "missing data item after data");
1379 tagBegin (spec, d1_stack, d1_level, element_str, element_len);
1382 execData (spec, d1_stack, d1_level, cmd_str, cmd_len,
1384 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1385 &cmd_str, &cmd_len);
1388 tagEnd (spec, d1_stack, d1_level, 1, NULL, 0);
1390 else if (!strcmp (p, "unread"))
1393 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1394 &cmd_str, &cmd_len);
1395 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1397 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1398 &cmd_str, &cmd_len);
1401 logf (LOG_WARN, "missing number after -offset");
1404 p = regxStrz (cmd_str, cmd_len, ptmp);
1406 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1407 &cmd_str, &cmd_len);
1413 logf (LOG_WARN, "missing index after unread command");
1416 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1418 logf (LOG_WARN, "bad index after unread command");
1423 no = *cmd_str - '0';
1426 *pptr = arg_start[no] + offset;
1428 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1429 &cmd_str, &cmd_len);
1431 else if (!strcmp (p, "context"))
1435 struct lexContext *lc = spec->context;
1436 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1437 &cmd_str, &cmd_len);
1438 p = regxStrz (cmd_str, cmd_len, ptmp);
1440 while (lc && strcmp (p, lc->name))
1443 spec->context_stack[spec->context_stack_top] = lc;
1445 logf (LOG_WARN, "unknown context %s", p);
1448 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1449 &cmd_str, &cmd_len);
1453 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1454 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1455 &cmd_str, &cmd_len);
1460 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1462 r = execTok (spec, &s, arg_no, arg_start, arg_end, &cmd_str,
1472 * execAction: Execute action specified by 'ap'. Returns 0 if
1473 * the pattern(s) and code associated with the rule could be executed
1474 * ok; returns 1 if code couldn't be executed.
1476 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1477 data1_node **d1_stack, int *d1_level,
1478 int start_ptr, int *pptr)
1485 arg_start[0] = start_ptr;
1493 if (ap->u.pattern.body)
1495 arg_start[arg_no] = *pptr;
1496 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1498 arg_end[arg_no] = F_WIN_EOF;
1500 arg_start[arg_no] = F_WIN_EOF;
1501 arg_end[arg_no] = F_WIN_EOF;
1506 arg_end[arg_no] = sptr;
1508 arg_start[arg_no] = sptr;
1509 arg_end[arg_no] = *pptr;
1514 arg_start[arg_no] = *pptr;
1515 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1517 if (sptr != arg_start[arg_no])
1519 arg_end[arg_no] = *pptr;
1524 if (!execCode (spec, arg_no, arg_start, arg_end, pptr,
1525 ap->u.code, d1_stack, d1_level))
1529 arg_start[arg_no] = *pptr;
1530 arg_end[arg_no] = F_WIN_EOF;
1539 static int execRule (struct lexSpec *spec, struct lexContext *context,
1540 data1_node **d1_stack, int *d1_level,
1541 int ruleNo, int start_ptr, int *pptr)
1544 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1546 return execAction (spec, context->fastRule[ruleNo]->actionList,
1547 d1_stack, d1_level, start_ptr, pptr);
1550 data1_node *lexNode (struct lexSpec *spec,
1551 data1_node **d1_stack, int *d1_level, int *ptr)
1553 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1554 struct DFA_state *state = context->dfa->states[0];
1557 unsigned char c_prev = '\n';
1559 int last_rule = 0; /* rule number of current match */
1560 int last_ptr = *ptr; /* last char of match */
1561 int start_ptr = *ptr; /* first char of match */
1562 int skip_ptr = *ptr; /* first char of run */
1566 c = f_win_advance (spec, ptr);
1567 if (*ptr == F_WIN_EOF)
1569 /* end of file met */
1572 /* there was a match */
1573 if (skip_ptr < start_ptr)
1575 /* deal with chars that didn't match */
1578 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1579 execDataP (spec, d1_stack, d1_level, buf, size, 0);
1581 /* restore pointer */
1584 if (!execRule (spec, context, d1_stack, d1_level,
1585 last_rule, start_ptr, ptr))
1587 /* restore skip pointer */
1591 else if (skip_ptr < *ptr)
1593 /* deal with chars that didn't match */
1596 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1597 execDataP (spec, d1_stack, d1_level, buf, size, 0);
1599 if (*ptr == F_WIN_EOF)
1606 { /* no transition for character c ... */
1609 if (skip_ptr < start_ptr)
1611 /* deal with chars that didn't match */
1614 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1615 execDataP (spec, d1_stack, d1_level, buf, size, 0);
1617 /* restore pointer */
1619 if (!execRule (spec, context, d1_stack, d1_level,
1620 last_rule, start_ptr, ptr))
1622 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1625 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1627 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
1631 context = spec->context_stack[spec->context_stack_top];
1634 last_ptr = start_ptr = *ptr;
1638 c_prev = f_win_advance (spec, &start_ptr);
1643 c_prev = f_win_advance (spec, &start_ptr);
1646 state = context->dfa->states[0];
1649 else if (c >= t->ch[0] && c <= t->ch[1])
1650 { /* transition ... */
1651 state = context->dfa->states[t->to];
1656 last_rule = state->rule_no;
1659 else if (state->rule_nno)
1661 last_rule = state->rule_nno;
1673 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1674 const char *context_name)
1676 struct lexContext *lt = spec->context;
1677 data1_node *d1_stack[128];
1679 int i, ptr = offset;
1681 spec->context_stack_top = 0;
1684 if (!strcmp (lt->name, context_name))
1690 logf (LOG_WARN, "cannot find context %s", context_name);
1693 spec->context_stack[spec->context_stack_top] = lt;
1694 d1_stack[d1_level] = NULL;
1695 if (lt->beginActionList)
1696 execAction (spec, lt->beginActionList, d1_stack, &d1_level, 0, &ptr);
1697 lexNode (spec, d1_stack, &d1_level, &ptr);
1698 for (i = d1_level; i; --i)
1699 tagDataRelease (spec, d1_stack, i);
1700 if (lt->endActionList)
1701 execAction (spec, lt->endActionList, d1_stack, &d1_level, ptr, &ptr);
1705 data1_node *grs_read_regx (struct grs_read_info *p)
1710 logf (LOG_DEBUG, "grs_read_regx");
1712 if (!curLexSpec || strcmp (curLexSpec->name, p->type))
1715 lexSpecDestroy (&curLexSpec);
1716 curLexSpec = lexSpecCreate (p->type);
1717 curLexSpec->dh = p->dh;
1718 res = readFileSpec (curLexSpec);
1721 lexSpecDestroy (&curLexSpec);
1725 curLexSpec->dh = p->dh;
1728 curLexSpec->f_win_start = 0;
1729 curLexSpec->f_win_end = 0;
1730 curLexSpec->f_win_rf = p->readf;
1731 curLexSpec->f_win_sf = p->seekf;
1732 curLexSpec->f_win_fh = p->fh;
1733 curLexSpec->f_win_ef = p->endf;
1734 curLexSpec->f_win_size = 500000;
1736 curLexSpec->m = p->mem;
1737 return lexRoot (curLexSpec, p->offset, "main");