2 * Copyright (C) 1994-1999, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.27 1999-06-28 13:25:40 quinn
8 * Improved diagnostics for Tcl
10 * Revision 1.26 1999/05/26 07:49:14 adam
13 * Revision 1.25 1999/05/25 12:33:32 adam
14 * Fixed bug in Tcl filter.
16 * Revision 1.24 1999/05/21 11:08:46 adam
17 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
18 * script so that it reads uninstalled Tcl source.
20 * Revision 1.23 1999/05/20 12:57:18 adam
21 * Implemented TCL filter. Updated recctrl system.
23 * Revision 1.22 1998/11/03 16:07:13 adam
26 * Revision 1.21 1998/11/03 15:43:39 adam
27 * Fixed bug introduced by previous commit.
29 * Revision 1.20 1998/11/03 14:51:28 adam
30 * Changed code so that it creates as few data1 nodes as possible.
32 * Revision 1.19 1998/11/03 10:22:39 adam
33 * Fixed memory leak that could occur when large data1 nodes were
34 * concatenated. Data-type data1_nodes may have multiple nodes.
36 * Revision 1.18 1998/10/15 13:11:47 adam
37 * Added support for option -record for "end element". When specified
38 * end element will mark end-of-record when at outer-level.
40 * Revision 1.17 1998/07/01 10:13:51 adam
43 * Revision 1.16 1998/06/30 15:15:09 adam
44 * Tags are trimmed: white space removed before and after the tag.
46 * Revision 1.15 1998/06/30 12:55:45 adam
49 * Revision 1.14 1998/03/05 08:41:00 adam
50 * Implemented rule contexts.
52 * Revision 1.13 1997/12/12 06:33:58 adam
53 * Fixed bug that showed up when multiple filters were used.
54 * Made one routine thread-safe.
56 * Revision 1.12 1997/11/18 10:03:24 adam
57 * Member num_children removed from data1_node.
59 * Revision 1.11 1997/11/06 11:41:01 adam
60 * Implemented "begin variant" for the sgml.regx filter.
62 * Revision 1.10 1997/10/31 12:36:12 adam
63 * Minor change that avoids compiler warning.
65 * Revision 1.9 1997/09/29 09:02:49 adam
66 * Fixed small bug (introduced by previous commit).
68 * Revision 1.8 1997/09/17 12:19:22 adam
69 * Zebra version corresponds to YAZ version 1.4.
70 * Changed Zebra server so that it doesn't depend on global common_resource.
72 * Revision 1.7 1997/07/15 16:33:07 adam
73 * Check for zero length in execData.
75 * Revision 1.6 1997/02/24 10:41:51 adam
76 * Cleanup of code and commented out the "end element-end-record" code.
78 * Revision 1.5 1997/02/19 16:22:33 adam
79 * Fixed "end element" to terminate record in outer-most level.
81 * Revision 1.4 1997/02/12 20:42:58 adam
82 * Changed some log messages.
84 * Revision 1.3 1996/11/08 14:05:33 adam
85 * Bug fix: data1 node member u.tag.get_bytes weren't initialized.
87 * Revision 1.2 1996/10/29 14:02:09 adam
88 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
89 * data1_get_tabpath is used.
91 * Revision 1.1 1996/10/11 10:57:30 adam
92 * New module recctrl. Used to manage records (extract/retrieval).
94 * Revision 1.24 1996/06/17 14:25:31 adam
95 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
97 * Revision 1.23 1996/06/04 10:19:00 adam
98 * Minor changes - removed include of ctype.h.
100 * Revision 1.22 1996/06/03 15:23:13 adam
101 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
103 * Revision 1.21 1996/05/14 16:58:38 adam
106 * Revision 1.20 1996/05/01 13:46:36 adam
107 * First work on multiple records in one file.
108 * New option, -offset, to the "unread" command in the filter module.
110 * Revision 1.19 1996/02/12 16:18:20 adam
111 * Yet another bug fix in implementation of unread command.
113 * Revision 1.18 1996/02/12 16:07:54 adam
114 * Bug fix in new unread command.
116 * Revision 1.17 1996/02/12 15:56:11 adam
117 * New code command: unread.
119 * Revision 1.16 1996/01/17 14:57:51 adam
120 * Prototype changed for reader functions in extract/retrieve. File
121 * is identified by 'void *' instead of 'int'.
123 * Revision 1.15 1996/01/08 19:15:47 adam
124 * New input filter that works!
126 * Revision 1.14 1996/01/08 09:10:38 adam
127 * Yet another complete rework on this module.
129 * Revision 1.13 1995/12/15 17:21:50 adam
130 * This version is able to set data.formatted_text in data1-nodes.
132 * Revision 1.12 1995/12/15 16:20:10 adam
133 * The filter files (*.flt) are read from the path given by data1_tabpath.
135 * Revision 1.11 1995/12/15 12:35:16 adam
138 * Revision 1.10 1995/12/15 10:35:36 adam
141 * Revision 1.9 1995/12/14 16:38:48 adam
142 * Completely new attempt to make regular expression parsing.
144 * Revision 1.8 1995/12/13 17:16:59 adam
147 * Revision 1.7 1995/12/13 16:51:58 adam
148 * Modified to set last_child in data1_nodes.
149 * Uses destroy handler to free up data text nodes.
151 * Revision 1.6 1995/12/13 13:45:37 quinn
152 * Changed data1 to use nmem.
154 * Revision 1.5 1995/12/11 09:12:52 adam
155 * The rec_get function returns NULL if record doesn't exist - will
156 * happen in the server if the result set records have been deleted since
157 * the creation of the set (i.e. the search).
158 * The server saves a result temporarily if it is 'volatile', i.e. the
159 * set is register dependent.
161 * Revision 1.4 1995/12/05 16:57:40 adam
162 * More work on regular patterns.
164 * Revision 1.3 1995/12/05 09:37:09 adam
165 * One malloc was renamed to xmalloc.
167 * Revision 1.2 1995/12/04 17:59:24 adam
168 * More work on regular expression conversion.
170 * Revision 1.1 1995/12/04 14:25:30 adam
171 * Started work on regular expression parsed input to structured records.
180 #include <zebrautl.h>
/* Sentinel file position: f_win_advance, tryMatch and lexNode compare a
 * position against this value to detect that end of input was reached. */
190 #define F_WIN_EOF 2000000000
/* Token / action kinds. REGX_PATTERN labels the pattern member of a rule
 * action; REGX_CONTEXT is the token readOneSpec tests for when a spec line
 * introduces a new context. */
194 #define REGX_PATTERN 1
199 #define REGX_CONTEXT 6
/* Lexer data structures.
 * NOTE(review): only some lines of each declaration appear in this listing;
 * the comments below describe the visible members only. */
/* One action in an action list: either a compiled pattern or a code chunk. */
206 struct lexRuleAction {
210 struct DFA *dfa; /* REGX_PATTERN */
213 struct regxCode *code; /* REGX_CODE */
/* next action in the list */
215 struct lexRuleAction *next;
/* actions run when the rule matches (passed to execAction by execRule) */
220 struct lexRuleAction *actionList;
224 struct lexRuleInfo info;
225 struct lexRule *next;
/* list of rules of a context, plus a table of rule infos indexed by rule
 * number that readFileSpec builds from that list */
231 struct lexRule *rules;
232 struct lexRuleInfo **fastRule;
/* action lists filled in by readOneSpec via actionListMk */
236 struct lexRuleAction *beginActionList;
237 struct lexRuleAction *endActionList;
238 struct lexRuleAction *initActionList;
239 struct lexContext *next;
/* per-level text accumulation buffer (see execData / tagDataRelease) */
242 struct lexConcatBuf {
/* head of the list of contexts created while reading the spec */
250 struct lexContext *context;
/* stack of active contexts; context_stack_top indexes the current one */
252 struct lexContext **context_stack;
253 int context_stack_size;
254 int context_stack_top;
/* Tcl interpreter; when non-NULL the Tcl filter paths are taken */
260 Tcl_Interp *tcl_interp;
/* callback that lexNode calls with the file handle and current position */
263 void (*f_win_ef)(void *, off_t);
265 int f_win_start; /* first byte of buffer is this file offset */
266 int f_win_end; /* last byte of buffer is this offset - 1 */
267 int f_win_size; /* size of buffer */
268 char *f_win_buf; /* buffer itself */
/* read and seek callbacks used by f_win_get to fill the window */
269 int (*f_win_rf)(void *, char *, size_t);
270 off_t (*f_win_sf)(void *, off_t);
/* one concat buffer and one data1 node slot per nesting level
 * (both arrays are allocated with maxLevel entries in lexSpecCreate) */
272 struct lexConcatBuf **concatBuf;
274 data1_node **d1_stack;
285 struct lexSpec *spec;
/* f_win_get: return a pointer to the buffered bytes for the file range
 * [start_pos, end_pos) and store the number of usable bytes in *size.
 * The window buffer is reused when it already covers the range; otherwise
 * it is refilled through the f_win_sf (seek) and f_win_rf (read) callbacks.
 * In the refill paths *size is clamped to end_pos - start_pos. */
288 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
291 int i, r, off = start_pos - spec->f_win_start;
/* fast path: the whole range is already inside the window */
293 if (off >= 0 && end_pos <= spec->f_win_end)
295 *size = end_pos - start_pos;
296 return spec->f_win_buf + off;
/* start lies outside the window: seek to start_pos and read a fresh window */
298 if (off < 0 || start_pos >= spec->f_win_end)
300 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
301 spec->f_win_start = start_pos;
/* the buffer is allocated lazily on first use */
303 if (!spec->f_win_buf)
304 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
305 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
307 spec->f_win_end = spec->f_win_start + *size;
309 if (*size > end_pos - start_pos)
310 *size = end_pos - start_pos;
311 return spec->f_win_buf;
/* start is inside the window but the end is not: move the bytes from
 * start_pos onwards to the front of the buffer, then read more data to
 * fill the space that was freed */
313 for (i = 0; i<spec->f_win_end - start_pos; i++)
314 spec->f_win_buf[i] = spec->f_win_buf[i + off];
315 r = (*spec->f_win_rf)(spec->f_win_fh,
317 spec->f_win_size - i);
318 spec->f_win_start = start_pos;
319 spec->f_win_end += r;
321 if (*size > end_pos - start_pos)
322 *size = end_pos - start_pos;
323 return spec->f_win_buf;
/* f_win_advance: return the byte at file position *pos and advance *pos.
 * A position inside the current window is served straight from the buffer;
 * otherwise one byte is requested through f_win_get. A position equal to
 * F_WIN_EOF is tested separately (its handling is not visible here). */
326 static int f_win_advance (struct lexSpec *spec, int *pos)
331 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
332 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
333 if (*pos == F_WIN_EOF)
335 buf = f_win_get (spec, *pos, *pos+1, &size);
/* regxCodeDel: dispose of the regxCode object referenced by *pp.
 * NOTE(review): the actual release statements are not visible in this
 * listing - confirm against the full source. */
345 static void regxCodeDel (struct regxCode **pp)
347 struct regxCode *p = *pp;
/* regxCodeMk: create a regxCode object holding a private copy of the len
 * bytes at buf. The string buffer is allocated with len+1 bytes, leaving
 * room for a terminator. */
356 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
360 p = (struct regxCode *) xmalloc (sizeof(*p));
361 p->str = (char *) xmalloc (len+1);
362 memcpy (p->str, buf, len);
/* lexSpecDFA: set up a DFA object for parsing filter patterns. The parser's
 * character map is adjusted: the entries for space and tab are removed and
 * an entry for '/' is added with value 0 (the filter syntax delimits
 * patterns with '/', see readOneSpec). */
367 static struct DFA *lexSpecDFA (void)
372 dfa_parse_cmap_del (dfa, ' ');
373 dfa_parse_cmap_del (dfa, '\t');
374 dfa_parse_cmap_add (dfa, '/', 0);
/* actionListDel: walk the action list at *rap and release what each action
 * owns: the DFA of a pattern action, the code of a code action. */
378 static void actionListDel (struct lexRuleAction **rap)
380 struct lexRuleAction *ra1, *ra;
/* ra1 carries the successor so the current node can be released safely */
382 for (ra = *rap; ra; ra = ra1)
388 dfa_delete (&ra->u.pattern.dfa);
391 regxCodeDel (&ra->u.code);
/* lexContextCreate: allocate a new lexer context called name. The name is
 * duplicated, a fresh DFA is created with lexSpecDFA, and the begin, end
 * and init action lists start out empty. */
399 static struct lexContext *lexContextCreate (const char *name)
401 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
403 p->name = xstrdup (name);
406 p->dfa = lexSpecDFA ();
409 p->beginActionList = NULL;
410 p->endActionList = NULL;
411 p->initActionList = NULL;
/* lexContextDestroy: release a context: the action list of every rule,
 * then the context's begin and end action lists.
 * NOTE(review): no release of initActionList is visible in this listing;
 * if the full source lacks it too, that list is leaked - confirm. */
416 static void lexContextDestroy (struct lexContext *p)
418 struct lexRule *rp, *rp1;
421 for (rp = p->rules; rp; rp = rp1)
424 actionListDel (&rp->info.actionList);
427 actionListDel (&p->beginActionList);
428 actionListDel (&p->endActionList);
/* lexSpecCreate: allocate a lexSpec for the filter called name. The visible
 * part copies the name, allocates a context stack of 100 entries, and
 * allocates one empty concat buffer and one data1 node slot per nesting
 * level (maxLevel entries each). */
433 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
438 p = (struct lexSpec *) xmalloc (sizeof(*p));
439 p->name = (char *) xmalloc (strlen(name)+1);
440 strcpy (p->name, name);
447 p->context_stack_size = 100;
448 p->context_stack = (struct lexContext **)
449 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
453 p->concatBuf = (struct lexConcatBuf **)
454 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
/* every level starts with an empty, unallocated buffer */
455 for (i = 0; i < p->maxLevel; i++)
457 p->concatBuf[i] = (struct lexConcatBuf *)
458 xmalloc (sizeof(**p->concatBuf));
459 p->concatBuf[i]->len = p->concatBuf[i]->max = 0;
460 p->concatBuf[i]->buf = 0;
462 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/* lexSpecDestroy: release a lexSpec: the per-level concat buffer records,
 * every context in the context list, the Tcl interpreter, the file window
 * buffer and the context stack.
 * NOTE(review): the visible loop frees each lexConcatBuf record but a free
 * of its buf member is not visible here - confirm it is released. */
467 static void lexSpecDestroy (struct lexSpec **pp)
470 struct lexContext *lt;
478 for (i = 0; i < p->maxLevel; i++)
479 xfree (p->concatBuf[i]);
480 xfree (p->concatBuf);
/* remember the successor before the context is destroyed */
485 struct lexContext *lt_next = lt->next;
486 lexContextDestroy (lt);
491 Tcl_DeleteInterp (p->tcl_interp);
494 xfree (p->f_win_buf);
495 xfree (p->context_stack);
/* readParseToken: read the next token of a filter specification starting at
 * *cpp. Leading spaces, tabs and newlines are skipped. Keyword letters are
 * collected in lower case into cmd and matched against "begin", "end",
 * "body", "context" and "init"; an unknown keyword or an unexpected
 * character is reported with a warning. */
501 static int readParseToken (const char **cpp, int *len)
503 const char *cp = *cpp;
507 while (*cp == ' ' || *cp == '\t' || *cp == '\n')
536 if (*cp >= 'a' && *cp <= 'z')
538 else if (*cp >= 'A' && *cp <= 'Z')
/* fold upper case to lower case so keywords are case-insensitive */
539 cmd[i] = *cp + 'a' - 'A';
/* guard so the collected keyword cannot outgrow cmd */
542 if (i < (int) sizeof(cmd)-2)
549 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
/* skip to the next white space */
551 while (*cp && *cp != ' ' && *cp != '\t' && *cp != '\n')
557 if (!strcmp (cmd, "begin"))
559 else if (!strcmp (cmd, "end"))
561 else if (!strcmp (cmd, "body"))
563 else if (!strcmp (cmd, "context"))
565 else if (!strcmp (cmd, "init"))
569 logf (LOG_WARN, "bad command %s", cmd);
/* actionListMk: parse the action part of a rule from s and append the
 * resulting actions through *ap. Code tokens become code actions (copied
 * with regxCodeMk); pattern tokens are parsed into a fresh DFA, and a
 * parse failure is logged. BEGIN and INIT are rejected with a warning. */
575 static int actionListMk (struct lexSpec *spec, const char *s,
576 struct lexRuleAction **ap)
582 while ((tok = readParseToken (&s, &len)))
590 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
592 regxCodeMk (&(*ap)->u.code, s, len);
596 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/* bodyMark records whether this pattern was introduced by BODY -
 * presumably set when a body token is seen; TODO confirm */
598 (*ap)->u.pattern.body = bodyMark;
600 (*ap)->u.pattern.dfa = lexSpecDFA ();
602 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
607 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
610 dfa_mkstate ((*ap)->u.pattern.dfa);
614 logf (LOG_WARN, "cannot use BEGIN here");
617 logf (LOG_WARN, "cannot use INIT here");
620 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/* readOneSpec: process one logical specification entry s.
 * A CONTEXT entry creates a new context and links it at the head of
 * spec->context. Where the listing shows the "main" context being created,
 * the condition is not visible (presumably when no context exists yet).
 * Begin, end and init entries replace the corresponding action list of the
 * current context. Any other entry is parsed as a pattern into the
 * context's DFA and becomes a new rule with its own action list. */
630 int readOneSpec (struct lexSpec *spec, const char *s)
634 struct lexContext *lc;
636 tok = readParseToken (&s, &len);
637 if (tok == REGX_CONTEXT)
/* NOTE(review): context_name holds 32 bytes; a clamp of len is not visible
 * in this listing - confirm one exists before the memcpy below */
639 char context_name[32];
640 tok = readParseToken (&s, &len);
641 if (tok != REGX_CODE)
643 logf (LOG_WARN, "missing name after CONTEXT keyword");
648 memcpy (context_name, s, len);
649 context_name[len] = '\0';
650 lc = lexContextCreate (context_name);
651 lc->next = spec->context;
656 spec->context = lexContextCreate ("main");
/* an existing list is dropped before the new one is built */
661 actionListDel (&spec->context->beginActionList);
662 actionListMk (spec, s, &spec->context->beginActionList);
665 actionListDel (&spec->context->endActionList);
666 actionListMk (spec, s, &spec->context->endActionList);
669 actionListDel (&spec->context->initActionList);
670 actionListMk (spec, s, &spec->context->initActionList);
674 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
676 r = dfa_parse (spec->context->dfa, &s);
679 logf (LOG_WARN, "regular expression error. r=%d", r);
684 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* new rule gets the next rule number and is pushed on the rule list */
688 rp = (struct lexRule *) xmalloc (sizeof(*rp));
689 rp->info.no = spec->context->ruleNo++;
690 rp->next = spec->context->rules;
691 spec->context->rules = rp;
692 actionListMk (spec, s, &rp->info.actionList);
/* readFileSpec: locate and read the filter specification for spec->name.
 * With a Tcl interpreter present "<name>.tflt" is tried; "<name>.flt" is
 * opened via the data1 table path as well (the condition that selects it is
 * not visible here). The file is read line-wise into lineBuf; each
 * assembled entry is handed to readOneSpec. Afterwards every context gets
 * its fastRule table (rule number -> rule info) and its DFA states are
 * built. */
697 int readFileSpec (struct lexSpec *spec)
699 struct lexContext *lc;
702 int c, i, errors = 0;
705 lineBuf = (char *) xmalloc (1+lineSize);
707 if (spec->tcl_interp)
/* NOTE(review): sprintf into lineBuf is unbounded; a spec name longer than
 * the buffer would overflow it - confirm lineSize against name length */
709 sprintf (lineBuf, "%s.tflt", spec->name);
710 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), lineBuf, "r");
715 sprintf (lineBuf, "%s.flt", spec->name);
716 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), lineBuf, "r");
720 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
724 logf (LOG_LOG, "reading regx filter %s", lineBuf);
726 if (spec->tcl_interp)
727 logf (LOG_LOG, "Tcl enabled");
/* lines that start with '#', newline, space or tab are consumed up to the
 * end of the line */
734 if (c == '#' || c == '\n' || c == ' ' || c == '\t')
736 while (c != '\n' && c != EOF)
755 if (c != ' ' && c != '\t')
764 readOneSpec (spec, lineBuf);
765 spec->lineNo += addLine;
774 debug_dfa_followpos = 1;
777 for (lc = spec->context; lc; lc = lc->next)
/* table is cleared first, then filled from the rule list */
780 lc->fastRule = (struct lexRuleInfo **)
781 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
782 for (i = 0; i < lc->ruleNo; i++)
783 lc->fastRule[i] = NULL;
784 for (rp = lc->rules; rp; rp = rp->next)
785 lc->fastRule[rp->info.no] = &rp->info;
786 dfa_mkstate (lc->dfa);
/* File-scope pointer to a lexSpec, initially NULL. It is not referenced in
 * the visible part of this listing; presumably a cache of the most recently
 * loaded spec - TODO confirm against the full source. */
795 static struct lexSpec *curLexSpec = NULL;
/* execData: add elen bytes of text at ebuf to the record at the current
 * nesting level. If the node on top of the level's stack is already a data
 * node the text is appended to it (org_len is its current length);
 * otherwise a new DATA1N_data text node is created under the parent and
 * linked in. The bytes are accumulated in the level's concat buffer, which
 * is grown to org_len + elen + 256 when too small.
 * NOTE(review): the listing shows both a path that copies the bytes into
 * the node itself and one that leaves data NULL; presumably these are
 * alternatives under preprocessor conditionals that are not visible. */
798 static void execData (struct lexSpec *spec,
799 const char *ebuf, int elen, int formatted_text)
801 struct data1_node *res, *parent;
804 if (elen == 0) /* shouldn't happen, but it does! */
808 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
809 ebuf, 15, ebuf + elen-15);
811 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
813 logf (LOG_DEBUG, "data (%d bytes)", elen);
/* a nesting level of at most 1 is tested before the parent is looked up
 * (what that branch does is not visible here) */
816 if (spec->d1_level <= 1)
819 parent = spec->d1_stack[spec->d1_level -1];
/* existing data node at this level: remember how much text it holds */
822 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
823 org_len = res->u.data.len;
828 res = data1_mk_node (spec->dh, spec->m);
829 res->parent = parent;
830 res->which = DATA1N_data;
831 res->u.data.what = DATA1I_text;
833 res->u.data.formatted_text = formatted_text;
835 if (elen > DATA1_LOCALDATA)
836 res->u.data.data = nmem_malloc (spec->m, elen);
838 res->u.data.data = res->lbuf;
839 memcpy (res->u.data.data, ebuf, elen);
841 res->u.data.data = 0;
843 res->root = parent->root;
/* link the new node after the previous sibling at this level, if any */
845 parent->last_child = res;
846 if (spec->d1_stack[spec->d1_level])
847 spec->d1_stack[spec->d1_level]->next = res;
850 spec->d1_stack[spec->d1_level] = res;
/* grow the concat buffer, keeping the org_len bytes collected so far */
852 if (org_len + elen >= spec->concatBuf[spec->d1_level]->max)
854 char *old_buf, *new_buf;
856 spec->concatBuf[spec->d1_level]->max = org_len + elen + 256;
857 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level]->max);
858 if ((old_buf = spec->concatBuf[spec->d1_level]->buf))
860 memcpy (new_buf, old_buf, org_len);
863 spec->concatBuf[spec->d1_level]->buf = new_buf;
865 assert (spec->concatBuf[spec->d1_level]);
866 memcpy (spec->concatBuf[spec->d1_level]->buf + org_len, ebuf, elen);
867 res->u.data.len += elen;
/* execDataP: thin wrapper that forwards its arguments unchanged to
 * execData. lexNode uses it for input that no rule matched. */
870 static void execDataP (struct lexSpec *spec,
871 const char *ebuf, int elen, int formatted_text)
873 execData (spec, ebuf, elen, formatted_text);
/* tagDataRelease: finish the text data node on top of the current level.
 * If that node is a DATA1N_data / DATA1I_text node, its data pointer is set
 * up (the node's local buffer when the text fits, nmem memory otherwise)
 * and the text collected in the level's concat buffer is copied into it. */
876 static void tagDataRelease (struct lexSpec *spec)
880 if ((res = spec->d1_stack[spec->d1_level]) &&
881 res->which == DATA1N_data &&
882 res->u.data.what == DATA1I_text)
/* the node must not have been released already and must hold some text */
884 assert (!res->u.data.data);
885 assert (res->u.data.len > 0);
886 if (res->u.data.len > DATA1_LOCALDATA)
887 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
889 res->u.data.data = res->lbuf;
890 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level]->buf,
/* variantBegin: open a variant in the record tree.
 * class_str/type_str (with lengths) name the variant class and type, which
 * are looked up with data1_getvartypebyct in the variant set of the root's
 * abstract syntax; value_str is the variant value. Class and type are
 * truncated to DATA1_MAX_SYMBOL-1 characters, the value to
 * DATA1_LOCALDATA-1. If the parent is not yet a variant node an empty
 * variant node (type 0, value 0) is inserted first; then the typed variant
 * node is created and pushed as a new level.
 * Warns and does nothing useful when no record has been started. */
895 static void variantBegin (struct lexSpec *spec,
896 const char *class_str, int class_len,
897 const char *type_str, int type_len,
898 const char *value_str, int value_len)
/* NOTE(review): this indexes d1_stack[d1_level - 1] before the
 * d1_level == 0 check below - confirm d1_level cannot be 0 here */
900 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
901 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
906 if (spec->d1_level == 0)
908 logf (LOG_WARN, "in variant begin. No record type defined");
/* bounded, NUL-terminated copies of class and type */
911 if (class_len >= DATA1_MAX_SYMBOL)
912 class_len = DATA1_MAX_SYMBOL-1;
913 memcpy (tclass, class_str, class_len);
914 tclass[class_len] = '\0';
916 if (type_len >= DATA1_MAX_SYMBOL)
917 type_len = DATA1_MAX_SYMBOL-1;
918 memcpy (ttype, type_str, type_len);
919 ttype[type_len] = '\0';
922 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype,
927 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* first variant below a non-variant parent: insert an untyped variant node */
931 if (parent->which != DATA1N_variant)
933 res = data1_mk_node (spec->dh, spec->m);
934 res->parent = parent;
935 res->which = DATA1N_variant;
936 res->u.variant.type = 0;
937 res->u.variant.value = 0;
938 res->root = parent->root;
940 parent->last_child = res;
941 if (spec->d1_stack[spec->d1_level])
/* pending text at this level is finalised before a sibling is linked */
943 tagDataRelease (spec);
944 spec->d1_stack[spec->d1_level]->next = res;
948 spec->d1_stack[spec->d1_level] = res;
949 spec->d1_stack[++(spec->d1_level)] = NULL;
/* walk up through enclosing variant nodes looking for one of the same type */
951 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
952 if (spec->d1_stack[i]->u.variant.type == tp)
959 logf (LOG_DEBUG, "variant node (%d)", spec->d1_level);
961 parent = spec->d1_stack[spec->d1_level-1];
962 res = data1_mk_node (spec->dh, spec->m);
963 res->parent = parent;
964 res->which = DATA1N_variant;
965 res->root = parent->root;
966 res->u.variant.type = tp;
/* the value is stored in the node's local buffer, truncated if needed */
968 if (value_len >= DATA1_LOCALDATA)
969 value_len =DATA1_LOCALDATA-1;
970 memcpy (res->lbuf, value_str, value_len);
971 res->lbuf[value_len] = '\0';
973 res->u.variant.value = res->lbuf;
975 parent->last_child = res;
976 if (spec->d1_stack[spec->d1_level])
978 tagDataRelease (spec);
979 spec->d1_stack[spec->d1_level]->next = res;
983 spec->d1_stack[spec->d1_level] = res;
984 spec->d1_stack[++(spec->d1_level)] = NULL;
/* tagStrip: trim white space around a tag given as pointer *tag and length
 * *len. The first loop scans backwards over trailing white space, the
 * second forwards over leading white space (the statements that store the
 * adjusted pointer and length are not visible in this listing).
 * NOTE(review): isspace is called with a plain char; a negative value would
 * be undefined behaviour - consider a cast to unsigned char. */
987 static void tagStrip (const char **tag, int *len)
991 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
994 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/* tagBegin: open an element named by the len bytes at tag.
 * The tag is trimmed with tagStrip, a DATA1N_tag node is created under the
 * current parent, the tag name is stored (local buffer when it fits, nmem
 * memory otherwise) and the matching element definition is looked up in
 * the abstract syntax. The node is linked after any previous sibling and
 * pushed as a new level. Warns when no record has been started. */
1000 static void tagBegin (struct lexSpec *spec,
1001 const char *tag, int len)
/* NOTE(review): d1_stack[d1_level - 1] is read before the d1_level == 0
 * check below - confirm d1_level cannot be 0 here */
1003 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
1004 data1_element *elem = NULL;
1005 data1_node *partag = get_parent_tag(spec->dh, parent);
1007 data1_element *e = NULL;
1010 if (spec->d1_level == 0)
1012 logf (LOG_WARN, "in element begin. No record type defined");
1015 tagStrip (&tag, &len);
1017 res = data1_mk_node (spec->dh, spec->m);
1018 res->parent = parent;
1019 res->which = DATA1N_tag;
1020 res->u.tag.get_bytes = -1;
/* len+1 bytes are needed for the terminator, hence >= rather than > */
1022 if (len >= DATA1_LOCALDATA)
1023 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
1025 res->u.tag.tag = res->lbuf;
1027 memcpy (res->u.tag.tag, tag, len);
1028 res->u.tag.tag[len] = '\0';
1031 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
/* below a variant, the element of the enclosing tag is consulted */
1033 if (parent->which == DATA1N_variant)
1036 if (!(e = partag->u.tag.element))
1039 elem = data1_getelementbytagname (spec->dh,
1040 spec->d1_stack[0]->u.root.absyn,
1042 res->u.tag.element = elem;
1043 res->u.tag.node_selected = 0;
1044 res->u.tag.make_variantlist = 0;
1045 res->u.tag.no_data_requested = 0;
1046 res->root = parent->root;
1048 parent->last_child = res;
1049 if (spec->d1_stack[spec->d1_level])
/* pending text at this level is finalised before a sibling is linked */
1051 tagDataRelease (spec);
1052 spec->d1_stack[spec->d1_level]->next = res;
1055 parent->child = res;
/* NOTE(review): no check of d1_level against maxLevel is visible before
 * the push - confirm the stack cannot overflow */
1056 spec->d1_stack[spec->d1_level] = res;
1057 spec->d1_stack[++(spec->d1_level)] = NULL;
/* tagEnd: close elements. While the nesting level is above min_level the
 * pending text of the level is finalised and the level is examined; the
 * tag of the examined node is compared with the given tag (length and
 * bytes). What follows a match, and the handling of a NULL tag, are on
 * lines not visible here. */
1060 static void tagEnd (struct lexSpec *spec, int min_level,
1061 const char *tag, int len)
1063 tagStrip (&tag, &len);
1064 while (spec->d1_level > min_level)
1066 tagDataRelease (spec);
1068 if (spec->d1_level == 0)
1070 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1072 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1074 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1078 logf (LOG_DEBUG, "end tag (%d)", spec->d1_level);
/* tryMatch: search the input from position *pptr for text accepted by dfa.
 * Bytes are fetched with f_win_advance and the DFA is stepped from state 0;
 * the rule number of the most recent accepting state is remembered in
 * last_rule. On a match *mptr receives the start of the match and *pptr
 * the position just past it. Reaching F_WIN_EOF ends the scan. */
1083 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1086 struct DFA_state *state = dfa->states[0];
1089 unsigned char c_prev = 0;
1090 int ptr = *pptr; /* current pointer */
1091 int start_ptr = *pptr; /* first char of match */
1092 int last_ptr = 0; /* last char of match */
1093 int last_rule = 0; /* rule number of current match */
1098 c = f_win_advance (spec, &ptr);
1099 if (ptr == F_WIN_EOF)
1116 *mptr = start_ptr; /* match starts here */
1117 *pptr = last_ptr; /* match end here (+1) */
/* restart the automaton */
1120 state = dfa->states[0];
/* transition whose character range contains c */
1125 else if (c >= t->ch[0] && c <= t->ch[1])
1127 state = dfa->states[t->to];
1132 last_rule = state->rule_no;
1137 last_rule = state->rule_nno;
/* execTok: fetch the next token of an action code string at *src and report
 * it through *tokBuf / *tokLen. Leading spaces and tabs are skipped.
 * Visible token forms:
 *   $N      - reference to matched argument N, resolved to the input text
 *             between arg_start[N] and arg_end[N] via f_win_get
 *   "..."   - quoted string
 *   newline or ';' - statement terminator
 *   anything else  - a word ending at space, tab, newline or ';'
 * The return codes are assigned on lines not visible here; callers in this
 * file compare the result against 3 when looking for option words. */
1149 static int execTok (struct lexSpec *spec, const char **src,
1150 const char **tokBuf, int *tokLen)
1152 const char *s = *src;
1154 while (*s == ' ' || *s == '\t')
1158 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
/* decimal argument index */
1162 while (*s >= '0' && *s <= '9')
1163 n = n*10 + (*s++ -'0');
1164 if (spec->arg_no == 0)
1171 if (n >= spec->arg_no)
1173 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1177 else if (*s == '\"')
1180 while (*s && *s != '\"')
1182 *tokLen = s - *tokBuf;
1187 else if (*s == '\n' || *s == ';')
1195 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1197 *tokLen = s - *tokBuf;
1204 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1206 *tokLen = s - *tokBuf;
/* regxStrz: copy the len bytes at src into the caller-supplied buffer str.
 * Callers use the result as a C string with strcmp, so termination (and,
 * presumably, a bound on len) happens on lines not visible here -
 * TODO confirm against the full source. */
1212 static char *regxStrz (const char *src, int len, char *str)
1216 memcpy (str, src, len);
/* cmd_tcl_begin: Tcl command handler for "begin"; clientData is the lexSpec.
 * Sub-commands selected by argv[1]:
 *   record <absyn>   - look up the abstract syntax, create the root node
 *                      and push it as the first level
 *   element <tag>    - tagBegin
 *   variant <class> <type> <value> - variantBegin
 *   context <name>   - find the named context and push it on the context
 *                      stack; an unknown name is logged */
1222 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1223 int argc, char **argv)
1225 struct lexSpec *spec = (struct lexSpec *) clientData;
1228 if (!strcmp(argv[1], "record") && argc == 3)
1230 char *absynName = argv[2];
1234 logf (LOG_DEBUG, "begin record %s", absynName);
1236 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1237 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1242 res = data1_mk_node (spec->dh, spec->m);
1243 res->which = DATA1N_root;
/* NOTE(review): the root keeps a pointer into argv - confirm the string
 * outlives the node */
1244 res->u.root.type = absynName;
1245 res->u.root.absyn = absyn;
1248 spec->d1_stack[spec->d1_level] = res;
1249 spec->d1_stack[++(spec->d1_level)] = NULL;
1252 else if (!strcmp(argv[1], "element") && argc == 3)
1254 tagBegin (spec, argv[2], strlen(argv[2]));
1256 else if (!strcmp (argv[1], "variant") && argc == 5)
1258 variantBegin (spec, argv[2], strlen(argv[2]),
1259 argv[3], strlen(argv[3]),
1260 argv[4], strlen(argv[4]));
1262 else if (!strcmp (argv[1], "context") && argc == 3)
1264 struct lexContext *lc = spec->context;
1266 logf (LOG_DEBUG, "begin context %s",argv[2]);
/* linear search of the context list by name */
1268 while (lc && strcmp (argv[2], lc->name))
/* NOTE(review): no check against context_stack_size is visible before the
 * push - confirm the stack cannot overflow */
1272 spec->context_stack[++(spec->context_stack_top)] = lc;
1275 logf (LOG_WARN, "unknown context %s", argv[2]);
/* cmd_tcl_end: Tcl command handler for "end"; clientData is the lexSpec.
 * Sub-commands selected by argv[1]:
 *   record   - finalise pending text on every open level and set stop_flag
 *   element [-record] [tag] - tagEnd; if that leaves no open level,
 *                      stop_flag is set
 *   context  - pop the context stack unless it is already at the bottom */
1282 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1283 int argc, char **argv)
1285 struct lexSpec *spec = (struct lexSpec *) clientData;
1289 if (!strcmp (argv[1], "record"))
1291 while (spec->d1_level)
1293 tagDataRelease (spec);
1297 logf (LOG_DEBUG, "end record");
1299 spec->stop_flag = 1;
1301 else if (!strcmp (argv[1], "element"))
1305 if (argc >= 3 && !strcmp(argv[2], "-record"))
1314 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1315 if (spec->d1_level == 0)
1318 logf (LOG_DEBUG, "end element end records");
1320 spec->stop_flag = 1;
1323 else if (!strcmp (argv[1], "context"))
1326 logf (LOG_DEBUG, "end context");
/* the bottom entry of the context stack is never popped */
1328 if (spec->context_stack_top)
1329 (spec->context_stack_top)--;
/* cmd_tcl_data: Tcl command handler for "data"; clientData is the lexSpec.
 * Options "-text" and "-element <name>" are recognised before the data
 * argument. The data is added with execData; with -element it is wrapped
 * in tagBegin / tagEnd for that element (the conditions guarding these
 * calls are not visible in this listing). */
1336 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1337 int argc, char **argv)
1341 const char *element = 0;
1342 struct lexSpec *spec = (struct lexSpec *) clientData;
1346 if (!strcmp("-text", argv[argi]))
1351 else if (!strcmp("-element", argv[argi]))
1355 element = argv[argi++];
1361 tagBegin (spec, element, strlen(element));
1365 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1369 tagEnd (spec, 1, NULL, 0);
/* cmd_tcl_unread: Tcl command handler for "unread"; clientData is the
 * lexSpec. Moves the read position back to the start of matched argument
 * <no>, plus an optional "-offset <n>". An index beyond the last argument
 * is clamped to the last one.
 * NOTE(review): atoi gives no error indication for non-numeric input, and a
 * negative index is not rejected in the visible lines. */
1373 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1374 int argc, char **argv)
1376 struct lexSpec *spec = (struct lexSpec *) clientData;
1383 if (!strcmp("-offset", argv[argi]))
1388 offset = atoi(argv[argi]);
1397 no = atoi(argv[argi]);
1398 if (no >= spec->arg_no)
1399 no = spec->arg_no - 1;
1400 spec->ptr = spec->arg_start[no] + offset;
/* execTcl: run a Tcl action. Every matched argument i is first exported to
 * the interpreter as a variable named by its decimal index, then code->str
 * is evaluated. A Tcl failure is logged with the error line, the result
 * string and the errorInfo variable. */
1404 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1407 for (i = 0; i < spec->arg_no; i++)
1409 char var_name[10], *var_buf;
1412 sprintf (var_name, "%d", i);
1413 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* the window buffer is not NUL-terminated: terminate it in place for
 * Tcl_SetVar and restore the overwritten byte afterwards */
1417 ch = var_buf[var_len];
1418 var_buf[var_len] = '\0';
1419 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1420 var_buf[var_len] = ch;
1423 if (Tcl_Eval (spec->tcl_interp, code->str) != TCL_OK)
1425 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1426 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1427 spec->tcl_interp->errorLine,
1428 spec->tcl_interp->result,
1429 err ? err : "[NO ERRORINFO]");
/* execCode: interpret the built-in (non-Tcl) action language in code->str.
 * Tokens are fetched with execTok and command words are turned into C
 * strings with regxStrz. Commands visible in this listing:
 *   begin record <absyn> | element <tag> | variant <class> <type> <value>
 *         | context <name>
 *   end record | element [-record] [<tag>] | context
 *   data [-text] [-element <name>] <item>...
 *   unread [-offset <n>] <index>
 *   context <name>
 * Unknown commands, bad keywords and stray tokens are logged as warnings. */
1436 static void execCode (struct lexSpec *spec, struct regxCode *code)
1438 const char *s = code->str;
1440 const char *cmd_str;
1442 r = execTok (spec, &s, &cmd_str, &cmd_len);
1449 r = execTok (spec, &s, &cmd_str, &cmd_len);
1452 p = regxStrz (cmd_str, cmd_len, ptmp);
/* ---- begin ... ---- */
1453 if (!strcmp (p, "begin"))
1455 r = execTok (spec, &s, &cmd_str, &cmd_len);
1458 logf (LOG_WARN, "missing keyword after 'begin'");
1461 p = regxStrz (cmd_str, cmd_len, ptmp);
1462 if (!strcmp (p, "record"))
1464 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* a record is only started when none is open */
1467 if (spec->d1_level == 0)
/* static storage: the root node keeps a pointer to this name.
 * NOTE(review): a clamp of cmd_len to the buffer size is not visible in
 * this listing - confirm one exists before the memcpy below */
1469 static char absynName[64];
1474 memcpy (absynName, cmd_str, cmd_len);
1475 absynName[cmd_len] = '\0';
1478 logf (LOG_DEBUG, "begin record %s", absynName);
1480 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1481 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1486 res = data1_mk_node (spec->dh, spec->m);
1487 res->which = DATA1N_root;
1488 res->u.root.type = absynName;
1489 res->u.root.absyn = absyn;
1492 spec->d1_stack[spec->d1_level] = res;
1493 spec->d1_stack[++(spec->d1_level)] = NULL;
1496 r = execTok (spec, &s, &cmd_str, &cmd_len);
1498 else if (!strcmp (p, "element"))
1500 r = execTok (spec, &s, &cmd_str, &cmd_len);
1503 tagBegin (spec, cmd_str, cmd_len);
1504 r = execTok (spec, &s, &cmd_str, &cmd_len);
1506 else if (!strcmp (p, "variant"))
/* the three operands are collected in order: class, type, value */
1509 const char *class_str = NULL;
1511 const char *type_str = NULL;
1513 const char *value_str = NULL;
1514 r = execTok (spec, &s, &cmd_str, &cmd_len);
1517 class_str = cmd_str;
1518 class_len = cmd_len;
1519 r = execTok (spec, &s, &cmd_str, &cmd_len);
1525 r = execTok (spec, &s, &cmd_str, &cmd_len);
1528 value_str = cmd_str;
1529 value_len = cmd_len;
1531 variantBegin (spec, class_str, class_len,
1532 type_str, type_len, value_str, value_len);
1535 r = execTok (spec, &s, &cmd_str, &cmd_len);
1537 else if (!strcmp (p, "context"))
1541 struct lexContext *lc = spec->context;
1542 r = execTok (spec, &s, &cmd_str, &cmd_len);
1543 p = regxStrz (cmd_str, cmd_len, ptmp);
1545 logf (LOG_DEBUG, "begin context %s", p);
/* linear search of the context list by name */
1547 while (lc && strcmp (p, lc->name))
/* push: the named context becomes current.
 * NOTE(review): no check against context_stack_size is visible */
1550 spec->context_stack[++(spec->context_stack_top)] = lc;
1552 logf (LOG_WARN, "unknown context %s", p);
1555 r = execTok (spec, &s, &cmd_str, &cmd_len);
1559 logf (LOG_WARN, "bad keyword '%s' after begin", p);
/* ---- end ... ---- */
1562 else if (!strcmp (p, "end"))
1564 r = execTok (spec, &s, &cmd_str, &cmd_len);
1567 logf (LOG_WARN, "missing keyword after 'end'");
1570 p = regxStrz (cmd_str, cmd_len, ptmp);
1571 if (!strcmp (p, "record"))
/* finalise pending text on every open level */
1573 while (spec->d1_level)
1575 tagDataRelease (spec);
1578 r = execTok (spec, &s, &cmd_str, &cmd_len);
1580 logf (LOG_DEBUG, "end record");
1582 spec->stop_flag = 1;
1584 else if (!strcmp (p, "element"))
/* option words come first; "-record" is the one recognised here */
1587 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1589 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1594 tagEnd (spec, min_level, cmd_str, cmd_len);
1595 r = execTok (spec, &s, &cmd_str, &cmd_len);
1598 tagEnd (spec, min_level, NULL, 0);
/* closing the outermost level ends the record */
1599 if (spec->d1_level == 0)
1602 logf (LOG_DEBUG, "end element end records");
1604 spec->stop_flag = 1;
1608 else if (!strcmp (p, "context"))
1611 logf (LOG_DEBUG, "end context");
/* the bottom entry of the context stack is never popped */
1613 if (spec->context_stack_top)
1614 (spec->context_stack_top)--;
1615 r = execTok (spec, &s, &cmd_str, &cmd_len);
1618 logf (LOG_WARN, "bad keyword '%s' after end", p);
/* ---- data ... ---- */
1620 else if (!strcmp (p, "data"))
1624 const char *element_str = NULL;
1626 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1628 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1630 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1632 r = execTok (spec, &s, &element_str, &element_len);
1637 logf (LOG_WARN, "bad data option: %.*s",
1642 logf (LOG_WARN, "missing data item after data");
1646 tagBegin (spec, element_str, element_len);
1649 execData (spec, cmd_str, cmd_len,textFlag);
1650 r = execTok (spec, &s, &cmd_str, &cmd_len);
1653 tagEnd (spec, 1, NULL, 0);
/* ---- unread ... ---- */
1655 else if (!strcmp (p, "unread"))
1658 r = execTok (spec, &s, &cmd_str, &cmd_len);
1659 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1661 r = execTok (spec, &s, &cmd_str, &cmd_len);
1664 logf (LOG_WARN, "missing number after -offset");
1667 p = regxStrz (cmd_str, cmd_len, ptmp);
1669 r = execTok (spec, &s, &cmd_str, &cmd_len);
1675 logf (LOG_WARN, "missing index after unread command");
/* the index must be a single decimal digit */
1678 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1680 logf (LOG_WARN, "bad index after unread command");
1685 no = *cmd_str - '0';
/* an index beyond the last argument is clamped to the last one */
1686 if (no >= spec->arg_no)
1687 no = spec->arg_no - 1;
1688 spec->ptr = spec->arg_start[no] + offset;
1690 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* ---- context <name>: switch context without pushing ---- */
1692 else if (!strcmp (p, "context"))
1696 struct lexContext *lc = spec->context;
1697 r = execTok (spec, &s, &cmd_str, &cmd_len);
1698 p = regxStrz (cmd_str, cmd_len, ptmp);
1700 while (lc && strcmp (p, lc->name))
/* replaces the current top entry, unlike "begin context" which pushes */
1703 spec->context_stack[spec->context_stack_top] = lc;
1705 logf (LOG_WARN, "unknown context %s", p);
1708 r = execTok (spec, &s, &cmd_str, &cmd_len);
1712 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1713 r = execTok (spec, &s, &cmd_str, &cmd_len);
1718 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1720 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* execAction: run the action list ap for a match that starts at start_ptr;
 * *pptr is the current input position and is advanced by pattern actions.
 * Argument 0 starts at start_ptr. Each pattern action is matched with
 * tryMatch and records the bounds of further arguments in the local
 * arg_start / arg_end arrays, which are published through spec so that
 * code actions can refer to them. Code actions run through execTcl or
 * execCode. Processing stops once spec->stop_flag is set. */
1727 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1728 int start_ptr, int *pptr)
1737 arg_start[0] = start_ptr;
1739 spec->arg_start = arg_start;
1740 spec->arg_end = arg_end;
/* BODY pattern: the text before the match is one argument and the match
 * itself the next; if nothing matches, the body runs to end of input */
1747 if (ap->u.pattern.body)
1749 arg_start[arg_no] = *pptr;
1750 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1752 arg_end[arg_no] = F_WIN_EOF;
1754 arg_start[arg_no] = F_WIN_EOF;
1755 arg_end[arg_no] = F_WIN_EOF;
1760 arg_end[arg_no] = sptr;
1762 arg_start[arg_no] = sptr;
1763 arg_end[arg_no] = *pptr;
/* plain pattern: the match is compared against the current position */
1768 arg_start[arg_no] = *pptr;
1769 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1771 if (sptr != arg_start[arg_no])
1773 arg_end[arg_no] = *pptr;
1778 spec->arg_no = arg_no;
1781 if (spec->tcl_interp)
1782 execTcl(spec, ap->u.code);
1784 execCode (spec, ap->u.code);
1786 execCode (spec, ap->u.code);
1789 if (spec->stop_flag)
1793 arg_start[arg_no] = *pptr;
1794 arg_end[arg_no] = F_WIN_EOF;
/* execRule: run the actions of rule number ruleNo of the given context.
 * The rule is found through the context's fastRule table and its action
 * list is handed to execAction; the result of execAction is returned. */
1803 static int execRule (struct lexSpec *spec, struct lexContext *context,
1804 int ruleNo, int start_ptr, int *pptr)
1807 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1809 return execAction (spec, context->fastRule[ruleNo]->actionList,
/* lexNode: main scanning loop. Input is read from *ptr with f_win_advance
 * and fed through the DFA of the context on top of the context stack.
 * When a rule has matched, any unmatched text before the match is emitted
 * as data with execDataP and the rule is executed with execRule. At end of
 * input the remaining unmatched text is emitted as data. After a rule has
 * run, the current context is re-read from the stack and scanning restarts
 * in state 0. */
1813 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1815 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1816 struct DFA_state *state = context->dfa->states[0];
1819 unsigned char c_prev = '\n';
1821 int last_rule = 0; /* rule number of current match */
1822 int last_ptr = *ptr; /* last char of match */
1823 int start_ptr = *ptr; /* first char of match */
1824 int skip_ptr = *ptr; /* first char of run */
1828 c = f_win_advance (spec, ptr);
1829 if (*ptr == F_WIN_EOF)
1831 /* end of file met */
1834 /* there was a match */
1835 if (skip_ptr < start_ptr)
1837 /* deal with chars that didn't match */
1840 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1841 execDataP (spec, buf, size, 0);
1843 /* restore pointer */
1846 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1848 /* restore skip pointer */
1852 else if (skip_ptr < *ptr)
1854 /* deal with chars that didn't match */
1857 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1858 execDataP (spec, buf, size, 0);
1860 if (*ptr == F_WIN_EOF)
1867 { /* no transition for character c ... */
1870 if (skip_ptr < start_ptr)
1872 /* deal with chars that didn't match */
1875 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1876 execDataP (spec, buf, size, 0);
1878 /* restore pointer */
1880 if (!execRule (spec, context, last_rule, start_ptr, ptr))
/* the optional f_win_ef callback is called with the current position */
1882 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1885 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1887 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* the rule may have changed the context stack: reload the context */
1891 context = spec->context_stack[spec->context_stack_top];
1894 last_ptr = start_ptr = *ptr;
1898 c_prev = f_win_advance (spec, &start_ptr);
1903 c_prev = f_win_advance (spec, &start_ptr);
1906 state = context->dfa->states[0];
1909 else if (c >= t->ch[0] && c <= t->ch[1])
1910 { /* transition ... */
1911 state = context->dfa->states[t->to];
1916 last_rule = state->rule_no;
1919 else if (state->rule_nno)
1921 last_rule = state->rule_nno;
/*
 * lexRoot: top-level entry for parsing one record with `spec`.
 * Resets the stop flag and the context stack, looks up the context called
 * context_name among spec's contexts (a warning is logged when it cannot be
 * found), installs it at the current stack top, runs its init and begin
 * action lists, scans the input with lexNode, releases tag data while
 * d1_level is non-zero, runs the end action list and returns
 * spec->d1_stack[0].
 *   spec         - lexer state.
 *   offset       - NOTE(review): its use is not visible in this excerpt;
 *                  presumably the initial value of the local `ptr`.
 *   context_name - name of the context to start in.
 */
1933 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1934 const char *context_name)
/* head of the context list that is searched below */
1936 struct lexContext *lt = spec->context;
1939 spec->stop_flag = 0;
1941 spec->context_stack_top = 0;
/* compare each context's name with the requested one */
1944 if (!strcmp (lt->name, context_name))
1950 logf (LOG_WARN, "cannot find context %s", context_name);
/* install the selected context and clear the current data1 stack slot */
1953 spec->context_stack[spec->context_stack_top] = lt;
1954 spec->d1_stack[spec->d1_level] = NULL;
/* init and begin actions (the conditions guarding them are not visible) */
1959 execAction (spec, lt->initActionList, ptr, &ptr);
1962 execAction (spec, lt->beginActionList, ptr, &ptr);
1963 lexNode (spec, &ptr);
/* call tagDataRelease until d1_level reaches zero */
1964 while (spec->d1_level)
1966 tagDataRelease (spec);
1969 execAction (spec, lt->endActionList, ptr, &ptr);
1970 return spec->d1_stack[0];
/*
 * grs_destroy: tear down the filter's client data.
 *   clientData - pointer to a struct lexSpecs.
 * The contained lexSpec is released with lexSpecDestroy; any further
 * cleanup is not visible in this excerpt.
 */
1973 void grs_destroy(void *clientData)
1975 struct lexSpecs *specs = (struct lexSpecs *) clientData;
1978 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocate the filter's client data (a struct lexSpecs) with
 * xmalloc.  NOTE(review): member initialisation and the return statement
 * are not visible in this excerpt.
 */
1983 void *grs_init(void)
1985 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/*
 * grs_read_regx: read one record through the regx filter.
 *   p - read request; supplies the client data, filter type name, data1
 *       handle, file callbacks/handle, memory handle and start offset.
 * The lexSpec is kept in the client data and rebuilt only when none exists
 * yet or its name differs from p->type.  Parsing starts in context "main"
 * at p->offset and the result of lexRoot is returned.
 */
1990 data1_node *grs_read_regx (struct grs_read_info *p)
1993 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
1994 struct lexSpec **curLexSpec = &specs->spec;
1997 logf (LOG_DEBUG, "grs_read_regx");
/* (re)build the spec when there is none or it belongs to another type */
1999 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2002 lexSpecDestroy (curLexSpec);
2003 *curLexSpec = lexSpecCreate (p->type, p->dh);
2004 res = readFileSpec (*curLexSpec);
/* presumably executed when readFileSpec reports failure — the test on
   `res` is not visible in this excerpt */
2007 lexSpecDestroy (curLexSpec);
2011 (*curLexSpec)->dh = p->dh;
/* reset the input window and bind it to this request's file callbacks */
2014 (*curLexSpec)->f_win_start = 0;
2015 (*curLexSpec)->f_win_end = 0;
2016 (*curLexSpec)->f_win_rf = p->readf;
2017 (*curLexSpec)->f_win_sf = p->seekf;
2018 (*curLexSpec)->f_win_fh = p->fh;
2019 (*curLexSpec)->f_win_ef = p->endf;
/* fixed window size (units not visible here — presumably bytes) */
2020 (*curLexSpec)->f_win_size = 500000;
2022 (*curLexSpec)->m = p->mem;
2023 return lexRoot (*curLexSpec, p->offset, "main");
/* Record-type descriptor for the regx filter.  NOTE(review): the
   initialiser members are not visible in this excerpt. */
2026 static struct recTypeGrs regx_type = {
/* Exported handle: the address of regx_type.  The address-of operator had
   been corrupted into a registered-trademark sign ("&reg" decoded as an
   HTML entity); restored to match the parallel recTypeGrs_tcl definition. */
2033 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: read one record through the Tcl-enabled filter.
 *   p - read request; supplies the client data, filter type name, data1
 *       handle, file callbacks/handle, memory handle and start offset.
 * Works like the regx reader, but when the lexSpec is (re)built a Tcl
 * interpreter is created for it and the commands "begin", "end", "data" and
 * "unread" are registered with the spec as their client data.  Parsing
 * starts in context "main" at p->offset and the result of lexRoot is
 * returned.
 */
2036 data1_node *grs_read_tcl (struct grs_read_info *p)
2039 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2040 struct lexSpec **curLexSpec = &specs->spec;
2043 logf (LOG_DEBUG, "grs_read_tcl");
/* (re)build the spec when there is none or it belongs to another type */
2045 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2047 Tcl_Interp *tcl_interp;
2049 lexSpecDestroy (curLexSpec);
2050 *curLexSpec = lexSpecCreate (p->type, p->dh);
/* the new interpreter is stored on the spec and kept in a local as well */
2051 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
/* register the filter's Tcl commands; no delete callback is supplied */
2052 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2053 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2054 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
/* the remaining arguments of this call are not visible in this excerpt */
2055 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2057 res = readFileSpec (*curLexSpec);
/* presumably executed when readFileSpec reports failure — the test on
   `res` is not visible in this excerpt */
2060 lexSpecDestroy (curLexSpec);
2064 (*curLexSpec)->dh = p->dh;
/* reset the input window and bind it to this request's file callbacks */
2067 (*curLexSpec)->f_win_start = 0;
2068 (*curLexSpec)->f_win_end = 0;
2069 (*curLexSpec)->f_win_rf = p->readf;
2070 (*curLexSpec)->f_win_sf = p->seekf;
2071 (*curLexSpec)->f_win_fh = p->fh;
2072 (*curLexSpec)->f_win_ef = p->endf;
/* fixed window size (units not visible here — presumably bytes) */
2073 (*curLexSpec)->f_win_size = 500000;
2075 (*curLexSpec)->m = p->mem;
2076 return lexRoot (*curLexSpec, p->offset, "main");
/* Record-type descriptor for the Tcl filter.  NOTE(review): the
   initialiser members are not visible in this excerpt. */
2079 static struct recTypeGrs tcl_type = {
/* Exported handle: the address of tcl_type. */
2086 RecTypeGrs recTypeGrs_tcl = &tcl_type;