2 * Copyright (C) 1994-1999, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.30 1999-07-14 10:55:28 adam
10 * Revision 1.29 1999/07/12 07:27:54 adam
11 * Improved speed of Tcl processing. Fixed one memory leak.
13 * Revision 1.28 1999/07/06 12:26:04 adam
14 * Fixed filters so that MS-DOS CR is ignored.
16 * Revision 1.27 1999/06/28 13:25:40 quinn
17 * Improved diagnostics for Tcl
19 * Revision 1.26 1999/05/26 07:49:14 adam
22 * Revision 1.25 1999/05/25 12:33:32 adam
23 * Fixed bug in Tcl filter.
25 * Revision 1.24 1999/05/21 11:08:46 adam
26 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
27 * script so that it reads uninstalled Tcl source.
29 * Revision 1.23 1999/05/20 12:57:18 adam
30 * Implemented TCL filter. Updated recctrl system.
32 * Revision 1.22 1998/11/03 16:07:13 adam
35 * Revision 1.21 1998/11/03 15:43:39 adam
36 * Fixed bug introduced by previous commit.
38 * Revision 1.20 1998/11/03 14:51:28 adam
39 * Changed code so that it creates as few data1 nodes as possible.
41 * Revision 1.19 1998/11/03 10:22:39 adam
42 * Fixed memory leak that could occur when large data1 nodes were
43 * concatenated. Data-type data1_nodes may have multiple nodes.
45 * Revision 1.18 1998/10/15 13:11:47 adam
46 * Added support for option -record for "end element". When specified
47 * end element will mark end-of-record when at outer-level.
49 * Revision 1.17 1998/07/01 10:13:51 adam
52 * Revision 1.16 1998/06/30 15:15:09 adam
53 * Tags are trimmed: white space removed before and after the tag.
55 * Revision 1.15 1998/06/30 12:55:45 adam
58 * Revision 1.14 1998/03/05 08:41:00 adam
59 * Implemented rule contexts.
61 * Revision 1.13 1997/12/12 06:33:58 adam
62 * Fixed bug that showed up when multiple filters were used.
63 * Made one routine thread-safe.
65 * Revision 1.12 1997/11/18 10:03:24 adam
66 * Member num_children removed from data1_node.
68 * Revision 1.11 1997/11/06 11:41:01 adam
69 * Implemented "begin variant" for the sgml.regx filter.
71 * Revision 1.10 1997/10/31 12:36:12 adam
72 * Minor change that avoids compiler warning.
74 * Revision 1.9 1997/09/29 09:02:49 adam
75 * Fixed small bug (introduced by previous commit).
77 * Revision 1.8 1997/09/17 12:19:22 adam
78 * Zebra version corresponds to YAZ version 1.4.
79 * Changed Zebra server so that it doesn't depend on global common_resource.
81 * Revision 1.7 1997/07/15 16:33:07 adam
82 * Check for zero length in execData.
84 * Revision 1.6 1997/02/24 10:41:51 adam
85 * Cleanup of code and commented out the "end element-end-record" code.
87 * Revision 1.5 1997/02/19 16:22:33 adam
88 * Fixed "end element" to terminate record in outer-most level.
90 * Revision 1.4 1997/02/12 20:42:58 adam
91 * Changed some log messages.
93 * Revision 1.3 1996/11/08 14:05:33 adam
94 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
96 * Revision 1.2 1996/10/29 14:02:09 adam
97 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
98 * data1_get_tabpath is used.
100 * Revision 1.1 1996/10/11 10:57:30 adam
101 * New module recctrl. Used to manage records (extract/retrieval).
103 * Revision 1.24 1996/06/17 14:25:31 adam
104 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
106 * Revision 1.23 1996/06/04 10:19:00 adam
107 * Minor changes - removed include of ctype.h.
109 * Revision 1.22 1996/06/03 15:23:13 adam
110 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
112 * Revision 1.21 1996/05/14 16:58:38 adam
115 * Revision 1.20 1996/05/01 13:46:36 adam
116 * First work on multiple records in one file.
117 * New option, -offset, to the "unread" command in the filter module.
119 * Revision 1.19 1996/02/12 16:18:20 adam
120 * Yet another bug fix in implementation of unread command.
122 * Revision 1.18 1996/02/12 16:07:54 adam
123 * Bug fix in new unread command.
125 * Revision 1.17 1996/02/12 15:56:11 adam
126 * New code command: unread.
128 * Revision 1.16 1996/01/17 14:57:51 adam
129 * Prototype changed for reader functions in extract/retrieve. File
130 * is identified by 'void *' instead of 'int'.
132 * Revision 1.15 1996/01/08 19:15:47 adam
133 * New input filter that works!
135 * Revision 1.14 1996/01/08 09:10:38 adam
136 * Yet another complete rework on this module.
138 * Revision 1.13 1995/12/15 17:21:50 adam
139 * This version is able to set data.formatted_text in data1-nodes.
141 * Revision 1.12 1995/12/15 16:20:10 adam
142 * The filter files (*.flt) are read from the path given by data1_tabpath.
144 * Revision 1.11 1995/12/15 12:35:16 adam
147 * Revision 1.10 1995/12/15 10:35:36 adam
150 * Revision 1.9 1995/12/14 16:38:48 adam
151 * Completely new attempt to make regular expression parsing.
153 * Revision 1.8 1995/12/13 17:16:59 adam
156 * Revision 1.7 1995/12/13 16:51:58 adam
157 * Modified to set last_child in data1_nodes.
158 * Uses destroy handler to free up data text nodes.
160 * Revision 1.6 1995/12/13 13:45:37 quinn
161 * Changed data1 to use nmem.
163 * Revision 1.5 1995/12/11 09:12:52 adam
164 * The rec_get function returns NULL if record doesn't exist - will
165 * happen in the server if the result set records have been deleted since
166 * the creation of the set (i.e. the search).
167 * The server saves a result temporarily if it is 'volatile', i.e. the
168 * set is register dependent.
170 * Revision 1.4 1995/12/05 16:57:40 adam
171 * More work on regular patterns.
173 * Revision 1.3 1995/12/05 09:37:09 adam
174 * One malloc was renamed to xmalloc.
176 * Revision 1.2 1995/12/04 17:59:24 adam
177 * More work on regular expression conversion.
179 * Revision 1.1 1995/12/04 14:25:30 adam
180 * Started work on regular expression parsed input to structured records.
189 #include <zebrautl.h>
/* Position sentinel: f_win_advance, tryMatch and lexNode compare positions
   against this value to detect end of input. */
199 #define F_WIN_EOF 2000000000
/* Token codes. readOneSpec compares the result of readParseToken against
   REGX_CONTEXT; the codes between 1 and 6 are not visible in this excerpt. */
203 #define REGX_PATTERN 1
208 #define REGX_CONTEXT 6
/* One entry of an action list; entries are chained through next. */
218 struct lexRuleAction {
222 struct DFA *dfa; /* REGX_PATTERN */
225 struct regxCode *code; /* REGX_CODE */
227 struct lexRuleAction *next;
/* The members below belong to structs whose opening lines are not visible in
   this excerpt (rule info, rule, context, concat buffer and spec records). */
232 struct lexRuleAction *actionList;
236 struct lexRuleInfo info;
237 struct lexRule *next;
243 struct lexRule *rules;
/* fastRule is filled by readFileSpec so that it can be indexed by rule number. */
244 struct lexRuleInfo **fastRule;
248 struct lexRuleAction *beginActionList;
249 struct lexRuleAction *endActionList;
250 struct lexRuleAction *initActionList;
251 struct lexContext *next;
254 struct lexConcatBuf {
/* Linked list of known contexts (new contexts are linked in by readOneSpec). */
261 struct lexContext *context;
/* Stack of active contexts; context_stack_top indexes the current one. */
263 struct lexContext **context_stack;
264 int context_stack_size;
265 int context_stack_top;
/* Tested for non-NULL before the Tcl-specific paths are taken. */
271 Tcl_Interp *tcl_interp;
/* Input window state and I/O callbacks used by f_win_get. */
274 void (*f_win_ef)(void *, off_t);
276 int f_win_start; /* first byte of buffer is this file offset */
277 int f_win_end; /* last byte of buffer is this offset - 1 */
278 int f_win_size; /* size of buffer */
279 char *f_win_buf; /* buffer itself */
280 int (*f_win_rf)(void *, char *, size_t);
281 off_t (*f_win_sf)(void *, off_t);
/* Per-level text accumulation buffers (see execData / tagDataRelease). */
283 struct lexConcatBuf *concatBuf;
/* Stack of data1 nodes, indexed by nesting level. */
285 data1_node **d1_stack;
296 struct lexSpec *spec;
/*
 * f_win_get: return a pointer into spec->f_win_buf holding the input bytes
 * that start at start_pos, and store in *size how many bytes are delivered
 * (never more than end_pos - start_pos).
 */
299 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
/* off is the offset of start_pos inside the current window.
   NOTE(review): an off_t difference is stored in an int here - confirm the
   value range is acceptable. */
302 int i, r, off = start_pos - spec->f_win_start;
/* Requested range lies completely inside the window: no I/O needed. */
304 if (off >= 0 && end_pos <= spec->f_win_end)
306 *size = end_pos - start_pos;
307 return spec->f_win_buf + off;
/* start_pos lies outside the window: seek there and refill from scratch. */
309 if (off < 0 || start_pos >= spec->f_win_end)
311 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
312 spec->f_win_start = start_pos;
/* The buffer is allocated on first use. */
314 if (!spec->f_win_buf)
315 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
316 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
318 spec->f_win_end = spec->f_win_start + *size;
320 if (*size > end_pos - start_pos)
321 *size = end_pos - start_pos;
322 return spec->f_win_buf;
/* start_pos is inside the window but the range extends past it: move the
   bytes from start_pos onward to the front of the buffer, then read more. */
324 for (i = 0; i<spec->f_win_end - start_pos; i++)
325 spec->f_win_buf[i] = spec->f_win_buf[i + off];
326 r = (*spec->f_win_rf)(spec->f_win_fh,
328 spec->f_win_size - i);
329 spec->f_win_start = start_pos;
330 spec->f_win_end += r;
332 if (*size > end_pos - start_pos)
333 *size = end_pos - start_pos;
334 return spec->f_win_buf;
/*
 * f_win_advance: read the input byte at *pos.
 * When *pos lies inside the current window the byte is returned directly and
 * *pos is incremented. Otherwise *pos is compared with F_WIN_EOF and a
 * one-byte range is requested through f_win_get; what is returned in those
 * cases is not visible in this excerpt.
 */
337 static int f_win_advance (struct lexSpec *spec, int *pos)
342 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
343 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
344 if (*pos == F_WIN_EOF)
346 buf = f_win_get (spec, *pos, *pos+1, &size);
/*
 * regxCodeDel: dispose of the regxCode referenced by *pp.
 * The only step visible in this excerpt drops the reference held on the
 * associated Tcl object.
 */
356 static void regxCodeDel (struct regxCode **pp)
358 struct regxCode *p = *pp;
363 Tcl_DecrRefCount (p->tcl_obj);
/*
 * regxCodeMk: allocate a regxCode for the len bytes of code text at buf.
 * A private copy of the text is stored in p->str (len+1 bytes are allocated)
 * and a Tcl string object is created from the same bytes with its reference
 * count incremented.
 */
371 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
375 p = (struct regxCode *) xmalloc (sizeof(*p));
376 p->str = (char *) xmalloc (len+1);
377 memcpy (p->str, buf, len);
380 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
382 Tcl_IncrRefCount (p->tcl_obj);
/*
 * lexSpecDFA: set up a DFA for parsing filter patterns.
 * The character-map entries for space and tab are deleted and an entry for
 * '/' with value 0 is added; creation and return of the DFA itself are not
 * visible in this excerpt.
 */
387 static struct DFA *lexSpecDFA (void)
392 dfa_parse_cmap_del (dfa, ' ');
393 dfa_parse_cmap_del (dfa, '\t');
394 dfa_parse_cmap_add (dfa, '/', 0);
/*
 * actionListDel: walk the action list starting at *rap and release what each
 * entry owns: the DFA of a pattern entry, the regxCode of a code entry.
 * ra1 holds the successor so that iteration can continue after an entry has
 * been handled.
 */
398 static void actionListDel (struct lexRuleAction **rap)
400 struct lexRuleAction *ra1, *ra;
402 for (ra = *rap; ra; ra = ra1)
408 dfa_delete (&ra->u.pattern.dfa);
411 regxCodeDel (&ra->u.code);
/*
 * lexContextCreate: allocate a new lexing context.
 * The name is duplicated with xstrdup, a fresh DFA is obtained from
 * lexSpecDFA and the begin, end and init action lists start out empty.
 */
419 static struct lexContext *lexContextCreate (const char *name)
421 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
423 p->name = xstrdup (name);
426 p->dfa = lexSpecDFA ();
429 p->beginActionList = NULL;
430 p->endActionList = NULL;
431 p->initActionList = NULL;
/*
 * lexContextDestroy: release a context created by lexContextCreate.
 * Deletes the context DFA, the action list of every rule and the three
 * context-level action lists (begin, end, init).
 */
436 static void lexContextDestroy (struct lexContext *p)
438 struct lexRule *rp, *rp1;
440 dfa_delete (&p->dfa);
/* rp1 carries the next rule across the handling of the current one. */
442 for (rp = p->rules; rp; rp = rp1)
445 actionListDel (&rp->info.actionList);
448 actionListDel (&p->beginActionList);
449 actionListDel (&p->endActionList);
450 actionListDel (&p->initActionList);
/*
 * lexSpecCreate: allocate a filter specification called name.
 * Visible initialisation: a private copy of the name, a context stack with
 * room for 100 entries, maxLevel empty concat buffers and a data1 node stack
 * with maxLevel entries. The assignment of maxLevel is not visible in this
 * excerpt.
 */
455 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
460 p = (struct lexSpec *) xmalloc (sizeof(*p));
461 p->name = (char *) xmalloc (strlen(name)+1);
462 strcpy (p->name, name);
469 p->context_stack_size = 100;
470 p->context_stack = (struct lexContext **)
471 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
475 p->concatBuf = (struct lexConcatBuf *)
476 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
/* Every concat buffer starts without storage; execData allocates on demand. */
477 for (i = 0; i < p->maxLevel; i++)
479 p->concatBuf[i].max = 0;
480 p->concatBuf[i].buf = 0;
482 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/*
 * lexSpecDestroy: release a specification created by lexSpecCreate.
 * Visible steps: free every per-level concat buffer and the array holding
 * them, destroy each context in the list, delete the Tcl interpreter, and
 * free the input window buffer and the context stack.
 */
487 static void lexSpecDestroy (struct lexSpec **pp)
490 struct lexContext *lt;
498 for (i = 0; i < p->maxLevel; i++)
499 xfree (p->concatBuf[i].buf);
500 xfree (p->concatBuf);
/* The successor is saved before the context is destroyed. */
505 struct lexContext *lt_next = lt->next;
506 lexContextDestroy (lt);
511 Tcl_DeleteInterp (p->tcl_interp);
514 xfree (p->f_win_buf);
515 xfree (p->context_stack);
/*
 * readParseToken: read the next token of a filter specification from *cpp.
 * Leading blanks, tabs, newlines and carriage returns are skipped. A command
 * word is collected into cmd with upper-case letters folded to lower case,
 * and is then compared with "begin", "end", "body", "context" and "init".
 * Warnings are logged for an unexpected character and for an unknown
 * command. The returned token codes are not visible in this excerpt.
 */
521 static int readParseToken (const char **cpp, int *len)
523 const char *cp = *cpp;
527 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
556 if (*cp >= 'a' && *cp <= 'z')
558 else if (*cp >= 'A' && *cp <= 'Z')
559 cmd[i] = *cp + 'a' - 'A';
/* Keeps the write index inside cmd. */
562 if (i < (int) sizeof(cmd)-2)
569 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
/* Skip the remainder of the offending word. */
571 while (*cp && *cp != ' ' && *cp != '\t' &&
572 *cp != '\n' && *cp != '\r')
578 if (!strcmp (cmd, "begin"))
580 else if (!strcmp (cmd, "end"))
582 else if (!strcmp (cmd, "body"))
584 else if (!strcmp (cmd, "context"))
586 else if (!strcmp (cmd, "init"))
590 logf (LOG_WARN, "bad command %s", cmd);
/*
 * actionListMk: parse the action text s and build an action list at *ap.
 * Tokens are fetched with readParseToken until it returns 0. Code tokens get
 * a regxCode built from the token text; pattern tokens get their own DFA,
 * which is parsed from s and then compiled with dfa_mkstate. BEGIN and INIT
 * are rejected with a warning.
 */
596 static int actionListMk (struct lexSpec *spec, const char *s,
597 struct lexRuleAction **ap)
603 while ((tok = readParseToken (&s, &len)))
611 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
613 regxCodeMk (&(*ap)->u.code, s, len);
617 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
619 (*ap)->u.pattern.body = bodyMark;
621 (*ap)->u.pattern.dfa = lexSpecDFA ();
623 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
/* NOTE(review): the precision argument of %.*s must be an int, but s-s0 is a
   pointer difference - confirm whether a cast is needed. */
628 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
631 dfa_mkstate ((*ap)->u.pattern.dfa);
635 logf (LOG_WARN, "cannot use BEGIN here");
638 logf (LOG_WARN, "cannot use INIT here");
641 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/*
 * readOneSpec: process one logical line s of a filter specification.
 * A CONTEXT line creates a new context and links it in front of
 * spec->context. When needed a context called "main" is created. The begin,
 * end and init action lists of the current context are each replaced (old
 * list deleted, new one built from s). A pattern line is parsed into the
 * context DFA and recorded as a new rule whose action list is built from the
 * text after the pattern.
 */
651 int readOneSpec (struct lexSpec *spec, const char *s)
655 struct lexContext *lc;
657 tok = readParseToken (&s, &len);
658 if (tok == REGX_CONTEXT)
660 char context_name[32];
661 tok = readParseToken (&s, &len);
662 if (tok != REGX_CODE)
664 logf (LOG_WARN, "missing name after CONTEXT keyword");
/* NOTE(review): len must be below sizeof(context_name) here; a bound check
   is presumably among the lines missing from this excerpt - confirm. */
669 memcpy (context_name, s, len);
670 context_name[len] = '\0';
671 lc = lexContextCreate (context_name);
672 lc->next = spec->context;
677 spec->context = lexContextCreate ("main");
682 actionListDel (&spec->context->beginActionList);
683 actionListMk (spec, s, &spec->context->beginActionList);
686 actionListDel (&spec->context->endActionList);
687 actionListMk (spec, s, &spec->context->endActionList);
690 actionListDel (&spec->context->initActionList);
691 actionListMk (spec, s, &spec->context->initActionList);
695 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
697 r = dfa_parse (spec->context->dfa, &s);
700 logf (LOG_WARN, "regular expression error. r=%d", r);
705 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* New rules get the next rule number and are pushed on the front of the list. */
709 rp = (struct lexRule *) xmalloc (sizeof(*rp));
710 rp->info.no = spec->context->ruleNo++;
711 rp->next = spec->context->rules;
712 spec->context->rules = rp;
713 actionListMk (spec, s, &rp->info.actionList);
/*
 * readFileSpec: locate and read the filter file belonging to spec->name.
 * With a Tcl interpreter present "<name>.tflt" is opened through the data1
 * tab path; "<name>.flt" is opened the same way (the exact fallback
 * condition is not visible in this excerpt). Logical lines are collected in
 * a WRBUF and passed to readOneSpec. Afterwards every context gets a
 * fastRule table indexed by rule number and its DFA is compiled.
 */
718 int readFileSpec (struct lexSpec *spec)
720 struct lexContext *lc;
721 int c, i, errors = 0;
727 if (spec->tcl_interp)
/* NOTE(review): fname's declaration is not visible; confirm it is large
   enough for spec->name plus the suffix. */
729 sprintf (fname, "%s.tflt", spec->name);
730 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
735 sprintf (fname, "%s.flt", spec->name);
736 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
740 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
743 logf (LOG_LOG, "reading regx filter %s", fname);
745 if (spec->tcl_interp)
746 logf (LOG_LOG, "Tcl enabled");
748 lineBuf = wrbuf_alloc();
753 wrbuf_rewind (lineBuf);
/* Lines starting with '#', a newline, blank, tab or CR are consumed up to
   the end of the line. */
754 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
756 while (c != '\n' && c != EOF)
769 wrbuf_putc(lineBuf, c);
/* Tests whether the character is something other than a blank or tab. */
777 if (c != ' ' && c != '\t')
/* Terminate the collected text before handing it to the parser. */
782 wrbuf_putc(lineBuf, '\0');
783 readOneSpec (spec, wrbuf_buf(lineBuf));
784 spec->lineNo += addLine;
788 wrbuf_free(lineBuf, 1);
793 debug_dfa_followpos = 1;
796 for (lc = spec->context; lc; lc = lc->next)
/* Build the rule-number -> rule-info lookup table for this context. */
799 lc->fastRule = (struct lexRuleInfo **)
800 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
801 for (i = 0; i < lc->ruleNo; i++)
802 lc->fastRule[i] = NULL;
803 for (rp = lc->rules; rp; rp = rp->next)
804 lc->fastRule[rp->info.no] = &rp->info;
805 dfa_mkstate (lc->dfa);
/* File-scope lexSpec pointer, initially NULL; its users are not visible in
   this excerpt. */
814 static struct lexSpec *curLexSpec = NULL;
/*
 * execData: add elen bytes of text at ebuf to the data node at the current
 * nesting level, creating that node first when there is none to extend.
 * The bytes are appended to the per-level concat buffer and the node length
 * grows by elen.
 */
817 static void execData (struct lexSpec *spec,
818 const char *ebuf, int elen, int formatted_text)
820 struct data1_node *res, *parent;
823 if (elen == 0) /* shouldn't happen, but it does! */
/* Debug output: long data is shown as its first and last 15 bytes. */
827 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
828 ebuf, 15, ebuf + elen-15);
830 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
832 logf (LOG_DEBUG, "data (%d bytes)", elen);
835 if (spec->d1_level <= 1)
838 parent = spec->d1_stack[spec->d1_level -1];
/* An existing data node at this level is extended; org_len is the amount of
   text it already holds. */
841 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
842 org_len = res->u.data.len;
847 res = data1_mk_node (spec->dh, spec->m);
848 res->parent = parent;
849 res->which = DATA1N_data;
850 res->u.data.what = DATA1I_text;
852 res->u.data.formatted_text = formatted_text;
/* The two ways of setting u.data.data below are presumably alternatives
   selected by preprocessor conditionals not visible in this excerpt. */
854 if (elen > DATA1_LOCALDATA)
855 res->u.data.data = nmem_malloc (spec->m, elen);
857 res->u.data.data = res->lbuf;
858 memcpy (res->u.data.data, ebuf, elen);
860 res->u.data.data = 0;
862 res->root = parent->root;
/* Link the new node behind its previous sibling, if any. */
864 parent->last_child = res;
865 if (spec->d1_stack[spec->d1_level])
866 spec->d1_stack[spec->d1_level]->next = res;
869 spec->d1_stack[spec->d1_level] = res;
/* Grow the concat buffer when the combined text would not fit: the new
   capacity is org_len + elen + 256 and existing text is carried over. */
871 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
873 char *old_buf, *new_buf;
875 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
876 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
877 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
879 memcpy (new_buf, old_buf, org_len);
882 spec->concatBuf[spec->d1_level].buf = new_buf;
884 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
885 res->u.data.len += elen;
/*
 * execDataP: entry point used by lexNode for unmatched input; the visible
 * body forwards all arguments unchanged to execData.
 */
888 static void execDataP (struct lexSpec *spec,
889 const char *ebuf, int elen, int formatted_text)
891 execData (spec, ebuf, elen, formatted_text);
/*
 * tagDataRelease: finish the pending text node at the current level.
 * When the node on top of the stack is a text data node, its accumulated
 * text is copied from the level's concat buffer into node storage: the
 * node's local buffer when the length fits within DATA1_LOCALDATA, otherwise
 * memory from the nmem handle spec->m.
 */
894 static void tagDataRelease (struct lexSpec *spec)
898 if ((res = spec->d1_stack[spec->d1_level]) &&
899 res->which == DATA1N_data &&
900 res->u.data.what == DATA1I_text)
/* The node must not have been finalised before and must hold some text. */
902 assert (!res->u.data.data);
903 assert (res->u.data.len > 0);
904 if (res->u.data.len > DATA1_LOCALDATA)
905 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
907 res->u.data.data = res->lbuf;
908 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
/*
 * variantBegin: open a variant node described by a class name, a type name
 * and a value (each given as pointer plus length) below the current parent.
 * Class and type are truncated to DATA1_MAX_SYMBOL-1 bytes, the value to
 * DATA1_LOCALDATA-1 bytes.
 */
913 static void variantBegin (struct lexSpec *spec,
914 const char *class_str, int class_len,
915 const char *type_str, int type_len,
916 const char *value_str, int value_len)
/* NOTE(review): this initialiser indexes d1_stack with d1_level - 1 before
   the d1_level == 0 check below, i.e. with -1 when no record is open. */
918 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
919 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
924 if (spec->d1_level == 0)
926 logf (LOG_WARN, "in variant begin. No record type defined");
/* Bounded, NUL-terminated copies of the class and type names. */
929 if (class_len >= DATA1_MAX_SYMBOL)
930 class_len = DATA1_MAX_SYMBOL-1;
931 memcpy (tclass, class_str, class_len);
932 tclass[class_len] = '\0';
934 if (type_len >= DATA1_MAX_SYMBOL)
935 type_len = DATA1_MAX_SYMBOL-1;
936 memcpy (ttype, type_str, type_len);
937 ttype[type_len] = '\0';
940 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype,
945 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* When the parent is not a variant node, a variant node with type and value
   set to 0 is created and pushed first. */
949 if (parent->which != DATA1N_variant)
951 res = data1_mk_node (spec->dh, spec->m);
952 res->parent = parent;
953 res->which = DATA1N_variant;
954 res->u.variant.type = 0;
955 res->u.variant.value = 0;
956 res->root = parent->root;
958 parent->last_child = res;
959 if (spec->d1_stack[spec->d1_level])
961 tagDataRelease (spec);
962 spec->d1_stack[spec->d1_level]->next = res;
966 spec->d1_stack[spec->d1_level] = res;
967 spec->d1_stack[++(spec->d1_level)] = NULL;
/* Scan downward over consecutive variant nodes looking for one of type tp;
   the action on a match is not visible in this excerpt. */
969 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
970 if (spec->d1_stack[i]->u.variant.type == tp)
977 logf (LOG_DEBUG, "variant node (%d)", spec->d1_level);
/* Create the variant node proper; its value lives in the node's lbuf. */
979 parent = spec->d1_stack[spec->d1_level-1];
980 res = data1_mk_node (spec->dh, spec->m);
981 res->parent = parent;
982 res->which = DATA1N_variant;
983 res->root = parent->root;
984 res->u.variant.type = tp;
986 if (value_len >= DATA1_LOCALDATA)
987 value_len =DATA1_LOCALDATA-1;
988 memcpy (res->lbuf, value_str, value_len);
989 res->lbuf[value_len] = '\0';
991 res->u.variant.value = res->lbuf;
/* Link behind the previous sibling (after finalising its text), or as child. */
993 parent->last_child = res;
994 if (spec->d1_stack[spec->d1_level])
996 tagDataRelease (spec);
997 spec->d1_stack[spec->d1_level]->next = res;
1000 parent->child = res;
1001 spec->d1_stack[spec->d1_level] = res;
1002 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagStrip: trim white space around a tag given as pointer plus length.
 * The first loop scans backward over trailing white space, the second scans
 * forward over leading white space; the updates of *tag and *len are not
 * visible in this excerpt.
 * NOTE(review): isspace is called with plain char values; confirm whether a
 * cast to unsigned char is needed for bytes above 127.
 */
1005 static void tagStrip (const char **tag, int *len)
1009 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
1012 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/*
 * tagBegin: open an element (tag) node named by the len bytes at tag.
 * The name is trimmed with tagStrip and stored NUL-terminated in the node
 * (local buffer when shorter than DATA1_LOCALDATA, nmem memory otherwise).
 * The node is linked below the current parent and pushed on the node stack.
 */
1018 static void tagBegin (struct lexSpec *spec,
1019 const char *tag, int len)
/* NOTE(review): parent is read from d1_stack[d1_level - 1] and passed to
   get_parent_tag before the d1_level == 0 check below. */
1021 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
1022 data1_element *elem = NULL;
1023 data1_node *partag = get_parent_tag(spec->dh, parent);
1025 data1_element *e = NULL;
1028 if (spec->d1_level == 0)
1030 logf (LOG_WARN, "in element begin. No record type defined");
1033 tagStrip (&tag, &len);
1035 res = data1_mk_node (spec->dh, spec->m);
1036 res->parent = parent;
1037 res->which = DATA1N_tag;
1038 res->u.tag.get_bytes = -1;
/* Pick storage for the tag name, leaving room for the terminator. */
1040 if (len >= DATA1_LOCALDATA)
1041 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
1043 res->u.tag.tag = res->lbuf;
1045 memcpy (res->u.tag.tag, tag, len);
1046 res->u.tag.tag[len] = '\0';
1049 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1051 if (parent->which == DATA1N_variant)
1054 if (!(e = partag->u.tag.element))
/* Look the element up by tag name in the abstract syntax of the root node. */
1057 elem = data1_getelementbytagname (spec->dh,
1058 spec->d1_stack[0]->u.root.absyn,
1060 res->u.tag.element = elem;
1061 res->u.tag.node_selected = 0;
1062 res->u.tag.make_variantlist = 0;
1063 res->u.tag.no_data_requested = 0;
1064 res->root = parent->root;
/* Link behind the previous sibling (after finalising its text), or as child. */
1066 parent->last_child = res;
1067 if (spec->d1_stack[spec->d1_level])
1069 tagDataRelease (spec);
1070 spec->d1_stack[spec->d1_level]->next = res;
1073 parent->child = res;
1074 spec->d1_stack[spec->d1_level] = res;
1075 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagEnd: close elements while the nesting level is above min_level.
 * The tag name is trimmed first. Inside the loop pending text is finalised
 * with tagDataRelease, and the node at the current level is tested for being
 * a tag node whose name has length len and the same bytes as tag. The stack
 * adjustments and the action on a match are not visible in this excerpt.
 */
1078 static void tagEnd (struct lexSpec *spec, int min_level,
1079 const char *tag, int len)
1081 tagStrip (&tag, &len);
1082 while (spec->d1_level > min_level)
1084 tagDataRelease (spec);
1086 if (spec->d1_level == 0)
1088 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1090 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1092 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1096 logf (LOG_DEBUG, "end tag (%d)", spec->d1_level);
/*
 * tryMatch: run a DFA over the input beginning at *pptr.
 * Bytes are fetched with f_win_advance and the automaton starts in state 0.
 * On a match *mptr receives the position where the match starts and *pptr
 * the position one past its end. Return values are not visible in this
 * excerpt.
 */
1101 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1104 struct DFA_state *state = dfa->states[0];
1107 unsigned char c_prev = 0;
1108 int ptr = *pptr; /* current pointer */
1109 int start_ptr = *pptr; /* first char of match */
1110 int last_ptr = 0; /* last char of match */
1111 int last_rule = 0; /* rule number of current match */
1116 c = f_win_advance (spec, &ptr);
1117 if (ptr == F_WIN_EOF)
1134 *mptr = start_ptr; /* match starts here */
1135 *pptr = last_ptr; /* match end here (+1) */
/* Restart the automaton from its initial state. */
1138 state = dfa->states[0];
/* Transition whose character range [ch[0], ch[1]] contains c. */
1143 else if (c >= t->ch[0] && c <= t->ch[1])
1145 state = dfa->states[t->to];
/* Two rule fields of the state are consulted; which one applies when is
   decided by conditions not visible in this excerpt. */
1150 last_rule = state->rule_no;
1155 last_rule = state->rule_nno;
/*
 * execTok: fetch the next token of an action code string.
 * Leading blanks and tabs are skipped. Recognised forms in the visible
 * lines: "$" followed by digits (an argument reference whose text is fetched
 * with f_win_get), a double-quoted string, a newline or ';', and a plain
 * word ended by white space. *tokBuf and *tokLen describe the token text.
 * The returned codes are not visible here; callers in this file compare the
 * result with 3.
 */
1167 static int execTok (struct lexSpec *spec, const char **src,
1168 const char **tokBuf, int *tokLen)
1170 const char *s = *src;
1172 while (*s == ' ' || *s == '\t')
1176 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
/* Decimal argument number. */
1180 while (*s >= '0' && *s <= '9')
1181 n = n*10 + (*s++ -'0');
1182 if (spec->arg_no == 0)
1189 if (n >= spec->arg_no)
1191 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1195 else if (*s == '\"')
1198 while (*s && *s != '\"')
1200 *tokLen = s - *tokBuf;
1205 else if (*s == '\n' || *s == ';')
1213 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1216 *tokLen = s - *tokBuf;
1223 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1226 *tokLen = s - *tokBuf;
/*
 * regxStrz: copy len bytes from src into str.
 * Presumably also NUL-terminates str and returns it (callers use the result
 * with strcmp) - those lines are not visible in this excerpt; confirm.
 */
1232 static char *regxStrz (const char *src, int len, char *str)
1236 memcpy (str, src, len);
/*
 * cmd_tcl_begin: Tcl command handler; clientData carries the lexSpec.
 * argv[1] selects the action:
 *   record <absyn>                 create a root node and push it
 *   element <tag>                  tagBegin
 *   variant <class> <type> <value> variantBegin
 *   context <name>                 push the named context on the stack
 */
1242 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1243 int argc, char **argv)
1245 struct lexSpec *spec = (struct lexSpec *) clientData;
1248 if (!strcmp(argv[1], "record") && argc == 3)
1250 char *absynName = argv[2];
1254 logf (LOG_DEBUG, "begin record %s", absynName);
1256 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1257 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1262 res = data1_mk_node (spec->dh, spec->m);
1263 res->which = DATA1N_root;
1264 res->u.root.type = absynName;
1265 res->u.root.absyn = absyn;
1268 spec->d1_stack[spec->d1_level] = res;
1269 spec->d1_stack[++(spec->d1_level)] = NULL;
1272 else if (!strcmp(argv[1], "element") && argc == 3)
1274 tagBegin (spec, argv[2], strlen(argv[2]));
1276 else if (!strcmp (argv[1], "variant") && argc == 5)
1278 variantBegin (spec, argv[2], strlen(argv[2]),
1279 argv[3], strlen(argv[3]),
1280 argv[4], strlen(argv[4]));
1282 else if (!strcmp (argv[1], "context") && argc == 3)
1284 struct lexContext *lc = spec->context;
1286 logf (LOG_DEBUG, "begin context %s",argv[2]);
/* Search the context list for a context with the requested name. */
1288 while (lc && strcmp (argv[2], lc->name))
1292 spec->context_stack[++(spec->context_stack_top)] = lc;
1295 logf (LOG_WARN, "unknown context %s", argv[2]);
/*
 * cmd_tcl_end: Tcl command handler; clientData carries the lexSpec.
 * argv[1] selects the action:
 *   record   finalise pending text at every open level and set stop_flag
 *   element  close elements via tagEnd ("-record" is accepted as an
 *            option); stop_flag is set in the branch where the level has
 *            reached 0
 *   context  pop the context stack unless it is already at the bottom
 */
1302 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1303 int argc, char **argv)
1305 struct lexSpec *spec = (struct lexSpec *) clientData;
1309 if (!strcmp (argv[1], "record"))
1311 while (spec->d1_level)
1313 tagDataRelease (spec);
1317 logf (LOG_DEBUG, "end record");
1319 spec->stop_flag = 1;
1321 else if (!strcmp (argv[1], "element"))
1325 if (argc >= 3 && !strcmp(argv[2], "-record"))
/* A missing element name is passed on as NULL with length 0. */
1334 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1335 if (spec->d1_level == 0)
1338 logf (LOG_DEBUG, "end element end records");
1340 spec->stop_flag = 1;
1343 else if (!strcmp (argv[1], "context"))
1346 logf (LOG_DEBUG, "end context");
1348 if (spec->context_stack_top)
1349 (spec->context_stack_top)--;
/*
 * cmd_tcl_data: Tcl command handler that emits text data.
 * Options seen in the visible lines: "-text" and "-element <name>". When an
 * element name was given the data is wrapped in that element (tagBegin
 * before, tagEnd with min_level 1 afterwards); the guarding conditions are
 * not visible in this excerpt.
 */
1356 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1357 int argc, char **argv)
1361 const char *element = 0;
1362 struct lexSpec *spec = (struct lexSpec *) clientData;
1366 if (!strcmp("-text", argv[argi]))
1371 else if (!strcmp("-element", argv[argi]))
1375 element = argv[argi++];
1381 tagBegin (spec, element, strlen(element));
1385 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1389 tagEnd (spec, 1, NULL, 0);
/*
 * cmd_tcl_unread: Tcl command handler that moves the read position back.
 * An optional "-offset <n>" is parsed with atoi, followed by an argument
 * index. The index is clamped to the last available argument and spec->ptr
 * is set to that argument's start position plus the offset.
 */
1393 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1394 int argc, char **argv)
1396 struct lexSpec *spec = (struct lexSpec *) clientData;
1403 if (!strcmp("-offset", argv[argi]))
1408 offset = atoi(argv[argi]);
1417 no = atoi(argv[argi]);
/* NOTE(review): with arg_no == 0 this clamp yields -1 - confirm that cannot
   happen here. */
1418 if (no >= spec->arg_no)
1419 no = spec->arg_no - 1;
1420 spec->ptr = spec->arg_start[no] + offset;
/*
 * execTcl: run a code action through the Tcl interpreter.
 * Each matched argument i is first published as the Tcl variable named by
 * the decimal number i, then the code is evaluated. A Tcl error is logged at
 * LOG_FATAL together with the error line, the interpreter result and the
 * errorInfo variable.
 */
1424 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1428 for (i = 0; i < spec->arg_no; i++)
1430 char var_name[10], *var_buf;
1433 sprintf (var_name, "%d", i);
1434 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* The window buffer is NUL-terminated in place for the duration of the
   Tcl_SetVar call and the overwritten byte is restored afterwards. */
1438 ch = var_buf[var_len];
1439 var_buf[var_len] = '\0';
1440 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1441 var_buf[var_len] = ch;
/* Two evaluation calls appear (object based and string based); presumably
   alternatives chosen by conditionals not visible in this excerpt. */
1445 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1447 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1450 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1451 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1452 spec->tcl_interp->errorLine,
1453 spec->tcl_interp->result,
1454 err ? err : "[NO ERRORINFO]");
/*
 * execCode: interpret a code action written in the built-in command
 * language. Tokens are pulled with execTok and the command word selects:
 *   begin record|element|variant|context
 *   end record|element|context
 *   data [-text] [-element <name>] <items>
 *   unread [-offset <n>] <index>
 *   context <name>
 * Unknown commands and stray tokens are reported with warnings.
 */
1460 static void execCode (struct lexSpec *spec, struct regxCode *code)
1462 const char *s = code->str;
1464 const char *cmd_str;
1466 r = execTok (spec, &s, &cmd_str, &cmd_len);
1473 r = execTok (spec, &s, &cmd_str, &cmd_len);
1476 p = regxStrz (cmd_str, cmd_len, ptmp);
1477 if (!strcmp (p, "begin"))
1479 r = execTok (spec, &s, &cmd_str, &cmd_len);
1482 logf (LOG_WARN, "missing keyword after 'begin'");
1485 p = regxStrz (cmd_str, cmd_len, ptmp);
1486 if (!strcmp (p, "record"))
1488 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* A root node is created in the branch where no record is open yet. */
1491 if (spec->d1_level == 0)
/* Static storage: the root node keeps a pointer to this name. */
1493 static char absynName[64];
1498 memcpy (absynName, cmd_str, cmd_len);
1499 absynName[cmd_len] = '\0';
1502 logf (LOG_DEBUG, "begin record %s", absynName);
1504 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1505 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1510 res = data1_mk_node (spec->dh, spec->m);
1511 res->which = DATA1N_root;
1512 res->u.root.type = absynName;
1513 res->u.root.absyn = absyn;
1516 spec->d1_stack[spec->d1_level] = res;
1517 spec->d1_stack[++(spec->d1_level)] = NULL;
1520 r = execTok (spec, &s, &cmd_str, &cmd_len);
1522 else if (!strcmp (p, "element"))
1524 r = execTok (spec, &s, &cmd_str, &cmd_len);
1527 tagBegin (spec, cmd_str, cmd_len);
1528 r = execTok (spec, &s, &cmd_str, &cmd_len);
1530 else if (!strcmp (p, "variant"))
/* Three tokens are read in turn: class, type and value. */
1533 const char *class_str = NULL;
1535 const char *type_str = NULL;
1537 const char *value_str = NULL;
1538 r = execTok (spec, &s, &cmd_str, &cmd_len);
1541 class_str = cmd_str;
1542 class_len = cmd_len;
1543 r = execTok (spec, &s, &cmd_str, &cmd_len);
1549 r = execTok (spec, &s, &cmd_str, &cmd_len);
1552 value_str = cmd_str;
1553 value_len = cmd_len;
1555 variantBegin (spec, class_str, class_len,
1556 type_str, type_len, value_str, value_len);
1559 r = execTok (spec, &s, &cmd_str, &cmd_len);
1561 else if (!strcmp (p, "context"))
/* "begin context": look the context up by name and push it. */
1565 struct lexContext *lc = spec->context;
1566 r = execTok (spec, &s, &cmd_str, &cmd_len);
1567 p = regxStrz (cmd_str, cmd_len, ptmp);
1569 logf (LOG_DEBUG, "begin context %s", p);
1571 while (lc && strcmp (p, lc->name))
1574 spec->context_stack[++(spec->context_stack_top)] = lc;
1576 logf (LOG_WARN, "unknown context %s", p);
1579 r = execTok (spec, &s, &cmd_str, &cmd_len);
1583 logf (LOG_WARN, "bad keyword '%s' after begin", p);
1586 else if (!strcmp (p, "end"))
1588 r = execTok (spec, &s, &cmd_str, &cmd_len);
1591 logf (LOG_WARN, "missing keyword after 'end'");
1594 p = regxStrz (cmd_str, cmd_len, ptmp);
1595 if (!strcmp (p, "record"))
/* Finalise pending text at every open level, then request a stop. */
1597 while (spec->d1_level)
1599 tagDataRelease (spec);
1602 r = execTok (spec, &s, &cmd_str, &cmd_len);
1604 logf (LOG_DEBUG, "end record");
1606 spec->stop_flag = 1;
1608 else if (!strcmp (p, "element"))
/* Options are consumed while execTok keeps returning 3. */
1611 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1613 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1618 tagEnd (spec, min_level, cmd_str, cmd_len);
1619 r = execTok (spec, &s, &cmd_str, &cmd_len);
1622 tagEnd (spec, min_level, NULL, 0);
1623 if (spec->d1_level == 0)
1626 logf (LOG_DEBUG, "end element end records");
1628 spec->stop_flag = 1;
1632 else if (!strcmp (p, "context"))
1635 logf (LOG_DEBUG, "end context");
/* Pop unless the stack is already at its bottom entry. */
1637 if (spec->context_stack_top)
1638 (spec->context_stack_top)--;
1639 r = execTok (spec, &s, &cmd_str, &cmd_len);
1642 logf (LOG_WARN, "bad keyword '%s' after end", p);
1644 else if (!strcmp (p, "data"))
1648 const char *element_str = NULL;
1650 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1652 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1654 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1656 r = execTok (spec, &s, &element_str, &element_len);
1661 logf (LOG_WARN, "bad data option: %.*s",
1666 logf (LOG_WARN, "missing data item after data");
/* With -element the data is wrapped in that element. */
1670 tagBegin (spec, element_str, element_len);
1673 execData (spec, cmd_str, cmd_len,textFlag);
1674 r = execTok (spec, &s, &cmd_str, &cmd_len);
1677 tagEnd (spec, 1, NULL, 0);
1679 else if (!strcmp (p, "unread"))
1682 r = execTok (spec, &s, &cmd_str, &cmd_len);
1683 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1685 r = execTok (spec, &s, &cmd_str, &cmd_len);
1688 logf (LOG_WARN, "missing number after -offset");
1691 p = regxStrz (cmd_str, cmd_len, ptmp);
1693 r = execTok (spec, &s, &cmd_str, &cmd_len);
1699 logf (LOG_WARN, "missing index after unread command");
/* Only a single decimal digit is accepted as the argument index. */
1702 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1704 logf (LOG_WARN, "bad index after unread command");
/* NOTE(review): with arg_no == 0 the clamp below yields -1 - confirm that
   cannot happen here. */
1709 no = *cmd_str - '0';
1710 if (no >= spec->arg_no)
1711 no = spec->arg_no - 1;
1712 spec->ptr = spec->arg_start[no] + offset;
1714 r = execTok (spec, &s, &cmd_str, &cmd_len);
1716 else if (!strcmp (p, "context"))
/* Plain "context": the entry on top of the context stack is overwritten
   rather than a new one being pushed. */
1720 struct lexContext *lc = spec->context;
1721 r = execTok (spec, &s, &cmd_str, &cmd_len);
1722 p = regxStrz (cmd_str, cmd_len, ptmp);
1724 while (lc && strcmp (p, lc->name))
1727 spec->context_stack[spec->context_stack_top] = lc;
1729 logf (LOG_WARN, "unknown context %s", p);
1732 r = execTok (spec, &s, &cmd_str, &cmd_len);
1736 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1737 r = execTok (spec, &s, &cmd_str, &cmd_len);
1742 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1744 r = execTok (spec, &s, &cmd_str, &cmd_len);
/*
 * execAction: execute the action list ap for a rule that matched at
 * start_ptr; *pptr is the current input position.
 * Argument 0 starts at start_ptr. Each pattern action is run with tryMatch
 * and records start and end positions of further arguments; body patterns
 * are handled separately from plain ones. Code actions run through execTcl
 * or execCode with the collected arguments published in spec. Return values
 * are not visible in this excerpt.
 */
1751 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1752 int start_ptr, int *pptr)
1761 arg_start[0] = start_ptr;
/* Publish the local argument tables so that code actions can read them. */
1763 spec->arg_start = arg_start;
1764 spec->arg_end = arg_end;
1771 if (ap->u.pattern.body)
1773 arg_start[arg_no] = *pptr;
1774 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
/* No match: argument positions are set to the end-of-input sentinel. */
1776 arg_end[arg_no] = F_WIN_EOF;
1778 arg_start[arg_no] = F_WIN_EOF;
1779 arg_end[arg_no] = F_WIN_EOF;
1784 arg_end[arg_no] = sptr;
1786 arg_start[arg_no] = sptr;
1787 arg_end[arg_no] = *pptr;
1792 arg_start[arg_no] = *pptr;
1793 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
/* Tests whether the match begins somewhere other than the expected
   position. */
1795 if (sptr != arg_start[arg_no])
1797 arg_end[arg_no] = *pptr;
1802 spec->arg_no = arg_no;
1805 if (spec->tcl_interp)
1806 execTcl(spec, ap->u.code);
1808 execCode (spec, ap->u.code);
1810 execCode (spec, ap->u.code);
1813 if (spec->stop_flag)
1817 arg_start[arg_no] = *pptr;
1818 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: execute the action list of rule number ruleNo of the given
 * context. The rule is found through the context's fastRule table and the
 * work is delegated to execAction, whose result is returned.
 */
1827 static int execRule (struct lexSpec *spec, struct lexContext *context,
1828 int ruleNo, int start_ptr, int *pptr)
1831 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1833 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: scans the input from *ptr with the DFA of the context on top of
 * spec->context_stack.  Matches are handed to execRule; runs of characters
 * that matched no rule are handed to execDataP.
 *
 *   spec - lexer state (context stack, input window callbacks)
 *   ptr  - in/out input position
 *
 * NOTE(review): the loop header, braces, several conditions and every
 * return statement are missing from this excerpt, so the return value is
 * not documented here; confirm the notes below against the full function.
 */
1837 data1_node *lexNode (struct lexSpec *spec, int *ptr)
/* start in the initial DFA state of the current context */
1839 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1840 struct DFA_state *state = context->dfa->states[0];
1843 unsigned char c_prev = '\n';
1845 int last_rule = 0; /* rule number of current match */
1846 int last_ptr = *ptr; /* last char of match */
1847 int start_ptr = *ptr; /* first char of match */
1848 int skip_ptr = *ptr; /* first char of run */
/* read the next character; f_win_advance also updates *ptr */
1852 c = f_win_advance (spec, ptr);
1853 if (*ptr == F_WIN_EOF)
1855 /* end of file met */
1858 /* there was a match */
1859 if (skip_ptr < start_ptr)
1861 /* deal with chars that didn't match */
1864 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1865 execDataP (spec, buf, size, 0);
1867 /* restore pointer */
1870 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1872 /* restore skip pointer */
1876 else if (skip_ptr < *ptr)
1878 /* deal with chars that didn't match */
1881 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1882 execDataP (spec, buf, size, 0);
1884 if (*ptr == F_WIN_EOF)
1891 { /* no transition for character c ... */
1894 if (skip_ptr < start_ptr)
1896 /* deal with chars that didn't match */
1899 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1900 execDataP (spec, buf, size, 0);
1902 /* restore pointer */
1904 if (!execRule (spec, context, last_rule, start_ptr, ptr))
/* the optional f_win_ef callback is called with the file handle and the
   current position, but only when that position is not F_WIN_EOF */
1906 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1909 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1911 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* re-read the context: the top of the context stack is fetched again after
   the rule has been executed */
1915 context = spec->context_stack[spec->context_stack_top];
1918 last_ptr = start_ptr = *ptr;
/* NOTE(review): the conditions selecting between the next two calls are not
   visible; both store the character returned for start_ptr in c_prev */
1922 c_prev = f_win_advance (spec, &start_ptr);
1927 c_prev = f_win_advance (spec, &start_ptr);
/* restart matching from the initial state of the current context */
1930 state = context->dfa->states[0];
/* c lies inside the closed range [t->ch[0], t->ch[1]] of transition t */
1933 else if (c >= t->ch[0] && c <= t->ch[1])
1934 { /* transition ... */
1935 state = context->dfa->states[t->to];
/* remember the rule attached to the new state; rule_no and rule_nno are
   alternatives whose selecting condition is only partly visible */
1940 last_rule = state->rule_no;
1943 else if (state->rule_nno)
1945 last_rule = state->rule_nno;
/*
 * lexRoot: runs one complete lexing pass starting in the context named
 * context_name and returns spec->d1_stack[0].
 *
 *   spec         - lexer state; stop_flag and context_stack_top are reset
 *   offset       - not used in the visible lines; presumably the initial
 *                  input position -- confirm against the full function
 *   context_name - name looked up among the contexts reachable from
 *                  spec->context
 *
 * Order of work shown here: init actions, begin actions, lexNode, release
 * of tags while d1_level is non-zero, end actions.
 * NOTE(review): declarations, loop headers and the error return after the
 * "cannot find context" warning are not visible in this excerpt.
 */
1957 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1958 const char *context_name)
1960 struct lexContext *lt = spec->context;
1963 spec->stop_flag = 0;
1965 spec->context_stack_top = 0;
/* look for the context whose name equals context_name */
1968 if (!strcmp (lt->name, context_name))
1974 logf (LOG_WARN, "cannot find context %s", context_name);
/* the context found becomes the current entry of the context stack */
1977 spec->context_stack[spec->context_stack_top] = lt;
1978 spec->d1_stack[spec->d1_level] = NULL;
1983 execAction (spec, lt->initActionList, ptr, &ptr);
1986 execAction (spec, lt->beginActionList, ptr, &ptr);
1987 lexNode (spec, &ptr);
/* tagDataRelease is called for as long as d1_level stays non-zero */
1988 while (spec->d1_level)
1990 tagDataRelease (spec);
1993 execAction (spec, lt->endActionList, ptr, &ptr);
1994 return spec->d1_stack[0];
/*
 * grs_destroy: tears down the filter state passed as clientData.
 * clientData is treated as a struct lexSpecs; the lexSpec it holds is
 * passed by address to lexSpecDestroy.
 * NOTE(review): whether the lexSpecs structure itself is released is not
 * visible in this excerpt.
 */
1997 void grs_destroy(void *clientData)
1999 struct lexSpecs *specs = (struct lexSpecs *) clientData;
2002 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocates a struct lexSpecs with xmalloc.
 * NOTE(review): the initialisation of its members and the return statement
 * are not visible in this excerpt; presumably the new structure is what is
 * returned -- confirm.
 */
2007 void *grs_init(void)
2009 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/*
 * grs_read_regx: reads one record with the regx filter.
 *
 *   p - read request; the fields used here are clientData, type, dh, readf,
 *       seekf, fh, endf, mem and offset
 *
 * The lexSpec kept in p->clientData is reused when its name equals p->type;
 * otherwise it is destroyed, created again for p->type and loaded with
 * readFileSpec.  The input window of the spec is then reset and lexing
 * starts in the context "main"; the result of lexRoot is returned.
 * NOTE(review): the check of res and the early return after a failed
 * readFileSpec are not visible in this excerpt.
 */
2014 data1_node *grs_read_regx (struct grs_read_info *p)
2017 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2018 struct lexSpec **curLexSpec = &specs->spec;
2021 logf (LOG_DEBUG, "grs_read_regx");
/* no spec yet, or the spec held was built for another type */
2023 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2026 lexSpecDestroy (curLexSpec);
2027 *curLexSpec = lexSpecCreate (p->type, p->dh);
2028 res = readFileSpec (*curLexSpec);
/* presumably the failure path of readFileSpec -- confirm */
2031 lexSpecDestroy (curLexSpec);
2035 (*curLexSpec)->dh = p->dh;
/* empty input window, with the I/O callbacks and handle taken from p */
2038 (*curLexSpec)->f_win_start = 0;
2039 (*curLexSpec)->f_win_end = 0;
2040 (*curLexSpec)->f_win_rf = p->readf;
2041 (*curLexSpec)->f_win_sf = p->seekf;
2042 (*curLexSpec)->f_win_fh = p->fh;
2043 (*curLexSpec)->f_win_ef = p->endf;
/* fixed window size; the unit is not stated in the visible code */
2044 (*curLexSpec)->f_win_size = 500000;
2046 (*curLexSpec)->m = p->mem;
2047 return lexRoot (*curLexSpec, p->offset, "main");
/* Descriptor of the regx record type.  NOTE(review): the members of this
   initializer are not visible in this excerpt. */
2050 static struct recTypeGrs regx_type = {
/* Public pointer to the regx descriptor above. */
2057 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: reads one record with the Tcl filter.
 *
 *   p - read request; the fields used here are clientData, type, dh, readf,
 *       seekf, fh, endf, mem and offset
 *
 * The lexSpec kept in p->clientData is reused when its name equals p->type.
 * Otherwise it is destroyed and created again, a Tcl interpreter is created
 * and stored in the spec, the commands "begin", "end", "data" and "unread"
 * are registered with the spec as client data, and the filter is loaded
 * with readFileSpec.  The input window is then reset and lexing starts in
 * the context "main"; the result of lexRoot is returned.
 * NOTE(review): the check of res, the early return after a failed
 * readFileSpec and the tail of the "unread" registration are not visible
 * in this excerpt.
 */
2060 data1_node *grs_read_tcl (struct grs_read_info *p)
2063 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2064 struct lexSpec **curLexSpec = &specs->spec;
2067 logf (LOG_DEBUG, "grs_read_tcl");
/* no spec yet, or the spec held was built for another type */
2069 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2071 Tcl_Interp *tcl_interp;
2073 lexSpecDestroy (curLexSpec);
2074 *curLexSpec = lexSpecCreate (p->type, p->dh);
/* the new interpreter is kept both in the local and in the spec */
2075 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
2076 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2077 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2078 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
/* the remaining arguments of this call are on a line not visible here */
2079 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2081 res = readFileSpec (*curLexSpec);
/* presumably the failure path of readFileSpec -- confirm */
2084 lexSpecDestroy (curLexSpec);
2088 (*curLexSpec)->dh = p->dh;
/* empty input window, with the I/O callbacks and handle taken from p */
2091 (*curLexSpec)->f_win_start = 0;
2092 (*curLexSpec)->f_win_end = 0;
2093 (*curLexSpec)->f_win_rf = p->readf;
2094 (*curLexSpec)->f_win_sf = p->seekf;
2095 (*curLexSpec)->f_win_fh = p->fh;
2096 (*curLexSpec)->f_win_ef = p->endf;
/* fixed window size; the unit is not stated in the visible code */
2097 (*curLexSpec)->f_win_size = 500000;
2099 (*curLexSpec)->m = p->mem;
2100 return lexRoot (*curLexSpec, p->offset, "main");
/* Descriptor of the tcl record type.  NOTE(review): the members of this
   initializer are not visible in this excerpt. */
2103 static struct recTypeGrs tcl_type = {
/* Public pointer to the tcl descriptor above. */
2110 RecTypeGrs recTypeGrs_tcl = &tcl_type;