2 * Copyright (C) 1994-1999, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.28 1999-07-06 12:26:04 adam
8 * Fixed filters so that MS-DOS CR is ignored.
10 * Revision 1.27 1999/06/28 13:25:40 quinn
11 * Improved diagnostics for Tcl
13 * Revision 1.26 1999/05/26 07:49:14 adam
16 * Revision 1.25 1999/05/25 12:33:32 adam
17 * Fixed bug in Tcl filter.
19 * Revision 1.24 1999/05/21 11:08:46 adam
20 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
21 * script so that it reads uninstalled Tcl source.
23 * Revision 1.23 1999/05/20 12:57:18 adam
24 * Implemented TCL filter. Updated recctrl system.
26 * Revision 1.22 1998/11/03 16:07:13 adam
29 * Revision 1.21 1998/11/03 15:43:39 adam
30 * Fixed bug introduced by previous commit.
32 * Revision 1.20 1998/11/03 14:51:28 adam
33 * Changed code so that it creates as few data1 nodes as possible.
35 * Revision 1.19 1998/11/03 10:22:39 adam
36 * Fixed memory leak that could occur when large data1 nodes were
37 * concatenated. Data-type data1_nodes may have multiple nodes.
39 * Revision 1.18 1998/10/15 13:11:47 adam
40 * Added support for option -record for "end element". When specified
41 * end element will mark end-of-record when at outer-level.
43 * Revision 1.17 1998/07/01 10:13:51 adam
46 * Revision 1.16 1998/06/30 15:15:09 adam
47 * Tags are trimmed: white space removed before and after the tag.
49 * Revision 1.15 1998/06/30 12:55:45 adam
52 * Revision 1.14 1998/03/05 08:41:00 adam
53 * Implemented rule contexts.
55 * Revision 1.13 1997/12/12 06:33:58 adam
56 * Fixed bug that showed up when multiple filters were used.
57 * Made one routine thread-safe.
59 * Revision 1.12 1997/11/18 10:03:24 adam
60 * Member num_children removed from data1_node.
62 * Revision 1.11 1997/11/06 11:41:01 adam
63 * Implemented "begin variant" for the sgml.regx filter.
65 * Revision 1.10 1997/10/31 12:36:12 adam
66 * Minor change that avoids compiler warning.
68 * Revision 1.9 1997/09/29 09:02:49 adam
69 * Fixed small bug (introduced by previous commit).
71 * Revision 1.8 1997/09/17 12:19:22 adam
72 * Zebra version corresponds to YAZ version 1.4.
73 * Changed Zebra server so that it doesn't depend on global common_resource.
75 * Revision 1.7 1997/07/15 16:33:07 adam
76 * Check for zero length in execData.
78 * Revision 1.6 1997/02/24 10:41:51 adam
79 * Cleanup of code and commented out the "end element-end-record" code.
81 * Revision 1.5 1997/02/19 16:22:33 adam
82 * Fixed "end element" to terminate record in outer-most level.
84 * Revision 1.4 1997/02/12 20:42:58 adam
85 * Changed some log messages.
87 * Revision 1.3 1996/11/08 14:05:33 adam
88 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
90 * Revision 1.2 1996/10/29 14:02:09 adam
91 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
92 * data1_get_tabpath is used.
94 * Revision 1.1 1996/10/11 10:57:30 adam
95 * New module recctrl. Used to manage records (extract/retrieval).
97 * Revision 1.24 1996/06/17 14:25:31 adam
98 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
100 * Revision 1.23 1996/06/04 10:19:00 adam
101 * Minor changes - removed include of ctype.h.
103 * Revision 1.22 1996/06/03 15:23:13 adam
104 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
106 * Revision 1.21 1996/05/14 16:58:38 adam
109 * Revision 1.20 1996/05/01 13:46:36 adam
110 * First work on multiple records in one file.
111 * New option, -offset, to the "unread" command in the filter module.
113 * Revision 1.19 1996/02/12 16:18:20 adam
114 * Yet another bug fix in implementation of unread command.
116 * Revision 1.18 1996/02/12 16:07:54 adam
117 * Bug fix in new unread command.
119 * Revision 1.17 1996/02/12 15:56:11 adam
120 * New code command: unread.
122 * Revision 1.16 1996/01/17 14:57:51 adam
123 * Prototype changed for reader functions in extract/retrieve. File
124 * is identified by 'void *' instead of 'int'.
126 * Revision 1.15 1996/01/08 19:15:47 adam
127 * New input filter that works!
129 * Revision 1.14 1996/01/08 09:10:38 adam
130 * Yet another complete rework on this module.
132 * Revision 1.13 1995/12/15 17:21:50 adam
133 * This version is able to set data.formatted_text in data1-nodes.
135 * Revision 1.12 1995/12/15 16:20:10 adam
136 * The filter files (*.flt) are read from the path given by data1_tabpath.
138 * Revision 1.11 1995/12/15 12:35:16 adam
141 * Revision 1.10 1995/12/15 10:35:36 adam
144 * Revision 1.9 1995/12/14 16:38:48 adam
145 * Completely new attempt to make regular expression parsing.
147 * Revision 1.8 1995/12/13 17:16:59 adam
150 * Revision 1.7 1995/12/13 16:51:58 adam
151 * Modified to set last_child in data1_nodes.
152 * Uses destroy handler to free up data text nodes.
154 * Revision 1.6 1995/12/13 13:45:37 quinn
155 * Changed data1 to use nmem.
157 * Revision 1.5 1995/12/11 09:12:52 adam
158 * The rec_get function returns NULL if record doesn't exist - will
159 * happen in the server if the result set records have been deleted since
160 * the creation of the set (i.e. the search).
161 * The server saves a result temporarily if it is 'volatile', i.e. the
162 * set is register dependent.
164 * Revision 1.4 1995/12/05 16:57:40 adam
165 * More work on regular patterns.
167 * Revision 1.3 1995/12/05 09:37:09 adam
168 * One malloc was renamed to xmalloc.
170 * Revision 1.2 1995/12/04 17:59:24 adam
171 * More work on regular expression conversion.
173 * Revision 1.1 1995/12/04 14:25:30 adam
174 * Started work on regular expression parsed input to structured records.
183 #include <zebrautl.h>
193 #define F_WIN_EOF 2000000000
197 #define REGX_PATTERN 1
202 #define REGX_CONTEXT 6
209 struct lexRuleAction {
213 struct DFA *dfa; /* REGX_PATTERN */
216 struct regxCode *code; /* REGX_CODE */
218 struct lexRuleAction *next;
223 struct lexRuleAction *actionList;
227 struct lexRuleInfo info;
228 struct lexRule *next;
234 struct lexRule *rules;
235 struct lexRuleInfo **fastRule;
239 struct lexRuleAction *beginActionList;
240 struct lexRuleAction *endActionList;
241 struct lexRuleAction *initActionList;
242 struct lexContext *next;
245 struct lexConcatBuf {
253 struct lexContext *context;
255 struct lexContext **context_stack;
256 int context_stack_size;
257 int context_stack_top;
263 Tcl_Interp *tcl_interp;
266 void (*f_win_ef)(void *, off_t);
268 int f_win_start; /* first byte of buffer is this file offset */
269 int f_win_end; /* last byte of buffer is this offset - 1 */
270 int f_win_size; /* size of buffer */
271 char *f_win_buf; /* buffer itself */
272 int (*f_win_rf)(void *, char *, size_t);
273 off_t (*f_win_sf)(void *, off_t);
275 struct lexConcatBuf **concatBuf;
277 data1_node **d1_stack;
288 struct lexSpec *spec;
/*
 * f_win_get: return a pointer to the input bytes between file offsets
 * start_pos and end_pos, served from the read window held in spec
 * (f_win_buf, valid for offsets f_win_start up to f_win_end).
 * The byte count available at the returned pointer is stored in *size
 * and is clamped to end_pos - start_pos.
 * NOTE(review): several lines of this function are not visible in this
 * extract; the comments describe the visible statements only.
 * NOTE(review): off, i and r are int while the positions are off_t, so
 * offsets beyond the int range would be truncated - confirm intended.
 */
291 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
294 int i, r, off = start_pos - spec->f_win_start;
/* fast path: the requested range already lies inside the window */
296 if (off >= 0 && end_pos <= spec->f_win_end)
298 *size = end_pos - start_pos;
299 return spec->f_win_buf + off;
/* start_pos lies outside the window: seek there and refill from scratch */
301 if (off < 0 || start_pos >= spec->f_win_end)
303 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
304 spec->f_win_start = start_pos;
/* the window buffer is allocated on first use */
306 if (!spec->f_win_buf)
307 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
308 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
310 spec->f_win_end = spec->f_win_start + *size;
/* never report more than the caller asked for */
312 if (*size > end_pos - start_pos)
313 *size = end_pos - start_pos;
314 return spec->f_win_buf;
/* start_pos is inside the window but the range runs past its end:
   move the bytes from start_pos onward to the front of the buffer ... */
316 for (i = 0; i<spec->f_win_end - start_pos; i++)
317 spec->f_win_buf[i] = spec->f_win_buf[i + off];
/* ... then read up to f_win_size - i further bytes */
318 r = (*spec->f_win_rf)(spec->f_win_fh,
320 spec->f_win_size - i);
321 spec->f_win_start = start_pos;
322 spec->f_win_end += r;
324 if (*size > end_pos - start_pos)
325 *size = end_pos - start_pos;
326 return spec->f_win_buf;
/*
 * f_win_advance: return the input byte at file offset *pos. In the
 * in-window case below *pos is also moved one byte forward; for other
 * positions a one-byte range is requested through f_win_get.
 * NOTE(review): f_win_buf is a plain char buffer, so the in-window return
 * yields a negative int for bytes >= 0x80 where char is signed - confirm
 * that callers expect this.
 * NOTE(review): the handling of F_WIN_EOF and of the f_win_get result is
 * on lines not visible in this extract.
 */
329 static int f_win_advance (struct lexSpec *spec, int *pos)
/* byte is already buffered: return it and step the position */
334 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
335 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
336 if (*pos == F_WIN_EOF)
338 buf = f_win_get (spec, *pos, *pos+1, &size);
348 static void regxCodeDel (struct regxCode **pp)
350 struct regxCode *p = *pp;
359 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
363 p = (struct regxCode *) xmalloc (sizeof(*p));
364 p->str = (char *) xmalloc (len+1);
365 memcpy (p->str, buf, len);
370 static struct DFA *lexSpecDFA (void)
375 dfa_parse_cmap_del (dfa, ' ');
376 dfa_parse_cmap_del (dfa, '\t');
377 dfa_parse_cmap_add (dfa, '/', 0);
381 static void actionListDel (struct lexRuleAction **rap)
383 struct lexRuleAction *ra1, *ra;
385 for (ra = *rap; ra; ra = ra1)
391 dfa_delete (&ra->u.pattern.dfa);
394 regxCodeDel (&ra->u.code);
402 static struct lexContext *lexContextCreate (const char *name)
404 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
406 p->name = xstrdup (name);
409 p->dfa = lexSpecDFA ();
412 p->beginActionList = NULL;
413 p->endActionList = NULL;
414 p->initActionList = NULL;
/*
 * lexContextDestroy: release a lexical context - the action list of each
 * rule in p->rules, then the context's begin and end action lists.
 * NOTE(review): p->initActionList is not released in the lines visible
 * here; verify it is freed elsewhere, otherwise it leaks.
 */
419 static void lexContextDestroy (struct lexContext *p)
421 struct lexRule *rp, *rp1;
/* rp1 is the loop's successor variable; its assignment is not visible here */
424 for (rp = p->rules; rp; rp = rp1)
427 actionListDel (&rp->info.actionList);
430 actionListDel (&p->beginActionList);
431 actionListDel (&p->endActionList);
436 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
441 p = (struct lexSpec *) xmalloc (sizeof(*p));
442 p->name = (char *) xmalloc (strlen(name)+1);
443 strcpy (p->name, name);
450 p->context_stack_size = 100;
451 p->context_stack = (struct lexContext **)
452 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
456 p->concatBuf = (struct lexConcatBuf **)
457 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
458 for (i = 0; i < p->maxLevel; i++)
460 p->concatBuf[i] = (struct lexConcatBuf *)
461 xmalloc (sizeof(**p->concatBuf));
462 p->concatBuf[i]->len = p->concatBuf[i]->max = 0;
463 p->concatBuf[i]->buf = 0;
465 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/*
 * lexSpecDestroy: tear down a lexSpec - the per-level concat buffer
 * descriptors, every lexContext in the list, the Tcl interpreter, the
 * file window buffer and the context stack.
 * NOTE(review): only the concatBuf[i] descriptors are freed in the visible
 * lines; the concatBuf[i]->buf storage that execData allocates with
 * xmalloc is not - looks like a leak, confirm against the full function.
 * NOTE(review): no release of d1_stack is visible here either.
 */
470 static void lexSpecDestroy (struct lexSpec **pp)
473 struct lexContext *lt;
481 for (i = 0; i < p->maxLevel; i++)
482 xfree (p->concatBuf[i]);
483 xfree (p->concatBuf);
/* fetch the successor before the context itself is destroyed */
488 struct lexContext *lt_next = lt->next;
489 lexContextDestroy (lt);
494 Tcl_DeleteInterp (p->tcl_interp);
497 xfree (p->f_win_buf);
498 xfree (p->context_stack);
504 static int readParseToken (const char **cpp, int *len)
506 const char *cp = *cpp;
510 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
539 if (*cp >= 'a' && *cp <= 'z')
541 else if (*cp >= 'A' && *cp <= 'Z')
542 cmd[i] = *cp + 'a' - 'A';
545 if (i < (int) sizeof(cmd)-2)
552 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
554 while (*cp && *cp != ' ' && *cp != '\t' &&
555 *cp != '\n' && *cp != '\r')
561 if (!strcmp (cmd, "begin"))
563 else if (!strcmp (cmd, "end"))
565 else if (!strcmp (cmd, "body"))
567 else if (!strcmp (cmd, "context"))
569 else if (!strcmp (cmd, "init"))
573 logf (LOG_WARN, "bad command %s", cmd);
579 static int actionListMk (struct lexSpec *spec, const char *s,
580 struct lexRuleAction **ap)
586 while ((tok = readParseToken (&s, &len)))
594 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
596 regxCodeMk (&(*ap)->u.code, s, len);
600 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
602 (*ap)->u.pattern.body = bodyMark;
604 (*ap)->u.pattern.dfa = lexSpecDFA ();
606 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
611 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
614 dfa_mkstate ((*ap)->u.pattern.dfa);
618 logf (LOG_WARN, "cannot use BEGIN here");
621 logf (LOG_WARN, "cannot use INIT here");
624 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
634 int readOneSpec (struct lexSpec *spec, const char *s)
638 struct lexContext *lc;
640 tok = readParseToken (&s, &len);
641 if (tok == REGX_CONTEXT)
643 char context_name[32];
644 tok = readParseToken (&s, &len);
645 if (tok != REGX_CODE)
647 logf (LOG_WARN, "missing name after CONTEXT keyword");
652 memcpy (context_name, s, len);
653 context_name[len] = '\0';
654 lc = lexContextCreate (context_name);
655 lc->next = spec->context;
660 spec->context = lexContextCreate ("main");
665 actionListDel (&spec->context->beginActionList);
666 actionListMk (spec, s, &spec->context->beginActionList);
669 actionListDel (&spec->context->endActionList);
670 actionListMk (spec, s, &spec->context->endActionList);
673 actionListDel (&spec->context->initActionList);
674 actionListMk (spec, s, &spec->context->initActionList);
678 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
680 r = dfa_parse (spec->context->dfa, &s);
683 logf (LOG_WARN, "regular expression error. r=%d", r);
688 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
692 rp = (struct lexRule *) xmalloc (sizeof(*rp));
693 rp->info.no = spec->context->ruleNo++;
694 rp->next = spec->context->rules;
695 spec->context->rules = rp;
696 actionListMk (spec, s, &rp->info.actionList);
701 int readFileSpec (struct lexSpec *spec)
703 struct lexContext *lc;
706 int c, i, errors = 0;
709 lineBuf = (char *) xmalloc (1+lineSize);
711 if (spec->tcl_interp)
713 sprintf (lineBuf, "%s.tflt", spec->name);
714 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), lineBuf, "r");
719 sprintf (lineBuf, "%s.flt", spec->name);
720 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), lineBuf, "r");
724 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
728 logf (LOG_LOG, "reading regx filter %s", lineBuf);
730 if (spec->tcl_interp)
731 logf (LOG_LOG, "Tcl enabled");
738 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
740 while (c != '\n' && c != EOF)
761 if (c != ' ' && c != '\t')
770 readOneSpec (spec, lineBuf);
771 spec->lineNo += addLine;
780 debug_dfa_followpos = 1;
783 for (lc = spec->context; lc; lc = lc->next)
786 lc->fastRule = (struct lexRuleInfo **)
787 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
788 for (i = 0; i < lc->ruleNo; i++)
789 lc->fastRule[i] = NULL;
790 for (rp = lc->rules; rp; rp = rp->next)
791 lc->fastRule[rp->info.no] = &rp->info;
792 dfa_mkstate (lc->dfa);
801 static struct lexSpec *curLexSpec = NULL;
/*
 * execData: add elen bytes of text at ebuf to the data node at the
 * current level (spec->d1_stack[spec->d1_level]). The bytes are appended
 * to spec->concatBuf[d1_level] and the node's u.data.len grows by elen.
 * A new DATA1N_data / DATA1I_text node is created with data1_mk_node and
 * linked under the parent taken from d1_stack[d1_level - 1].
 * NOTE(review): the lines deciding when a new node is made, and which of
 * the u.data.data assignments below applies, are not visible in this
 * extract (they look mutually exclusive) - read the full function before
 * relying on this summary.
 */
804 static void execData (struct lexSpec *spec,
805 const char *ebuf, int elen, int formatted_text)
807 struct data1_node *res, *parent;
810 if (elen == 0) /* shouldn't happen, but it does! */
814 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
815 ebuf, 15, ebuf + elen-15);
817 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
819 logf (LOG_DEBUG, "data (%d bytes)", elen);
822 if (spec->d1_level <= 1)
825 parent = spec->d1_stack[spec->d1_level -1];
/* an existing data node at this level: remember how much text it holds */
828 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
829 org_len = res->u.data.len;
834 res = data1_mk_node (spec->dh, spec->m);
835 res->parent = parent;
836 res->which = DATA1N_data;
837 res->u.data.what = DATA1I_text;
839 res->u.data.formatted_text = formatted_text;
841 if (elen > DATA1_LOCALDATA)
842 res->u.data.data = nmem_malloc (spec->m, elen);
844 res->u.data.data = res->lbuf;
845 memcpy (res->u.data.data, ebuf, elen);
847 res->u.data.data = 0;
849 res->root = parent->root;
851 parent->last_child = res;
852 if (spec->d1_stack[spec->d1_level])
853 spec->d1_stack[spec->d1_level]->next = res;
856 spec->d1_stack[spec->d1_level] = res;
/* grow the concat buffer (with 256 bytes of slack) when the new text
   would not fit, carrying over the org_len bytes already collected */
858 if (org_len + elen >= spec->concatBuf[spec->d1_level]->max)
860 char *old_buf, *new_buf;
862 spec->concatBuf[spec->d1_level]->max = org_len + elen + 256;
863 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level]->max);
864 if ((old_buf = spec->concatBuf[spec->d1_level]->buf))
866 memcpy (new_buf, old_buf, org_len);
869 spec->concatBuf[spec->d1_level]->buf = new_buf;
871 assert (spec->concatBuf[spec->d1_level]);
/* append the new text after what was collected before */
872 memcpy (spec->concatBuf[spec->d1_level]->buf + org_len, ebuf, elen);
873 res->u.data.len += elen;
876 static void execDataP (struct lexSpec *spec,
877 const char *ebuf, int elen, int formatted_text)
879 execData (spec, ebuf, elen, formatted_text);
/*
 * tagDataRelease: finish the text node at the current level. If
 * d1_stack[d1_level] is a DATA1N_data node holding DATA1I_text, give it
 * storage (the node's lbuf when the text fits in DATA1_LOCALDATA bytes,
 * nmem memory otherwise) and copy the text from concatBuf[d1_level]->buf.
 * The asserts require that the node has no data pointer yet and a
 * positive length.
 * NOTE(review): the tail of the memcpy call is not visible in this extract.
 */
882 static void tagDataRelease (struct lexSpec *spec)
886 if ((res = spec->d1_stack[spec->d1_level]) &&
887 res->which == DATA1N_data &&
888 res->u.data.what == DATA1I_text)
890 assert (!res->u.data.data);
891 assert (res->u.data.len > 0);
892 if (res->u.data.len > DATA1_LOCALDATA)
893 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
895 res->u.data.data = res->lbuf;
896 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level]->buf,
/*
 * variantBegin: open a variant node below the current parent. class_str,
 * type_str and value_str are length-counted (not NUL-terminated) strings;
 * class and type are copied into local buffers, truncated to
 * DATA1_MAX_SYMBOL-1 characters, and the value is copied into the new
 * node's lbuf, truncated to DATA1_LOCALDATA-1 characters.
 * NOTE(review): parent is initialised from d1_stack[d1_level - 1] before
 * the d1_level == 0 guard below, so with d1_level == 0 the initialiser
 * reads d1_stack[-1] - consider assigning parent after the guard.
 * NOTE(review): d1_level is incremented without a visible bound check
 * against the size of d1_stack - confirm against the full function.
 */
901 static void variantBegin (struct lexSpec *spec,
902 const char *class_str, int class_len,
903 const char *type_str, int type_len,
904 const char *value_str, int value_len)
906 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
907 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
912 if (spec->d1_level == 0)
914 logf (LOG_WARN, "in variant begin. No record type defined");
/* bounded, NUL-terminated copy of the class name */
917 if (class_len >= DATA1_MAX_SYMBOL)
918 class_len = DATA1_MAX_SYMBOL-1;
919 memcpy (tclass, class_str, class_len);
920 tclass[class_len] = '\0';
/* bounded, NUL-terminated copy of the type name */
922 if (type_len >= DATA1_MAX_SYMBOL)
923 type_len = DATA1_MAX_SYMBOL-1;
924 memcpy (ttype, type_str, type_len);
925 ttype[type_len] = '\0';
928 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype,
933 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
937 if (parent->which != DATA1N_variant)
939 res = data1_mk_node (spec->dh, spec->m);
940 res->parent = parent;
941 res->which = DATA1N_variant;
942 res->u.variant.type = 0;
943 res->u.variant.value = 0;
944 res->root = parent->root;
946 parent->last_child = res;
947 if (spec->d1_stack[spec->d1_level])
949 tagDataRelease (spec);
950 spec->d1_stack[spec->d1_level]->next = res;
954 spec->d1_stack[spec->d1_level] = res;
955 spec->d1_stack[++(spec->d1_level)] = NULL;
/* walk up through the enclosing variant nodes comparing their type to tp */
957 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
958 if (spec->d1_stack[i]->u.variant.type == tp)
965 logf (LOG_DEBUG, "variant node (%d)", spec->d1_level);
967 parent = spec->d1_stack[spec->d1_level-1];
968 res = data1_mk_node (spec->dh, spec->m);
969 res->parent = parent;
970 res->which = DATA1N_variant;
971 res->root = parent->root;
972 res->u.variant.type = tp;
/* the variant value lives in the node's own lbuf, truncated to fit */
974 if (value_len >= DATA1_LOCALDATA)
975 value_len =DATA1_LOCALDATA-1;
976 memcpy (res->lbuf, value_str, value_len);
977 res->lbuf[value_len] = '\0';
979 res->u.variant.value = res->lbuf;
981 parent->last_child = res;
982 if (spec->d1_stack[spec->d1_level])
984 tagDataRelease (spec);
985 spec->d1_stack[spec->d1_level]->next = res;
989 spec->d1_stack[spec->d1_level] = res;
990 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagStrip: scan the length-counted tag (*tag, *len) for white space at
 * its end and at its start. How *tag and *len are adjusted afterwards is
 * on lines not visible in this extract.
 * NOTE(review): isspace is called with a plain char; where char is signed
 * a byte >= 0x80 is passed as a negative value, which the C standard does
 * not allow - cast to unsigned char.
 */
993 static void tagStrip (const char **tag, int *len)
/* step back over trailing white space */
997 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
/* step forward over leading white space */
1000 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/*
 * tagBegin: open a new DATA1N_tag node named by the length-counted string
 * (tag, len) below the node at d1_stack[d1_level - 1], link it into the
 * tree and push a new, empty level onto d1_stack.
 * The tag name is stripped with tagStrip and stored NUL-terminated in the
 * node's lbuf, or in nmem memory when len >= DATA1_LOCALDATA.
 * NOTE(review): parent and partag are initialised before the
 * d1_level == 0 guard below, so with d1_level == 0 the initialiser reads
 * d1_stack[-1] and hands that value to get_parent_tag - consider moving
 * both assignments after the guard.
 * NOTE(review): d1_level is incremented without a visible bound check
 * against the size of d1_stack - confirm against the full function.
 */
1006 static void tagBegin (struct lexSpec *spec,
1007 const char *tag, int len)
1009 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
1010 data1_element *elem = NULL;
1011 data1_node *partag = get_parent_tag(spec->dh, parent);
1013 data1_element *e = NULL;
1016 if (spec->d1_level == 0)
1018 logf (LOG_WARN, "in element begin. No record type defined");
1021 tagStrip (&tag, &len);
1023 res = data1_mk_node (spec->dh, spec->m);
1024 res->parent = parent;
1025 res->which = DATA1N_tag;
1026 res->u.tag.get_bytes = -1;
/* len+1 bytes are needed for the terminator, hence >= rather than > */
1028 if (len >= DATA1_LOCALDATA)
1029 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
1031 res->u.tag.tag = res->lbuf;
1033 memcpy (res->u.tag.tag, tag, len);
1034 res->u.tag.tag[len] = '\0';
1037 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1039 if (parent->which == DATA1N_variant)
1042 if (!(e = partag->u.tag.element))
1045 elem = data1_getelementbytagname (spec->dh,
1046 spec->d1_stack[0]->u.root.absyn,
1048 res->u.tag.element = elem;
1049 res->u.tag.node_selected = 0;
1050 res->u.tag.make_variantlist = 0;
1051 res->u.tag.no_data_requested = 0;
1052 res->root = parent->root;
1054 parent->last_child = res;
/* a node already exists at this level: finish its pending text and
   chain the new node after it */
1055 if (spec->d1_stack[spec->d1_level])
1057 tagDataRelease (spec);
1058 spec->d1_stack[spec->d1_level]->next = res;
1061 parent->child = res;
/* the new node becomes current at this level; the level below starts empty */
1062 spec->d1_stack[spec->d1_level] = res;
1063 spec->d1_stack[++(spec->d1_level)] = NULL;
1066 static void tagEnd (struct lexSpec *spec, int min_level,
1067 const char *tag, int len)
1069 tagStrip (&tag, &len);
1070 while (spec->d1_level > min_level)
1072 tagDataRelease (spec);
1074 if (spec->d1_level == 0)
1076 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1078 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1080 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1084 logf (LOG_DEBUG, "end tag (%d)", spec->d1_level);
1089 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1092 struct DFA_state *state = dfa->states[0];
1095 unsigned char c_prev = 0;
1096 int ptr = *pptr; /* current pointer */
1097 int start_ptr = *pptr; /* first char of match */
1098 int last_ptr = 0; /* last char of match */
1099 int last_rule = 0; /* rule number of current match */
1104 c = f_win_advance (spec, &ptr);
1105 if (ptr == F_WIN_EOF)
1122 *mptr = start_ptr; /* match starts here */
1123 *pptr = last_ptr; /* match end here (+1) */
1126 state = dfa->states[0];
1131 else if (c >= t->ch[0] && c <= t->ch[1])
1133 state = dfa->states[t->to];
1138 last_rule = state->rule_no;
1143 last_rule = state->rule_nno;
1155 static int execTok (struct lexSpec *spec, const char **src,
1156 const char **tokBuf, int *tokLen)
1158 const char *s = *src;
1160 while (*s == ' ' || *s == '\t')
1164 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1168 while (*s >= '0' && *s <= '9')
1169 n = n*10 + (*s++ -'0');
1170 if (spec->arg_no == 0)
1177 if (n >= spec->arg_no)
1179 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1183 else if (*s == '\"')
1186 while (*s && *s != '\"')
1188 *tokLen = s - *tokBuf;
1193 else if (*s == '\n' || *s == ';')
1201 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1204 *tokLen = s - *tokBuf;
1211 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1214 *tokLen = s - *tokBuf;
1220 static char *regxStrz (const char *src, int len, char *str)
1224 memcpy (str, src, len);
1230 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1231 int argc, char **argv)
1233 struct lexSpec *spec = (struct lexSpec *) clientData;
1236 if (!strcmp(argv[1], "record") && argc == 3)
1238 char *absynName = argv[2];
1242 logf (LOG_DEBUG, "begin record %s", absynName);
1244 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1245 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1250 res = data1_mk_node (spec->dh, spec->m);
1251 res->which = DATA1N_root;
1252 res->u.root.type = absynName;
1253 res->u.root.absyn = absyn;
1256 spec->d1_stack[spec->d1_level] = res;
1257 spec->d1_stack[++(spec->d1_level)] = NULL;
1260 else if (!strcmp(argv[1], "element") && argc == 3)
1262 tagBegin (spec, argv[2], strlen(argv[2]));
1264 else if (!strcmp (argv[1], "variant") && argc == 5)
1266 variantBegin (spec, argv[2], strlen(argv[2]),
1267 argv[3], strlen(argv[3]),
1268 argv[4], strlen(argv[4]));
1270 else if (!strcmp (argv[1], "context") && argc == 3)
1272 struct lexContext *lc = spec->context;
1274 logf (LOG_DEBUG, "begin context %s",argv[2]);
1276 while (lc && strcmp (argv[2], lc->name))
1280 spec->context_stack[++(spec->context_stack_top)] = lc;
1283 logf (LOG_WARN, "unknown context %s", argv[2]);
1290 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1291 int argc, char **argv)
1293 struct lexSpec *spec = (struct lexSpec *) clientData;
1297 if (!strcmp (argv[1], "record"))
1299 while (spec->d1_level)
1301 tagDataRelease (spec);
1305 logf (LOG_DEBUG, "end record");
1307 spec->stop_flag = 1;
1309 else if (!strcmp (argv[1], "element"))
1313 if (argc >= 3 && !strcmp(argv[2], "-record"))
1322 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1323 if (spec->d1_level == 0)
1326 logf (LOG_DEBUG, "end element end records");
1328 spec->stop_flag = 1;
1331 else if (!strcmp (argv[1], "context"))
1334 logf (LOG_DEBUG, "end context");
1336 if (spec->context_stack_top)
1337 (spec->context_stack_top)--;
1344 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1345 int argc, char **argv)
1349 const char *element = 0;
1350 struct lexSpec *spec = (struct lexSpec *) clientData;
1354 if (!strcmp("-text", argv[argi]))
1359 else if (!strcmp("-element", argv[argi]))
1363 element = argv[argi++];
1369 tagBegin (spec, element, strlen(element));
1373 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1377 tagEnd (spec, 1, NULL, 0);
/*
 * cmd_tcl_unread: Tcl command handler; clientData is the lexSpec.
 * Moves the read position spec->ptr to the start of matched segment
 * number `no` plus an offset. An "-offset" argument is recognised and the
 * offset is parsed with atoi; the segment index is the atoi of an
 * argument. An index >= spec->arg_no is clamped to the last segment.
 * NOTE(review): atoi reports no errors, and the index is only clamped
 * from above - a negative index, or arg_no == 0, would index arg_start
 * below 0 unless lines not visible here guard against it.
 */
1381 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1382 int argc, char **argv)
1384 struct lexSpec *spec = (struct lexSpec *) clientData;
1391 if (!strcmp("-offset", argv[argi]))
1396 offset = atoi(argv[argi]);
1405 no = atoi(argv[argi]);
1406 if (no >= spec->arg_no)
1407 no = spec->arg_no - 1;
1408 spec->ptr = spec->arg_start[no] + offset;
/*
 * execTcl: run the Tcl script code->str in spec->tcl_interp. Beforehand
 * each of the spec->arg_no matched segments is exported as a Tcl variable
 * whose name is the segment index in decimal ("0", "1", ...). A failing
 * script is logged at LOG_FATAL with the interpreter's error line, its
 * result and the errorInfo variable.
 * NOTE(review): the segment is NUL-terminated in place by writing to
 * var_buf[var_len]; when a segment ends exactly at the end of the window
 * buffer that index may lie one byte past it - confirm the buffer has
 * room.
 * NOTE(review): errorLine and result are read straight from the Tcl_Interp
 * struct; newer Tcl releases deprecate this in favour of accessor
 * functions - verify against the Tcl version in use.
 */
1412 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1415 for (i = 0; i < spec->arg_no; i++)
1417 char var_name[10], *var_buf;
1420 sprintf (var_name, "%d", i);
1421 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* save the byte after the segment, terminate for Tcl, then restore it */
1425 ch = var_buf[var_len];
1426 var_buf[var_len] = '\0';
1427 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1428 var_buf[var_len] = ch;
1431 if (Tcl_Eval (spec->tcl_interp, code->str) != TCL_OK)
1433 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1434 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1435 spec->tcl_interp->errorLine,
1436 spec->tcl_interp->result,
1437 err ? err : "[NO ERRORINFO]");
1444 static void execCode (struct lexSpec *spec, struct regxCode *code)
1446 const char *s = code->str;
1448 const char *cmd_str;
1450 r = execTok (spec, &s, &cmd_str, &cmd_len);
1457 r = execTok (spec, &s, &cmd_str, &cmd_len);
1460 p = regxStrz (cmd_str, cmd_len, ptmp);
1461 if (!strcmp (p, "begin"))
1463 r = execTok (spec, &s, &cmd_str, &cmd_len);
1466 logf (LOG_WARN, "missing keyword after 'begin'");
1469 p = regxStrz (cmd_str, cmd_len, ptmp);
1470 if (!strcmp (p, "record"))
1472 r = execTok (spec, &s, &cmd_str, &cmd_len);
1475 if (spec->d1_level == 0)
1477 static char absynName[64];
1482 memcpy (absynName, cmd_str, cmd_len);
1483 absynName[cmd_len] = '\0';
1486 logf (LOG_DEBUG, "begin record %s", absynName);
1488 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1489 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1494 res = data1_mk_node (spec->dh, spec->m);
1495 res->which = DATA1N_root;
1496 res->u.root.type = absynName;
1497 res->u.root.absyn = absyn;
1500 spec->d1_stack[spec->d1_level] = res;
1501 spec->d1_stack[++(spec->d1_level)] = NULL;
1504 r = execTok (spec, &s, &cmd_str, &cmd_len);
1506 else if (!strcmp (p, "element"))
1508 r = execTok (spec, &s, &cmd_str, &cmd_len);
1511 tagBegin (spec, cmd_str, cmd_len);
1512 r = execTok (spec, &s, &cmd_str, &cmd_len);
1514 else if (!strcmp (p, "variant"))
1517 const char *class_str = NULL;
1519 const char *type_str = NULL;
1521 const char *value_str = NULL;
1522 r = execTok (spec, &s, &cmd_str, &cmd_len);
1525 class_str = cmd_str;
1526 class_len = cmd_len;
1527 r = execTok (spec, &s, &cmd_str, &cmd_len);
1533 r = execTok (spec, &s, &cmd_str, &cmd_len);
1536 value_str = cmd_str;
1537 value_len = cmd_len;
1539 variantBegin (spec, class_str, class_len,
1540 type_str, type_len, value_str, value_len);
1543 r = execTok (spec, &s, &cmd_str, &cmd_len);
1545 else if (!strcmp (p, "context"))
1549 struct lexContext *lc = spec->context;
1550 r = execTok (spec, &s, &cmd_str, &cmd_len);
1551 p = regxStrz (cmd_str, cmd_len, ptmp);
1553 logf (LOG_DEBUG, "begin context %s", p);
1555 while (lc && strcmp (p, lc->name))
1558 spec->context_stack[++(spec->context_stack_top)] = lc;
1560 logf (LOG_WARN, "unknown context %s", p);
1563 r = execTok (spec, &s, &cmd_str, &cmd_len);
1567 logf (LOG_WARN, "bad keyword '%s' after begin", p);
1570 else if (!strcmp (p, "end"))
1572 r = execTok (spec, &s, &cmd_str, &cmd_len);
1575 logf (LOG_WARN, "missing keyword after 'end'");
1578 p = regxStrz (cmd_str, cmd_len, ptmp);
1579 if (!strcmp (p, "record"))
1581 while (spec->d1_level)
1583 tagDataRelease (spec);
1586 r = execTok (spec, &s, &cmd_str, &cmd_len);
1588 logf (LOG_DEBUG, "end record");
1590 spec->stop_flag = 1;
1592 else if (!strcmp (p, "element"))
1595 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1597 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1602 tagEnd (spec, min_level, cmd_str, cmd_len);
1603 r = execTok (spec, &s, &cmd_str, &cmd_len);
1606 tagEnd (spec, min_level, NULL, 0);
1607 if (spec->d1_level == 0)
1610 logf (LOG_DEBUG, "end element end records");
1612 spec->stop_flag = 1;
1616 else if (!strcmp (p, "context"))
1619 logf (LOG_DEBUG, "end context");
1621 if (spec->context_stack_top)
1622 (spec->context_stack_top)--;
1623 r = execTok (spec, &s, &cmd_str, &cmd_len);
1626 logf (LOG_WARN, "bad keyword '%s' after end", p);
1628 else if (!strcmp (p, "data"))
1632 const char *element_str = NULL;
1634 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1636 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1638 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1640 r = execTok (spec, &s, &element_str, &element_len);
1645 logf (LOG_WARN, "bad data option: %.*s",
1650 logf (LOG_WARN, "missing data item after data");
1654 tagBegin (spec, element_str, element_len);
1657 execData (spec, cmd_str, cmd_len,textFlag);
1658 r = execTok (spec, &s, &cmd_str, &cmd_len);
1661 tagEnd (spec, 1, NULL, 0);
1663 else if (!strcmp (p, "unread"))
1666 r = execTok (spec, &s, &cmd_str, &cmd_len);
1667 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1669 r = execTok (spec, &s, &cmd_str, &cmd_len);
1672 logf (LOG_WARN, "missing number after -offset");
1675 p = regxStrz (cmd_str, cmd_len, ptmp);
1677 r = execTok (spec, &s, &cmd_str, &cmd_len);
1683 logf (LOG_WARN, "missing index after unread command");
1686 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1688 logf (LOG_WARN, "bad index after unread command");
1693 no = *cmd_str - '0';
1694 if (no >= spec->arg_no)
1695 no = spec->arg_no - 1;
1696 spec->ptr = spec->arg_start[no] + offset;
1698 r = execTok (spec, &s, &cmd_str, &cmd_len);
1700 else if (!strcmp (p, "context"))
1704 struct lexContext *lc = spec->context;
1705 r = execTok (spec, &s, &cmd_str, &cmd_len);
1706 p = regxStrz (cmd_str, cmd_len, ptmp);
1708 while (lc && strcmp (p, lc->name))
1711 spec->context_stack[spec->context_stack_top] = lc;
1713 logf (LOG_WARN, "unknown context %s", p);
1716 r = execTok (spec, &s, &cmd_str, &cmd_len);
1720 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1721 r = execTok (spec, &s, &cmd_str, &cmd_len);
1726 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1728 r = execTok (spec, &s, &cmd_str, &cmd_len);
1735 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1736 int start_ptr, int *pptr)
1745 arg_start[0] = start_ptr;
1747 spec->arg_start = arg_start;
1748 spec->arg_end = arg_end;
1755 if (ap->u.pattern.body)
1757 arg_start[arg_no] = *pptr;
1758 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1760 arg_end[arg_no] = F_WIN_EOF;
1762 arg_start[arg_no] = F_WIN_EOF;
1763 arg_end[arg_no] = F_WIN_EOF;
1768 arg_end[arg_no] = sptr;
1770 arg_start[arg_no] = sptr;
1771 arg_end[arg_no] = *pptr;
1776 arg_start[arg_no] = *pptr;
1777 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1779 if (sptr != arg_start[arg_no])
1781 arg_end[arg_no] = *pptr;
1786 spec->arg_no = arg_no;
1789 if (spec->tcl_interp)
1790 execTcl(spec, ap->u.code);
1792 execCode (spec, ap->u.code);
1794 execCode (spec, ap->u.code);
1797 if (spec->stop_flag)
1801 arg_start[arg_no] = *pptr;
1802 arg_end[arg_no] = F_WIN_EOF;
1811 static int execRule (struct lexSpec *spec, struct lexContext *context,
1812 int ruleNo, int start_ptr, int *pptr)
1815 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1817 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: scan the input starting at *ptr with the DFA of the context on
 * top of spec->context_stack.  When a rule has matched, any unmatched
 * characters in front of the match are passed to execDataP and the rule is
 * run through execRule.
 *
 *   spec - lexer state (context stack, input window callbacks).
 *   ptr  - in/out: current input position; compared against F_WIN_EOF.
 *
 * NOTE(review): the loop structure, several assignments and the return
 * statements are not visible in this excerpt, so the return value is not
 * documented here.
 */
1821 data1_node *lexNode (struct lexSpec *spec, int *ptr)
/* start in state 0 of the current context's DFA */
1823 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1824 struct DFA_state *state = context->dfa->states[0];
/* previous character, initialised to newline */
1827 unsigned char c_prev = '\n';
1829 int last_rule = 0; /* rule number of current match */
1830 int last_ptr = *ptr; /* last char of match */
1831 int start_ptr = *ptr; /* first char of match */
1832 int skip_ptr = *ptr; /* first char of run */
/* fetch the next input character and advance *ptr */
1836 c = f_win_advance (spec, ptr);
1837 if (*ptr == F_WIN_EOF)
1839 /* end of file met */
1842 /* there was a match */
1843 if (skip_ptr < start_ptr)
1845 /* deal with chars that didn't match */
1848 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1849 execDataP (spec, buf, size, 0);
1851 /* restore pointer */
1854 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1856 /* restore skip pointer */
1860 else if (skip_ptr < *ptr)
1862 /* deal with chars that didn't match */
1865 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1866 execDataP (spec, buf, size, 0);
1868 if (*ptr == F_WIN_EOF)
1875 { /* no transition for character c ... */
1878 if (skip_ptr < start_ptr)
1880 /* deal with chars that didn't match */
1883 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1884 execDataP (spec, buf, size, 0);
1886 /* restore pointer */
1888 if (!execRule (spec, context, last_rule, start_ptr, ptr))
/* invoke the optional end callback with the current position */
1890 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1893 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1895 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* re-read the context from the top of the context stack */
1899 context = spec->context_stack[spec->context_stack_top];
/* start a new match attempt at the current position */
1902 last_ptr = start_ptr = *ptr;
/* NOTE(review): two alternative paths update c_prev while advancing
 * start_ptr; the conditions selecting them are not visible here */
1906 c_prev = f_win_advance (spec, &start_ptr);
1911 c_prev = f_win_advance (spec, &start_ptr);
/* back to the initial DFA state */
1914 state = context->dfa->states[0];
/* c lies within the transition's character range [ch[0], ch[1]] */
1917 else if (c >= t->ch[0] && c <= t->ch[1])
1918 { /* transition ... */
1919 state = context->dfa->states[t->to];
/* remember the rule attached to the new state */
1924 last_rule = state->rule_no;
/* otherwise fall back to rule_nno when it is set */
1927 else if (state->rule_nno)
1929 last_rule = state->rule_nno;
/*
 * lexRoot: top-level driver for lexing one record.
 *
 *   spec         - lexer state; stop_flag and the context stack are reset.
 *   offset       - input offset (its use is not visible in this excerpt).
 *   context_name - name of the context to start in.
 *
 * Looks up the named context among spec->context, runs its init, begin and
 * end action lists around a call to lexNode, and returns spec->d1_stack[0].
 * NOTE(review): the loop that walks the context list, the handling after
 * the "cannot find context" warning and any conditions guarding the
 * init/begin actions are not visible here.
 */
1941 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1942 const char *context_name)
1944 struct lexContext *lt = spec->context;
/* reset per-record state */
1947 spec->stop_flag = 0;
1949 spec->context_stack_top = 0;
/* search for the context named context_name */
1952 if (!strcmp (lt->name, context_name))
1958 logf (LOG_WARN, "cannot find context %s", context_name);
/* the selected context becomes the current stack entry */
1961 spec->context_stack[spec->context_stack_top] = lt;
1962 spec->d1_stack[spec->d1_level] = NULL;
1967 execAction (spec, lt->initActionList, ptr, &ptr);
1970 execAction (spec, lt->beginActionList, ptr, &ptr);
1971 lexNode (spec, &ptr);
/* release tags until d1_level reaches zero */
1972 while (spec->d1_level)
1974 tagDataRelease (spec);
1977 execAction (spec, lt->endActionList, ptr, &ptr);
1978 return spec->d1_stack[0];
/*
 * grs_destroy: tear down filter state.
 * clientData is treated as a struct lexSpecs; its spec member is destroyed
 * with lexSpecDestroy.
 * NOTE(review): whether the lexSpecs container itself is freed is not
 * visible in this excerpt.
 */
1981 void grs_destroy(void *clientData)
1983 struct lexSpecs *specs = (struct lexSpecs *) clientData;
1986 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocate a struct lexSpecs with xmalloc.
 * NOTE(review): the initialisation of its members and the return statement
 * are not visible in this excerpt.
 */
1991 void *grs_init(void)
1993 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/*
 * grs_read_regx: read one record with the regx filter.
 *
 *   p - read request; the visible code uses p->clientData, p->type, p->dh,
 *       p->readf, p->seekf, p->fh, p->endf, p->mem and p->offset.
 *
 * Returns the result of lexRoot started in context "main" at p->offset.
 * NOTE(review): the check of readFileSpec's result and the failure return
 * are not visible in this excerpt.
 */
1998 data1_node *grs_read_regx (struct grs_read_info *p)
2001 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2002 struct lexSpec **curLexSpec = &specs->spec;
2005 logf (LOG_DEBUG, "grs_read_regx");
/* (re)build the spec when none is cached or its name differs from p->type */
2007 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2010 lexSpecDestroy (curLexSpec);
2011 *curLexSpec = lexSpecCreate (p->type, p->dh);
2012 res = readFileSpec (*curLexSpec);
/* presumably reached only when readFileSpec failed -- condition not visible */
2015 lexSpecDestroy (curLexSpec);
2019 (*curLexSpec)->dh = p->dh;
/* reset the input window and bind it to the caller's read/seek/end callbacks */
2022 (*curLexSpec)->f_win_start = 0;
2023 (*curLexSpec)->f_win_end = 0;
2024 (*curLexSpec)->f_win_rf = p->readf;
2025 (*curLexSpec)->f_win_sf = p->seekf;
2026 (*curLexSpec)->f_win_fh = p->fh;
2027 (*curLexSpec)->f_win_ef = p->endf;
/* window size; NOTE(review): units are not shown here (presumably bytes) */
2028 (*curLexSpec)->f_win_size = 500000;
2030 (*curLexSpec)->m = p->mem;
2031 return lexRoot (*curLexSpec, p->offset, "main");
/*
 * Descriptor for the regx filter and the public handle that points at it.
 * The handle must be initialised with the address of regx_type, in the
 * same way recTypeGrs_tcl is initialised with &tcl_type.
 * NOTE(review): the initializer members of regx_type are not visible in
 * this excerpt.
 */
2034 static struct recTypeGrs regx_type = {
2041 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: read one record with the Tcl filter.
 *
 *   p - read request; the visible code uses p->clientData, p->type, p->dh,
 *       p->readf, p->seekf, p->fh, p->endf, p->mem and p->offset.
 *
 * Differs from grs_read_regx in that a freshly created spec gets a Tcl
 * interpreter with the commands "begin", "end", "data" and "unread"
 * registered on it.  Returns the result of lexRoot started in context
 * "main" at p->offset.
 * NOTE(review): the check of readFileSpec's result, the failure return and
 * the tail of the last Tcl_CreateCommand call are not visible here.
 */
2044 data1_node *grs_read_tcl (struct grs_read_info *p)
2047 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2048 struct lexSpec **curLexSpec = &specs->spec;
2051 logf (LOG_DEBUG, "grs_read_tcl");
/* (re)build the spec when none is cached or its name differs from p->type */
2053 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2055 Tcl_Interp *tcl_interp;
2057 lexSpecDestroy (curLexSpec);
2058 *curLexSpec = lexSpecCreate (p->type, p->dh);
/* attach a new interpreter; the spec is passed as client data to each command */
2059 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
2060 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2061 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2062 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
2063 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2065 res = readFileSpec (*curLexSpec);
/* presumably reached only when readFileSpec failed -- condition not visible */
2068 lexSpecDestroy (curLexSpec);
2072 (*curLexSpec)->dh = p->dh;
/* reset the input window and bind it to the caller's read/seek/end callbacks */
2075 (*curLexSpec)->f_win_start = 0;
2076 (*curLexSpec)->f_win_end = 0;
2077 (*curLexSpec)->f_win_rf = p->readf;
2078 (*curLexSpec)->f_win_sf = p->seekf;
2079 (*curLexSpec)->f_win_fh = p->fh;
2080 (*curLexSpec)->f_win_ef = p->endf;
/* window size; NOTE(review): units are not shown here (presumably bytes) */
2081 (*curLexSpec)->f_win_size = 500000;
2083 (*curLexSpec)->m = p->mem;
2084 return lexRoot (*curLexSpec, p->offset, "main");
/*
 * Descriptor for the Tcl filter and the public handle that points at it.
 * NOTE(review): the initializer members of tcl_type are not visible in
 * this excerpt.
 */
2087 static struct recTypeGrs tcl_type = {
2094 RecTypeGrs recTypeGrs_tcl = &tcl_type;