2 * Copyright (C) 1994-1999, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.31 1999-07-14 13:05:29 adam
8 * Tcl filter works with objects when TCL is version 8 or later; filter
9 * works with strings otherwise (slow).
11 * Revision 1.30 1999/07/14 10:55:28 adam
14 * Revision 1.29 1999/07/12 07:27:54 adam
15 * Improved speed of Tcl processing. Fixed one memory leak.
17 * Revision 1.28 1999/07/06 12:26:04 adam
18 * Fixed filters so that MS-DOS CR is ignored.
20 * Revision 1.27 1999/06/28 13:25:40 quinn
21 * Improved diagnostics for Tcl
23 * Revision 1.26 1999/05/26 07:49:14 adam
26 * Revision 1.25 1999/05/25 12:33:32 adam
27 * Fixed bug in Tcl filter.
29 * Revision 1.24 1999/05/21 11:08:46 adam
30 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
31 * script so that it reads uninstalled Tcl source.
33 * Revision 1.23 1999/05/20 12:57:18 adam
34 * Implemented TCL filter. Updated recctrl system.
36 * Revision 1.22 1998/11/03 16:07:13 adam
39 * Revision 1.21 1998/11/03 15:43:39 adam
40 * Fixed bug introduced by previous commit.
42 * Revision 1.20 1998/11/03 14:51:28 adam
43 * Changed code so that it creates as few data1 nodes as possible.
45 * Revision 1.19 1998/11/03 10:22:39 adam
46 * Fixed memory leak that could occur when large data1 nodes were
47 * concatenated. Data-type data1_nodes may have multiple nodes.
49 * Revision 1.18 1998/10/15 13:11:47 adam
50 * Added support for option -record for "end element". When specified
51 * end element will mark end-of-record when at outer-level.
53 * Revision 1.17 1998/07/01 10:13:51 adam
56 * Revision 1.16 1998/06/30 15:15:09 adam
57 * Tags are trimmed: white space is removed before and after the tag.
59 * Revision 1.15 1998/06/30 12:55:45 adam
62 * Revision 1.14 1998/03/05 08:41:00 adam
63 * Implemented rule contexts.
65 * Revision 1.13 1997/12/12 06:33:58 adam
66 * Fixed bug that showed up when multiple filters were used.
67 * Made one routine thread-safe.
69 * Revision 1.12 1997/11/18 10:03:24 adam
70 * Member num_children removed from data1_node.
72 * Revision 1.11 1997/11/06 11:41:01 adam
73 * Implemented "begin variant" for the sgml.regx filter.
75 * Revision 1.10 1997/10/31 12:36:12 adam
76 * Minor change that avoids compiler warning.
78 * Revision 1.9 1997/09/29 09:02:49 adam
79 * Fixed small bug (introduced by previous commit).
81 * Revision 1.8 1997/09/17 12:19:22 adam
82 * Zebra version corresponds to YAZ version 1.4.
83 * Changed Zebra server so that it doesn't depend on global common_resource.
85 * Revision 1.7 1997/07/15 16:33:07 adam
86 * Check for zero length in execData.
88 * Revision 1.6 1997/02/24 10:41:51 adam
89 * Cleanup of code and commented out the "end element-end-record" code.
91 * Revision 1.5 1997/02/19 16:22:33 adam
92 * Fixed "end element" to terminate record in outer-most level.
94 * Revision 1.4 1997/02/12 20:42:58 adam
95 * Changed some log messages.
97 * Revision 1.3 1996/11/08 14:05:33 adam
98 * Bug fix: data1 node member u.tag.get_bytes weren't initialized.
100 * Revision 1.2 1996/10/29 14:02:09 adam
101 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
102 * data1_get_tabpath is used.
104 * Revision 1.1 1996/10/11 10:57:30 adam
105 * New module recctrl. Used to manage records (extract/retrieval).
107 * Revision 1.24 1996/06/17 14:25:31 adam
108 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
110 * Revision 1.23 1996/06/04 10:19:00 adam
111 * Minor changes - removed include of ctype.h.
113 * Revision 1.22 1996/06/03 15:23:13 adam
114 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
116 * Revision 1.21 1996/05/14 16:58:38 adam
119 * Revision 1.20 1996/05/01 13:46:36 adam
120 * First work on multiple records in one file.
121 * New option, -offset, to the "unread" command in the filter module.
123 * Revision 1.19 1996/02/12 16:18:20 adam
124 * Yet another bug fix in implementation of unread command.
126 * Revision 1.18 1996/02/12 16:07:54 adam
127 * Bug fix in new unread command.
129 * Revision 1.17 1996/02/12 15:56:11 adam
130 * New code command: unread.
132 * Revision 1.16 1996/01/17 14:57:51 adam
133 * Prototype changed for reader functions in extract/retrieve. File
134 * is identified by 'void *' instead of 'int'.
136 * Revision 1.15 1996/01/08 19:15:47 adam
137 * New input filter that works!
139 * Revision 1.14 1996/01/08 09:10:38 adam
140 * Yet another complete rework on this module.
142 * Revision 1.13 1995/12/15 17:21:50 adam
143 * This version is able to set data.formatted_text in data1-nodes.
145 * Revision 1.12 1995/12/15 16:20:10 adam
146 * The filter files (*.flt) are read from the path given by data1_tabpath.
148 * Revision 1.11 1995/12/15 12:35:16 adam
151 * Revision 1.10 1995/12/15 10:35:36 adam
154 * Revision 1.9 1995/12/14 16:38:48 adam
155 * Completely new attempt to make regular expression parsing.
157 * Revision 1.8 1995/12/13 17:16:59 adam
160 * Revision 1.7 1995/12/13 16:51:58 adam
161 * Modified to set last_child in data1_nodes.
162 * Uses destroy handler to free up data text nodes.
164 * Revision 1.6 1995/12/13 13:45:37 quinn
165 * Changed data1 to use nmem.
167 * Revision 1.5 1995/12/11 09:12:52 adam
168 * The rec_get function returns NULL if record doesn't exist - will
169 * happen in the server if the result set records have been deleted since
170 * the creation of the set (i.e. the search).
171 * The server saves a result temporarily if it is 'volatile', i.e. the
172 * set is register dependent.
174 * Revision 1.4 1995/12/05 16:57:40 adam
175 * More work on regular patterns.
177 * Revision 1.3 1995/12/05 09:37:09 adam
178 * One malloc was renamed to xmalloc.
180 * Revision 1.2 1995/12/04 17:59:24 adam
181 * More work on regular expression conversion.
183 * Revision 1.1 1995/12/04 14:25:30 adam
184 * Started work on regular expression parsed input to structured records.
193 #include <zebrautl.h>
/* HAVE_TCL_OBJECTS is defined when MAJOR_VERSION is 8 or higher.
   NOTE(review): the matching #endif is on a line not shown in this excerpt. */
200 #if MAJOR_VERSION >= 8
201 #define HAVE_TCL_OBJECTS
/* Sentinel position: the lexer compares read positions against this value
   to recognise end of input. */
207 #define F_WIN_EOF 2000000000
/* Token codes: readParseToken's result is compared against REGX_CONTEXT
   (and REGX_CODE) in readOneSpec; the names also annotate the members of
   struct lexRuleAction. */
211 #define REGX_PATTERN 1
216 #define REGX_CONTEXT 6
/*
 * Data structures of the regx lexer: rule actions, rules, contexts,
 * per-level concatenation buffers and the overall lexer spec.
 * NOTE(review): several declaration lines (struct headers, closing
 * braces, some members) are not shown in this excerpt, so the member
 * grouping cannot be read off from here.
 */
226 struct lexRuleAction {
230 struct DFA *dfa; /* REGX_PATTERN */
233 struct regxCode *code; /* REGX_CODE */
235 struct lexRuleAction *next;
240 struct lexRuleAction *actionList;
244 struct lexRuleInfo info;
245 struct lexRule *next;
251 struct lexRule *rules;
/* built by readFileSpec: indexed by rule number, each slot points at the
   info member of the rule with that number */
252 struct lexRuleInfo **fastRule;
/* action lists that readOneSpec replaces for the current context */
256 struct lexRuleAction *beginActionList;
257 struct lexRuleAction *endActionList;
258 struct lexRuleAction *initActionList;
259 struct lexContext *next;
262 struct lexConcatBuf {
269 struct lexContext *context;
/* lexNode takes its active context from context_stack[context_stack_top];
   lexSpecCreate sizes the array with context_stack_size entries */
271 struct lexContext **context_stack;
272 int context_stack_size;
273 int context_stack_top;
279 Tcl_Interp *tcl_interp;
/* called by lexNode with the file handle and the current position */
282 void (*f_win_ef)(void *, off_t);
284 int f_win_start; /* first byte of buffer is this file offset */
285 int f_win_end; /* last byte of buffer is this offset - 1 */
286 int f_win_size; /* size of buffer */
287 char *f_win_buf; /* buffer itself */
/* called by f_win_get to read bytes into f_win_buf */
288 int (*f_win_rf)(void *, char *, size_t);
/* called by f_win_get with the new window start position */
289 off_t (*f_win_sf)(void *, off_t);
/* one entry per nesting level; lexSpecCreate allocates maxLevel entries */
291 struct lexConcatBuf *concatBuf;
/* node stack indexed by d1_level; lexSpecCreate allocates maxLevel slots */
293 data1_node **d1_stack;
304 struct lexSpec *spec;
/*
 * f_win_get: return a pointer into the spec's window buffer for the file
 * range starting at start_pos and ending before end_pos, and store the
 * number of bytes available in *size (clamped to end_pos - start_pos).
 * NOTE(review): the end of the parameter list, the braces and a few
 * statements are on lines not shown in this excerpt.
 */
307 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
/* off: position of start_pos relative to the first byte held in the window */
310 int i, r, off = start_pos - spec->f_win_start;
/* Case 1: the whole range is already inside the window; no I/O needed */
312 if (off >= 0 && end_pos <= spec->f_win_end)
314 *size = end_pos - start_pos;
315 return spec->f_win_buf + off;
/* Case 2: start_pos is outside the window; reposition the source through
   f_win_sf and fill the buffer from start_pos */
317 if (off < 0 || start_pos >= spec->f_win_end)
319 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
320 spec->f_win_start = start_pos;
/* the buffer is allocated on first use */
322 if (!spec->f_win_buf)
323 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
324 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
326 spec->f_win_end = spec->f_win_start + *size;
/* never report more than the caller asked for */
328 if (*size > end_pos - start_pos)
329 *size = end_pos - start_pos;
330 return spec->f_win_buf;
/* Case 3: start_pos is inside the window but the range runs past its end:
   move the bytes from start_pos onward to the front of the buffer, then
   read more data behind them */
332 for (i = 0; i<spec->f_win_end - start_pos; i++)
333 spec->f_win_buf[i] = spec->f_win_buf[i + off];
334 r = (*spec->f_win_rf)(spec->f_win_fh,
336 spec->f_win_size - i);
337 spec->f_win_start = start_pos;
338 spec->f_win_end += r;
340 if (*size > end_pos - start_pos)
341 *size = end_pos - start_pos;
342 return spec->f_win_buf;
/*
 * f_win_advance: when *pos lies inside the current window, return the byte
 * at that file position and increment *pos. Otherwise, one byte is requested
 * through f_win_get.
 * NOTE(review): what happens after the F_WIN_EOF test and after the
 * f_win_get call is on lines not shown in this excerpt.
 */
345 static int f_win_advance (struct lexSpec *spec, int *pos)
/* fast path: the byte is already buffered */
350 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
351 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
352 if (*pos == F_WIN_EOF)
/* slow path: ask the window for the single byte at *pos */
354 buf = f_win_get (spec, *pos, *pos+1, &size);
/*
 * regxCodeDel: dispose of the regxCode referenced by *pp. The visible part
 * drops the reference on its Tcl object (regxCodeMk takes that reference).
 * NOTE(review): the remaining clean-up statements are not shown here.
 */
364 static void regxCodeDel (struct regxCode **pp)
366 struct regxCode *p = *pp;
371 Tcl_DecrRefCount (p->tcl_obj);
/*
 * regxCodeMk: allocate a regxCode holding a private copy of the len bytes
 * at buf, plus a Tcl string object made from the same bytes.
 * NOTE(review): string termination and the store into *pp are on lines not
 * shown in this excerpt.
 */
379 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
383 p = (struct regxCode *) xmalloc (sizeof(*p));
/* len+1 bytes: room for the text plus one extra byte */
384 p->str = (char *) xmalloc (len+1);
385 memcpy (p->str, buf, len);
388 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
/* hold a reference; regxCodeDel releases it */
390 Tcl_IncrRefCount (p->tcl_obj);
/*
 * lexSpecDFA: produce a DFA whose parser character map has been adjusted
 * for spec-file patterns: the entries for space and tab are removed and an
 * entry for '/' with value 0 is added.
 * NOTE(review): presumably this makes '/' end a pattern (readOneSpec warns
 * "expects / at end of pattern"); creation and return of the DFA are on
 * lines not shown.
 */
395 static struct DFA *lexSpecDFA (void)
400 dfa_parse_cmap_del (dfa, ' ');
401 dfa_parse_cmap_del (dfa, '\t');
402 dfa_parse_cmap_add (dfa, '/', 0);
/*
 * actionListDel: walk the action list starting at *rap and release each
 * action's payload: the DFA of a pattern action, the regxCode of a code
 * action.
 * NOTE(review): the dispatch on action type, the assignment of ra1 and the
 * freeing of the list nodes are on lines not shown in this excerpt.
 */
406 static void actionListDel (struct lexRuleAction **rap)
408 struct lexRuleAction *ra1, *ra;
/* ra1 is the loop's successor pointer */
410 for (ra = *rap; ra; ra = ra1)
416 dfa_delete (&ra->u.pattern.dfa);
419 regxCodeDel (&ra->u.code);
/*
 * lexContextCreate: allocate a lexContext, give it a duplicated copy of
 * name and a fresh DFA from lexSpecDFA, and start with empty begin/end/init
 * action lists.
 * NOTE(review): initialisation of other members and the return statement
 * are on lines not shown in this excerpt.
 */
427 static struct lexContext *lexContextCreate (const char *name)
429 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
431 p->name = xstrdup (name);
434 p->dfa = lexSpecDFA ();
437 p->beginActionList = NULL;
438 p->endActionList = NULL;
439 p->initActionList = NULL;
/*
 * lexContextDestroy: release what a lexContext owns: its DFA, the action
 * list of every rule, and the begin/end/init action lists.
 * NOTE(review): the assignment of rp1 and the freeing of the rule nodes and
 * of p itself are on lines not shown in this excerpt.
 */
444 static void lexContextDestroy (struct lexContext *p)
446 struct lexRule *rp, *rp1;
448 dfa_delete (&p->dfa);
/* rp1 is the loop's successor pointer */
450 for (rp = p->rules; rp; rp = rp1)
453 actionListDel (&rp->info.actionList);
456 actionListDel (&p->beginActionList);
457 actionListDel (&p->endActionList);
458 actionListDel (&p->initActionList);
/*
 * lexSpecCreate: allocate a lexSpec, store a private copy of name, and
 * allocate the context stack (100 entries), the per-level concatenation
 * buffers and the data1 node stack (maxLevel entries each).
 * NOTE(review): the assignment of maxLevel, the use of dh and the return
 * statement are on lines not shown in this excerpt.
 */
463 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
468 p = (struct lexSpec *) xmalloc (sizeof(*p));
469 p->name = (char *) xmalloc (strlen(name)+1);
470 strcpy (p->name, name);
/* NOTE(review): fixed capacity; the "begin context" code paths increment
   context_stack_top and no bound check is visible in this excerpt - confirm */
477 p->context_stack_size = 100;
478 p->context_stack = (struct lexContext **)
479 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
483 p->concatBuf = (struct lexConcatBuf *)
484 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
/* every level starts with no buffer; execData allocates on demand */
485 for (i = 0; i < p->maxLevel; i++)
487 p->concatBuf[i].max = 0;
488 p->concatBuf[i].buf = 0;
490 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/*
 * lexSpecDestroy: free a lexSpec and what it owns: the per-level
 * concatenation buffers, every lexContext, the Tcl interpreter, the window
 * buffer and the context stack.
 * NOTE(review): the derivation of p from *pp, the context loop header and
 * the remaining frees are on lines not shown in this excerpt.
 */
495 static void lexSpecDestroy (struct lexSpec **pp)
498 struct lexContext *lt;
506 for (i = 0; i < p->maxLevel; i++)
507 xfree (p->concatBuf[i].buf);
508 xfree (p->concatBuf);
/* remember the successor before the current context is destroyed */
513 struct lexContext *lt_next = lt->next;
514 lexContextDestroy (lt);
519 Tcl_DeleteInterp (p->tcl_interp);
522 xfree (p->f_win_buf);
523 xfree (p->context_stack);
/*
 * readParseToken: scan the next token of a spec line starting at *cpp.
 * Leading blanks, tabs, newlines and carriage returns are skipped. For a
 * keyword, the characters are gathered into cmd with upper-case letters
 * folded to lower case, then matched against "begin", "end", "body",
 * "context" and "init".
 * NOTE(review): the token codes returned, the handling of other token kinds
 * and the updates of *cpp / *len are on lines not shown in this excerpt.
 */
529 static int readParseToken (const char **cpp, int *len)
531 const char *cp = *cpp;
535 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
564 if (*cp >= 'a' && *cp <= 'z')
566 else if (*cp >= 'A' && *cp <= 'Z')
/* fold upper case to lower case */
567 cmd[i] = *cp + 'a' - 'A';
/* guard against overrunning cmd */
570 if (i < (int) sizeof(cmd)-2)
577 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
/* skip forward to the next white space or the end of the string */
579 while (*cp && *cp != ' ' && *cp != '\t' &&
580 *cp != '\n' && *cp != '\r')
586 if (!strcmp (cmd, "begin"))
588 else if (!strcmp (cmd, "end"))
590 else if (!strcmp (cmd, "body"))
592 else if (!strcmp (cmd, "context"))
594 else if (!strcmp (cmd, "init"))
598 logf (LOG_WARN, "bad command %s", cmd);
/*
 * actionListMk: parse the action part of a rule from s and build a list of
 * lexRuleAction nodes through ap. Tokens come from readParseToken; a code
 * token becomes a regxCode action, a pattern token gets its own DFA which
 * is parsed with dfa_parse and finished with dfa_mkstate. BEGIN and INIT
 * are rejected here with a warning.
 * NOTE(review): the token switch, the advancing of ap and s, and the return
 * value are on lines not shown in this excerpt.
 */
604 static int actionListMk (struct lexSpec *spec, const char *s,
605 struct lexRuleAction **ap)
611 while ((tok = readParseToken (&s, &len)))
619 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
621 regxCodeMk (&(*ap)->u.code, s, len);
625 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
627 (*ap)->u.pattern.body = bodyMark;
629 (*ap)->u.pattern.dfa = lexSpecDFA ();
631 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
/* NOTE(review): "%.*s" takes an int precision while s-s0 is a pointer
   difference; if logf is printf-like an (int) cast would be safer - confirm */
636 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
639 dfa_mkstate ((*ap)->u.pattern.dfa);
643 logf (LOG_WARN, "cannot use BEGIN here");
646 logf (LOG_WARN, "cannot use INIT here");
649 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/*
 * readOneSpec: process one logical line of a filter spec.
 *  - CONTEXT <name>: create a new context and link it in front of the
 *    spec's context list.
 *  - BEGIN / END / INIT: replace the corresponding action list of the
 *    current context.
 *  - otherwise: parse a pattern into the current context's DFA, create a
 *    new rule with the next rule number and parse its actions.
 * A "main" context is created by the visible lexContextCreate ("main") call.
 * NOTE(review): the branch conditions between these cases and the return
 * value are on lines not shown in this excerpt.
 */
659 int readOneSpec (struct lexSpec *spec, const char *s)
663 struct lexContext *lc;
665 tok = readParseToken (&s, &len);
666 if (tok == REGX_CONTEXT)
/* NOTE(review): 32-byte buffer; confirm len is limited before the copy
   below (no limit is visible in this excerpt) */
668 char context_name[32];
669 tok = readParseToken (&s, &len);
670 if (tok != REGX_CODE)
672 logf (LOG_WARN, "missing name after CONTEXT keyword");
677 memcpy (context_name, s, len);
678 context_name[len] = '\0';
679 lc = lexContextCreate (context_name);
/* the new context becomes the head of the list */
680 lc->next = spec->context;
685 spec->context = lexContextCreate ("main");
/* an earlier list of the same kind is dropped before the new one is built */
690 actionListDel (&spec->context->beginActionList);
691 actionListMk (spec, s, &spec->context->beginActionList);
694 actionListDel (&spec->context->endActionList);
695 actionListMk (spec, s, &spec->context->endActionList);
698 actionListDel (&spec->context->initActionList);
699 actionListMk (spec, s, &spec->context->initActionList);
703 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
705 r = dfa_parse (spec->context->dfa, &s);
708 logf (LOG_WARN, "regular expression error. r=%d", r);
713 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* new rule: numbered in order of appearance, pushed on the front of the
   context's rule list */
717 rp = (struct lexRule *) xmalloc (sizeof(*rp));
718 rp->info.no = spec->context->ruleNo++;
719 rp->next = spec->context->rules;
720 spec->context->rules = rp;
721 actionListMk (spec, s, &rp->info.actionList);
/*
 * readFileSpec: locate and read the filter specification for spec->name.
 * With a Tcl interpreter present "<name>.tflt" is tried; "<name>.flt" is
 * also tried (both via the data1 table path). Each logical line is
 * collected in a WRBUF and passed to readOneSpec. Afterwards every context
 * gets a fastRule table indexed by rule number and its DFA is finished with
 * dfa_mkstate.
 * NOTE(review): the declaration of fname, the read loop and the
 * error/return paths are on lines not shown in this excerpt.
 */
726 int readFileSpec (struct lexSpec *spec)
728 struct lexContext *lc;
729 int c, i, errors = 0;
735 if (spec->tcl_interp)
/* NOTE(review): unbounded sprintf; fname's size is not visible here -
   confirm it can hold spec->name plus the suffix */
737 sprintf (fname, "%s.tflt", spec->name);
738 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
743 sprintf (fname, "%s.flt", spec->name);
744 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
748 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
751 logf (LOG_LOG, "reading regx filter %s", fname);
753 if (spec->tcl_interp)
754 logf (LOG_LOG, "Tcl enabled");
756 lineBuf = wrbuf_alloc();
761 wrbuf_rewind (lineBuf);
/* lines starting with '#' or white space are consumed up to the newline */
762 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
764 while (c != '\n' && c != EOF)
777 wrbuf_putc(lineBuf, c);
785 if (c != ' ' && c != '\t')
/* terminate the collected text before handing it to the parser */
790 wrbuf_putc(lineBuf, '\0');
791 readOneSpec (spec, wrbuf_buf(lineBuf));
792 spec->lineNo += addLine;
796 wrbuf_free(lineBuf, 1);
801 debug_dfa_followpos = 1;
/* build the rule-number lookup table and finish the DFA of every context */
804 for (lc = spec->context; lc; lc = lc->next)
807 lc->fastRule = (struct lexRuleInfo **)
808 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
809 for (i = 0; i < lc->ruleNo; i++)
810 lc->fastRule[i] = NULL;
811 for (rp = lc->rules; rp; rp = rp->next)
812 lc->fastRule[rp->info.no] = &rp->info;
813 dfa_mkstate (lc->dfa);
/* File-scope lexSpec pointer, initially NULL.
   NOTE(review): its users are not visible in this excerpt. */
822 static struct lexSpec *curLexSpec = NULL;
/*
 * execData: add elen bytes of text (ebuf) to the record under construction.
 * If the node at the current level is already a data node, its length is
 * taken as org_len and the new bytes are appended; otherwise a new text
 * data node is created under the parent and linked in. The bytes are
 * gathered in concatBuf[d1_level], which is enlarged when too small, and
 * the node's length grows by elen.
 * NOTE(review): the branch structure (including what looks like
 * alternative code for storing the data pointer) is on lines not shown in
 * this excerpt.
 */
825 static void execData (struct lexSpec *spec,
826 const char *ebuf, int elen, int formatted_text)
828 struct data1_node *res, *parent;
831 if (elen == 0) /* shouldn't happen, but it does! */
/* debug output: long data shows its first and last 15 bytes */
835 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
836 ebuf, 15, ebuf + elen-15);
838 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
840 logf (LOG_DEBUG, "data (%d bytes)", elen);
843 if (spec->d1_level <= 1)
846 parent = spec->d1_stack[spec->d1_level -1];
/* an existing data node at this level is extended rather than replaced */
849 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
850 org_len = res->u.data.len;
855 res = data1_mk_node (spec->dh, spec->m);
856 res->parent = parent;
857 res->which = DATA1N_data;
858 res->u.data.what = DATA1I_text;
860 res->u.data.formatted_text = formatted_text;
862 if (elen > DATA1_LOCALDATA)
863 res->u.data.data = nmem_malloc (spec->m, elen);
865 res->u.data.data = res->lbuf;
866 memcpy (res->u.data.data, ebuf, elen);
/* data pointer left unset here; tagDataRelease fills it in later */
868 res->u.data.data = 0;
870 res->root = parent->root;
872 parent->last_child = res;
873 if (spec->d1_stack[spec->d1_level])
874 spec->d1_stack[spec->d1_level]->next = res;
877 spec->d1_stack[spec->d1_level] = res;
/* grow the level's buffer with 256 bytes of slack when it cannot hold the
   old plus the new text */
879 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
881 char *old_buf, *new_buf;
883 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
884 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
885 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
887 memcpy (new_buf, old_buf, org_len);
890 spec->concatBuf[spec->d1_level].buf = new_buf;
/* append the new bytes after the text already collected */
892 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
893 res->u.data.len += elen;
/* execDataP: forwards its arguments unchanged to execData.
   NOTE(review): any other statements are on lines not shown here. */
896 static void execDataP (struct lexSpec *spec,
897 const char *ebuf, int elen, int formatted_text)
899 execData (spec, ebuf, elen, formatted_text);
/*
 * tagDataRelease: if the node at the current level is a text data node,
 * give it its final storage: nmem memory when the length exceeds
 * DATA1_LOCALDATA, the node's own lbuf otherwise, and copy the text
 * collected in concatBuf[d1_level] into it.
 * NOTE(review): the last argument of the memcpy is on a line not shown in
 * this excerpt.
 */
902 static void tagDataRelease (struct lexSpec *spec)
906 if ((res = spec->d1_stack[spec->d1_level]) &&
907 res->which == DATA1N_data &&
908 res->u.data.what == DATA1I_text)
/* the node must not have been released before and must carry text */
910 assert (!res->u.data.data);
911 assert (res->u.data.len > 0);
912 if (res->u.data.len > DATA1_LOCALDATA)
913 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
915 res->u.data.data = res->lbuf;
916 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
/*
 * variantBegin: open a variant below the current parent. Class and type
 * names are copied into local buffers (truncated to DATA1_MAX_SYMBOL-1
 * characters) and looked up with data1_getvartypebyct. If the parent is not
 * already a variant node, an empty variant node (type 0, value 0) is
 * inserted first. Then a variant node carrying the looked-up type and the
 * value (truncated to DATA1_LOCALDATA-1 characters, stored in lbuf) is
 * linked in and pushed on the node stack.
 * NOTE(review): some statements and branch conditions are on lines not
 * shown in this excerpt.
 */
921 static void variantBegin (struct lexSpec *spec,
922 const char *class_str, int class_len,
923 const char *type_str, int type_len,
924 const char *value_str, int value_len)
/* NOTE(review): this initializer reads d1_stack[d1_level - 1] before the
   d1_level == 0 check below, i.e. index -1 when no record is open */
926 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
927 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
932 if (spec->d1_level == 0)
934 logf (LOG_WARN, "in variant begin. No record type defined");
/* truncate so the copy plus terminator fits the local buffer */
937 if (class_len >= DATA1_MAX_SYMBOL)
938 class_len = DATA1_MAX_SYMBOL-1;
939 memcpy (tclass, class_str, class_len);
940 tclass[class_len] = '\0';
942 if (type_len >= DATA1_MAX_SYMBOL)
943 type_len = DATA1_MAX_SYMBOL-1;
944 memcpy (ttype, type_str, type_len);
945 ttype[type_len] = '\0';
948 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype,
953 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* first variant under a non-variant parent: insert an empty variant node */
957 if (parent->which != DATA1N_variant)
959 res = data1_mk_node (spec->dh, spec->m);
960 res->parent = parent;
961 res->which = DATA1N_variant;
962 res->u.variant.type = 0;
963 res->u.variant.value = 0;
964 res->root = parent->root;
966 parent->last_child = res;
967 if (spec->d1_stack[spec->d1_level])
/* finalize pending text of the previous sibling before linking after it */
969 tagDataRelease (spec);
970 spec->d1_stack[spec->d1_level]->next = res;
/* push: the new node occupies this level and the next level starts empty */
974 spec->d1_stack[spec->d1_level] = res;
975 spec->d1_stack[++(spec->d1_level)] = NULL;
/* search the enclosing variant nodes for one with the same type */
977 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
978 if (spec->d1_stack[i]->u.variant.type == tp)
985 logf (LOG_DEBUG, "variant node (%d)", spec->d1_level);
987 parent = spec->d1_stack[spec->d1_level-1];
988 res = data1_mk_node (spec->dh, spec->m);
989 res->parent = parent;
990 res->which = DATA1N_variant;
991 res->root = parent->root;
992 res->u.variant.type = tp;
/* the value is kept in the node's local buffer, truncated if necessary */
994 if (value_len >= DATA1_LOCALDATA)
995 value_len =DATA1_LOCALDATA-1;
996 memcpy (res->lbuf, value_str, value_len);
997 res->lbuf[value_len] = '\0';
999 res->u.variant.value = res->lbuf;
1001 parent->last_child = res;
1002 if (spec->d1_stack[spec->d1_level])
1004 tagDataRelease (spec);
1005 spec->d1_stack[spec->d1_level]->next = res;
1008 parent->child = res;
1009 spec->d1_stack[spec->d1_level] = res;
1010 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagStrip: scan for white space at the end and at the start of the tag
 * text (*tag, *len).
 * NOTE(review): the statements that adjust *tag and *len after each scan
 * are on lines not shown in this excerpt.
 */
1013 static void tagStrip (const char **tag, int *len)
/* count back over trailing white space */
1017 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
/* count forward over leading white space */
1020 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/*
 * tagBegin: open an element named by tag (len bytes) below the current
 * parent. The name is passed through tagStrip, stored in the node (lbuf
 * when it fits, nmem memory otherwise), its element definition is looked up
 * with data1_getelementbytagname, and the node is linked into the tree and
 * pushed on the node stack.
 * NOTE(review): some statements and branch conditions are on lines not
 * shown in this excerpt.
 */
1026 static void tagBegin (struct lexSpec *spec,
1027 const char *tag, int len)
/* NOTE(review): both initializers below use d1_stack[d1_level - 1] before
   the d1_level == 0 check, i.e. index -1 when no record is open */
1029 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
1030 data1_element *elem = NULL;
1031 data1_node *partag = get_parent_tag(spec->dh, parent);
1033 data1_element *e = NULL;
1036 if (spec->d1_level == 0)
1038 logf (LOG_WARN, "in element begin. No record type defined");
1041 tagStrip (&tag, &len);
1043 res = data1_mk_node (spec->dh, spec->m);
1044 res->parent = parent;
1045 res->which = DATA1N_tag;
1046 res->u.tag.get_bytes = -1;
/* names too long for the node's local buffer get nmem storage */
1048 if (len >= DATA1_LOCALDATA)
1049 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
1051 res->u.tag.tag = res->lbuf;
1053 memcpy (res->u.tag.tag, tag, len);
1054 res->u.tag.tag[len] = '\0';
1057 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1059 if (parent->which == DATA1N_variant)
1062 if (!(e = partag->u.tag.element))
1065 elem = data1_getelementbytagname (spec->dh,
1066 spec->d1_stack[0]->u.root.absyn,
1068 res->u.tag.element = elem;
1069 res->u.tag.node_selected = 0;
1070 res->u.tag.make_variantlist = 0;
1071 res->u.tag.no_data_requested = 0;
1072 res->root = parent->root;
1074 parent->last_child = res;
1075 if (spec->d1_stack[spec->d1_level])
/* finalize pending text of the previous sibling before linking after it */
1077 tagDataRelease (spec);
1078 spec->d1_stack[spec->d1_level]->next = res;
1081 parent->child = res;
/* push: the new node occupies this level and the next level starts empty */
1082 spec->d1_stack[spec->d1_level] = res;
1083 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagEnd: close elements while the nesting level is above min_level. The
 * tag text is first passed through tagStrip; pending text at each level is
 * finalized with tagDataRelease. The visible comparison checks whether the
 * node at the current level is a tag node whose name has length len and
 * equals tag.
 * NOTE(review): the level decrement, the handling of a NULL tag and the
 * loop exits are on lines not shown in this excerpt.
 */
1086 static void tagEnd (struct lexSpec *spec, int min_level,
1087 const char *tag, int len)
1089 tagStrip (&tag, &len);
1090 while (spec->d1_level > min_level)
1092 tagDataRelease (spec);
1094 if (spec->d1_level == 0)
1096 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1098 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1100 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1104 logf (LOG_DEBUG, "end tag (%d)", spec->d1_level);
/*
 * tryMatch: run a DFA over the input starting at *pptr. Characters are
 * fetched with f_win_advance and the state follows the transition whose
 * character range contains the current character. On a match, *mptr
 * receives the start of the match and *pptr the position just past it.
 * NOTE(review): the last parameter (used as dfa), the loop structure, the
 * end-of-file handling and the return values are on lines not shown in
 * this excerpt.
 */
1109 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1112 struct DFA_state *state = dfa->states[0];
1115 unsigned char c_prev = 0;
1116 int ptr = *pptr; /* current pointer */
1117 int start_ptr = *pptr; /* first char of match */
1118 int last_ptr = 0; /* last char of match */
1119 int last_rule = 0; /* rule number of current match */
1124 c = f_win_advance (spec, &ptr);
1125 if (ptr == F_WIN_EOF)
1142 *mptr = start_ptr; /* match starts here */
1143 *pptr = last_ptr; /* match end here (+1) */
/* restart from the DFA's initial state */
1146 state = dfa->states[0];
/* transition t covers the character range ch[0]..ch[1] */
1151 else if (c >= t->ch[0] && c <= t->ch[1])
1153 state = dfa->states[t->to];
/* remember the rule attached to the state just reached */
1158 last_rule = state->rule_no;
1163 last_rule = state->rule_nno;
/*
 * execTok: fetch the next token of a code string at *src. Leading blanks
 * and tabs are skipped. Token forms visible here: "$<number>" referring to
 * a match argument (its text is obtained through f_win_get), a
 * double-quoted string, a newline or ';', and words delimited by white
 * space. *tokBuf / *tokLen receive the token text and length.
 * NOTE(review): the return codes, the updates of *src and the remaining
 * delimiter tests are on lines not shown in this excerpt.
 */
1175 static int execTok (struct lexSpec *spec, const char **src,
1176 const char **tokBuf, int *tokLen)
1178 const char *s = *src;
1180 while (*s == ' ' || *s == '\t')
/* "$n": reference to match argument n */
1184 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
/* decimal digits are accumulated into n */
1188 while (*s >= '0' && *s <= '9')
1189 n = n*10 + (*s++ -'0');
1190 if (spec->arg_no == 0)
1197 if (n >= spec->arg_no)
1199 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1203 else if (*s == '\"')
/* scan to the closing quote or the end of the string */
1206 while (*s && *s != '\"')
1208 *tokLen = s - *tokBuf;
1213 else if (*s == '\n' || *s == ';')
1221 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1224 *tokLen = s - *tokBuf;
1231 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1234 *tokLen = s - *tokBuf;
/*
 * regxStrz: copy len bytes from src into str.
 * NOTE(review): any length limit, the termination of str and the return
 * value are on lines not shown; callers use the result as a C string.
 */
1240 static char *regxStrz (const char *src, int len, char *str)
1244 memcpy (str, src, len);
/*
 * cmd_tcl_begin: Tcl command handler for "begin". clientData is the
 * lexSpec. Forms handled:
 *   record <absyn>                  - create the root node for a record
 *   element <tag>                   - tagBegin
 *   variant <class> <type> <value>  - variantBegin
 *   context <name>                  - push the named context
 * NOTE(review): the return statements and some checks are on lines not
 * shown in this excerpt.
 */
1250 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1251 int argc, char **argv)
1253 struct lexSpec *spec = (struct lexSpec *) clientData;
1256 if (!strcmp(argv[1], "record") && argc == 3)
1258 char *absynName = argv[2];
1262 logf (LOG_DEBUG, "begin record %s", absynName);
1264 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1265 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1270 res = data1_mk_node (spec->dh, spec->m);
1271 res->which = DATA1N_root;
/* NOTE(review): stores the argv[2] pointer itself in the node; confirm the
   argument string lives as long as the node is used */
1272 res->u.root.type = absynName;
1273 res->u.root.absyn = absyn;
/* push the root node; the next level starts empty */
1276 spec->d1_stack[spec->d1_level] = res;
1277 spec->d1_stack[++(spec->d1_level)] = NULL;
1280 else if (!strcmp(argv[1], "element") && argc == 3)
1282 tagBegin (spec, argv[2], strlen(argv[2]));
1284 else if (!strcmp (argv[1], "variant") && argc == 5)
1286 variantBegin (spec, argv[2], strlen(argv[2]),
1287 argv[3], strlen(argv[3]),
1288 argv[4], strlen(argv[4]));
1290 else if (!strcmp (argv[1], "context") && argc == 3)
1292 struct lexContext *lc = spec->context;
1294 logf (LOG_DEBUG, "begin context %s",argv[2]);
/* search the context list by name */
1296 while (lc && strcmp (argv[2], lc->name))
/* NOTE(review): no check against context_stack_size is visible here */
1300 spec->context_stack[++(spec->context_stack_top)] = lc;
1303 logf (LOG_WARN, "unknown context %s", argv[2]);
/*
 * cmd_tcl_end: Tcl command handler for "end". clientData is the lexSpec.
 * Forms handled:
 *   record             - finalize pending text at every level, set stop_flag
 *   element [-record]  - tagEnd; when the level reaches 0 the record ends
 *                        and stop_flag is set
 *   context            - pop one context unless already at the bottom
 * NOTE(review): the option parsing, the level decrement and the return
 * statements are on lines not shown in this excerpt.
 */
1310 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1311 int argc, char **argv)
1313 struct lexSpec *spec = (struct lexSpec *) clientData;
1317 if (!strcmp (argv[1], "record"))
1319 while (spec->d1_level)
1321 tagDataRelease (spec);
1325 logf (LOG_DEBUG, "end record");
1327 spec->stop_flag = 1;
1329 else if (!strcmp (argv[1], "element"))
1333 if (argc >= 3 && !strcmp(argv[2], "-record"))
1342 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1343 if (spec->d1_level == 0)
1346 logf (LOG_DEBUG, "end element end records");
1348 spec->stop_flag = 1;
1351 else if (!strcmp (argv[1], "context"))
1354 logf (LOG_DEBUG, "end context");
/* the bottom context is never popped */
1356 if (spec->context_stack_top)
1357 (spec->context_stack_top)--;
/*
 * cmd_tcl_data: Tcl command handler for "data". clientData is the lexSpec.
 * Recognizes the options -text and -element <name>; the data argument is
 * passed to execData. The visible code also calls tagBegin with the element
 * name and tagEnd down to level 1.
 * NOTE(review): the argument loop and the conditions around tagBegin/tagEnd
 * are on lines not shown in this excerpt.
 */
1364 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1365 int argc, char **argv)
1369 const char *element = 0;
1370 struct lexSpec *spec = (struct lexSpec *) clientData;
1374 if (!strcmp("-text", argv[argi]))
1379 else if (!strcmp("-element", argv[argi]))
1383 element = argv[argi++];
1389 tagBegin (spec, element, strlen(element));
1393 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1397 tagEnd (spec, 1, NULL, 0);
/*
 * cmd_tcl_unread: Tcl command handler for "unread". clientData is the
 * lexSpec. Moves the read position back to the start of match argument
 * <no>, plus an optional offset given with -offset. An index beyond the
 * last argument is clamped to the last one.
 * NOTE(review): the argument checks and the return statements are on lines
 * not shown in this excerpt.
 */
1401 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1402 int argc, char **argv)
1404 struct lexSpec *spec = (struct lexSpec *) clientData;
1411 if (!strcmp("-offset", argv[argi]))
1416 offset = atoi(argv[argi]);
1425 no = atoi(argv[argi]);
/* clamp to the last available argument */
1426 if (no >= spec->arg_no)
1427 no = spec->arg_no - 1;
1428 spec->ptr = spec->arg_start[no] + offset;
/*
 * execTcl: run a code action through the Tcl interpreter. Each match
 * argument i is first published as the Tcl variable named by its decimal
 * index; then the code is evaluated globally, as an object when
 * HAVE_TCL_OBJECTS is set and as a string otherwise. On failure the error
 * line, the result and errorInfo are logged.
 * NOTE(review): the test of ret and other statements are on lines not shown
 * in this excerpt.
 */
1432 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1436 for (i = 0; i < spec->arg_no; i++)
1438 char var_name[10], *var_buf;
1441 sprintf (var_name, "%d", i);
1442 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* temporarily terminate the argument in place inside the window buffer,
   hand it to Tcl, then restore the overwritten byte.
   NOTE(review): this writes at var_buf[var_len]; confirm that index is
   always inside the window buffer */
1446 ch = var_buf[var_len];
1447 var_buf[var_len] = '\0';
1448 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1449 var_buf[var_len] = ch;
1452 #if HAVE_TCL_OBJECTS
1453 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1455 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1459 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1460 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1461 spec->tcl_interp->errorLine,
1462 spec->tcl_interp->result,
1463 err ? err : "[NO ERRORINFO]");
/*
 * execCode: interpret a code action written in the filter's own command
 * language. Tokens are read with execTok; commands visible here:
 *   begin record|element|variant|context ...
 *   end record|element|context ...
 *   data [-text] [-element <name>] <item> ...
 *   unread [-offset <n>] <index>
 *   context <name>
 * Unknown commands and stray tokens are logged as warnings.
 * NOTE(review): loop headers, many branch conditions and some length checks
 * are on lines not shown in this excerpt.
 */
1469 static void execCode (struct lexSpec *spec, struct regxCode *code)
1471 const char *s = code->str;
1473 const char *cmd_str;
1475 r = execTok (spec, &s, &cmd_str, &cmd_len);
1482 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* p: the command word as a C string */
1485 p = regxStrz (cmd_str, cmd_len, ptmp);
1486 if (!strcmp (p, "begin"))
1488 r = execTok (spec, &s, &cmd_str, &cmd_len);
1491 logf (LOG_WARN, "missing keyword after 'begin'");
1494 p = regxStrz (cmd_str, cmd_len, ptmp);
1495 if (!strcmp (p, "record"))
1497 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* a record can only be started at the outermost level */
1500 if (spec->d1_level == 0)
/* static storage: the root node below keeps a pointer to this name */
1502 static char absynName[64];
1507 memcpy (absynName, cmd_str, cmd_len);
1508 absynName[cmd_len] = '\0';
1511 logf (LOG_DEBUG, "begin record %s", absynName);
1513 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1514 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1519 res = data1_mk_node (spec->dh, spec->m);
1520 res->which = DATA1N_root;
1521 res->u.root.type = absynName;
1522 res->u.root.absyn = absyn;
/* push the root node; the next level starts empty */
1525 spec->d1_stack[spec->d1_level] = res;
1526 spec->d1_stack[++(spec->d1_level)] = NULL;
1529 r = execTok (spec, &s, &cmd_str, &cmd_len);
1531 else if (!strcmp (p, "element"))
1533 r = execTok (spec, &s, &cmd_str, &cmd_len);
1536 tagBegin (spec, cmd_str, cmd_len);
1537 r = execTok (spec, &s, &cmd_str, &cmd_len);
1539 else if (!strcmp (p, "variant"))
1542 const char *class_str = NULL;
1544 const char *type_str = NULL;
1546 const char *value_str = NULL;
/* three tokens follow: class, type and value */
1547 r = execTok (spec, &s, &cmd_str, &cmd_len);
1550 class_str = cmd_str;
1551 class_len = cmd_len;
1552 r = execTok (spec, &s, &cmd_str, &cmd_len);
1558 r = execTok (spec, &s, &cmd_str, &cmd_len);
1561 value_str = cmd_str;
1562 value_len = cmd_len;
1564 variantBegin (spec, class_str, class_len,
1565 type_str, type_len, value_str, value_len);
1568 r = execTok (spec, &s, &cmd_str, &cmd_len);
1570 else if (!strcmp (p, "context"))
1574 struct lexContext *lc = spec->context;
1575 r = execTok (spec, &s, &cmd_str, &cmd_len);
1576 p = regxStrz (cmd_str, cmd_len, ptmp);
1578 logf (LOG_DEBUG, "begin context %s", p);
/* search the context list by name */
1580 while (lc && strcmp (p, lc->name))
/* "begin context" pushes the context on the stack.
   NOTE(review): no check against context_stack_size is visible here */
1583 spec->context_stack[++(spec->context_stack_top)] = lc;
1585 logf (LOG_WARN, "unknown context %s", p);
1588 r = execTok (spec, &s, &cmd_str, &cmd_len);
1592 logf (LOG_WARN, "bad keyword '%s' after begin", p);
1595 else if (!strcmp (p, "end"))
1597 r = execTok (spec, &s, &cmd_str, &cmd_len);
1600 logf (LOG_WARN, "missing keyword after 'end'");
1603 p = regxStrz (cmd_str, cmd_len, ptmp);
1604 if (!strcmp (p, "record"))
/* finalize pending text at every open level */
1606 while (spec->d1_level)
1608 tagDataRelease (spec);
1611 r = execTok (spec, &s, &cmd_str, &cmd_len);
1613 logf (LOG_DEBUG, "end record");
1615 spec->stop_flag = 1;
1617 else if (!strcmp (p, "element"))
1620 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1622 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
/* end the named element ... */
1627 tagEnd (spec, min_level, cmd_str, cmd_len);
1628 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* ... or, without a name, whatever element is open */
1631 tagEnd (spec, min_level, NULL, 0);
1632 if (spec->d1_level == 0)
1635 logf (LOG_DEBUG, "end element end records");
1637 spec->stop_flag = 1;
1641 else if (!strcmp (p, "context"))
1644 logf (LOG_DEBUG, "end context");
/* the bottom context is never popped */
1646 if (spec->context_stack_top)
1647 (spec->context_stack_top)--;
1648 r = execTok (spec, &s, &cmd_str, &cmd_len);
1651 logf (LOG_WARN, "bad keyword '%s' after end", p);
1653 else if (!strcmp (p, "data"))
1657 const char *element_str = NULL;
/* consume the options that precede the data item */
1659 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1661 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1663 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1665 r = execTok (spec, &s, &element_str, &element_len);
1670 logf (LOG_WARN, "bad data option: %.*s",
1675 logf (LOG_WARN, "missing data item after data");
1679 tagBegin (spec, element_str, element_len);
1682 execData (spec, cmd_str, cmd_len,textFlag);
1683 r = execTok (spec, &s, &cmd_str, &cmd_len);
1686 tagEnd (spec, 1, NULL, 0);
1688 else if (!strcmp (p, "unread"))
1691 r = execTok (spec, &s, &cmd_str, &cmd_len);
1692 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1694 r = execTok (spec, &s, &cmd_str, &cmd_len);
1697 logf (LOG_WARN, "missing number after -offset");
1700 p = regxStrz (cmd_str, cmd_len, ptmp);
1702 r = execTok (spec, &s, &cmd_str, &cmd_len);
1708 logf (LOG_WARN, "missing index after unread command");
/* the index must be a single decimal digit */
1711 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1713 logf (LOG_WARN, "bad index after unread command");
1718 no = *cmd_str - '0';
/* clamp to the last available argument */
1719 if (no >= spec->arg_no)
1720 no = spec->arg_no - 1;
1721 spec->ptr = spec->arg_start[no] + offset;
1723 r = execTok (spec, &s, &cmd_str, &cmd_len);
1725 else if (!strcmp (p, "context"))
1729 struct lexContext *lc = spec->context;
1730 r = execTok (spec, &s, &cmd_str, &cmd_len);
1731 p = regxStrz (cmd_str, cmd_len, ptmp);
1733 while (lc && strcmp (p, lc->name))
/* plain "context" replaces the top of the stack instead of pushing */
1736 spec->context_stack[spec->context_stack_top] = lc;
1738 logf (LOG_WARN, "unknown context %s", p);
1741 r = execTok (spec, &s, &cmd_str, &cmd_len);
1745 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1746 r = execTok (spec, &s, &cmd_str, &cmd_len);
1751 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1753 r = execTok (spec, &s, &cmd_str, &cmd_len);
/*
 * execAction: execute the action list ap for a rule that matched at
 * start_ptr; *pptr is the current read position. Pattern actions call
 * tryMatch and record the extent of each piece in arg_start[] / arg_end[]
 * (argument 0 starts at start_ptr); code actions run through execTcl when a
 * Tcl interpreter is present and through execCode otherwise.
 * NOTE(review): the loop over ap, the declarations of arg_start / arg_end /
 * arg_no and the return values are on lines not shown in this excerpt.
 */
1760 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1761 int start_ptr, int *pptr)
1770 arg_start[0] = start_ptr;
/* NOTE(review): if arg_start / arg_end are local arrays (declarations not
   shown), these pointers are only valid for the duration of this call */
1772 spec->arg_start = arg_start;
1773 spec->arg_end = arg_end;
/* body pattern: the text skipped before the match becomes one argument and
   the match itself the next */
1780 if (ap->u.pattern.body)
1782 arg_start[arg_no] = *pptr;
1783 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
/* no match: the body argument runs to end of file */
1785 arg_end[arg_no] = F_WIN_EOF;
1787 arg_start[arg_no] = F_WIN_EOF;
1788 arg_end[arg_no] = F_WIN_EOF;
1793 arg_end[arg_no] = sptr;
1795 arg_start[arg_no] = sptr;
1796 arg_end[arg_no] = *pptr;
/* ordinary pattern: must match right at the current position */
1801 arg_start[arg_no] = *pptr;
1802 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1804 if (sptr != arg_start[arg_no])
1806 arg_end[arg_no] = *pptr;
/* publish the argument count for the code about to run */
1811 spec->arg_no = arg_no;
1814 if (spec->tcl_interp)
1815 execTcl(spec, ap->u.code);
1817 execCode (spec, ap->u.code);
1819 execCode (spec, ap->u.code);
1822 if (spec->stop_flag)
1826 arg_start[arg_no] = *pptr;
1827 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: run the actions attached to rule number `ruleNo` of
 * `context`.
 *
 * Logs the rule number at debug level and delegates to execAction with
 * context->fastRule[ruleNo]->actionList; the value returned is whatever
 * execAction returns.
 */
1836 static int execRule (struct lexSpec *spec, struct lexContext *context,
1837 int ruleNo, int start_ptr, int *pptr)
1840 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1842 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: scan the input starting at *ptr with the DFA of the context
 * that is currently on top of spec->context_stack.
 *
 * Characters are fetched with f_win_advance.  The most recent rule hit
 * is remembered in last_rule/last_ptr; when the DFA has no transition
 * for a character, or the input ends, any unmatched run in
 * [skip_ptr, start_ptr) is passed to execDataP and the remembered rule
 * is run with execRule.
 *
 *   spec - lexer state (context stack, input window, callbacks)
 *   ptr  - in/out input position
 *
 * NOTE(review): the returned data1_node is not used by lexRoot, the
 * only caller visible here; confirm what the return value carries.
 */
1846 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1848 struct lexContext *context = spec->context_stack[spec->context_stack_top];
/* scanning always starts in DFA state 0 of the active context */
1849 struct DFA_state *state = context->dfa->states[0];
/* start as if the previous character had been a newline */
1852 unsigned char c_prev = '\n';
1854 int last_rule = 0; /* rule number of current match */
1855 int last_ptr = *ptr; /* last char of match */
1856 int start_ptr = *ptr; /* first char of match */
1857 int skip_ptr = *ptr; /* first char of run */
/* fetch the next character; *ptr becomes F_WIN_EOF at end of input */
1861 c = f_win_advance (spec, ptr);
1862 if (*ptr == F_WIN_EOF)
1864 /* end of file met */
1867 /* there was a match */
1868 if (skip_ptr < start_ptr)
1870 /* deal with chars that didn't match */
1873 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1874 execDataP (spec, buf, size, 0);
1876 /* restore pointer */
1879 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1881 /* restore skip pointer */
1885 else if (skip_ptr < *ptr)
1887 /* deal with chars that didn't match */
1890 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1891 execDataP (spec, buf, size, 0);
1893 if (*ptr == F_WIN_EOF)
1900 { /* no transition for character c ... */
1903 if (skip_ptr < start_ptr)
1905 /* deal with chars that didn't match */
1908 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1909 execDataP (spec, buf, size, 0);
1911 /* restore pointer */
1913 if (!execRule (spec, context, last_rule, start_ptr, ptr))
/* execRule returned zero: report the current position through the
   optional end callback (spec->f_win_ef) unless the input is exhausted */
1915 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1918 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1920 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* re-read the active context from the top of the context stack; the
   actions that just ran may have changed it */
1924 context = spec->context_stack[spec->context_stack_top];
/* begin a new match attempt at the current position */
1927 last_ptr = start_ptr = *ptr;
/* step start_ptr one character forward, remembering the character that
   was passed over in c_prev */
1931 c_prev = f_win_advance (spec, &start_ptr);
1936 c_prev = f_win_advance (spec, &start_ptr);
/* restart the DFA from its initial state */
1939 state = context->dfa->states[0];
/* a transition covers the inclusive character range ch[0]..ch[1] */
1942 else if (c >= t->ch[0] && c <= t->ch[1])
1943 { /* transition ... */
1944 state = context->dfa->states[t->to];
/* remember the rule attached to the new state; NOTE(review): the
   condition that selects rule_no over rule_nno is not obvious from
   here -- confirm */
1949 last_rule = state->rule_no;
1952 else if (state->rule_nno)
1954 last_rule = state->rule_nno;
/*
 * lexRoot: top-level driver that parses input with the context named
 * `context_name`.
 *
 * Resets the stop flag and the context stack, searches the contexts
 * reachable from spec->context for one whose name equals context_name
 * (a warning is logged when none is found), then runs that context's
 * init and begin actions, scans with lexNode, releases any levels still
 * open with tagDataRelease, and finally runs the end actions.
 *
 *   spec         - lexer state to drive
 *   offset       - NOTE(review): its use is not obvious from the code
 *                  shown here; presumably the starting input position
 *   context_name - name of the context to start in
 *
 * Returns spec->d1_stack[0].
 */
1966 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1967 const char *context_name)
1969 struct lexContext *lt = spec->context;
1972 spec->stop_flag = 0;
1974 spec->context_stack_top = 0;
1977 if (!strcmp (lt->name, context_name))
1983 logf (LOG_WARN, "cannot find context %s", context_name);
/* the chosen context becomes the bottom (and top) of the context stack */
1986 spec->context_stack[spec->context_stack_top] = lt;
1987 spec->d1_stack[spec->d1_level] = NULL;
1992 execAction (spec, lt->initActionList, ptr, &ptr);
1995 execAction (spec, lt->beginActionList, ptr, &ptr);
1996 lexNode (spec, &ptr);
/* unwind every level that is still open after scanning */
1997 while (spec->d1_level)
1999 tagDataRelease (spec);
2002 execAction (spec, lt->endActionList, ptr, &ptr);
2003 return spec->d1_stack[0];
/*
 * grs_destroy: release the filter state passed as `clientData`.
 *
 * clientData is treated as a struct lexSpecs; the lexSpec it holds is
 * destroyed with lexSpecDestroy.
 */
2006 void grs_destroy(void *clientData)
2008 struct lexSpecs *specs = (struct lexSpecs *) clientData;
2011 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocate the struct lexSpecs container used as client data
 * by the readers below (they cast p->clientData to struct lexSpecs).
 *
 * NOTE(review): the readers test specs->spec for NULL before first use,
 * so the spec member must be initialised before this function returns
 * -- confirm.
 */
2016 void *grs_init(void)
2018 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/*
 * grs_read_regx: parse input with the regx filter.
 *
 * The lexSpec cached in p->clientData is reused when its name equals
 * p->type; otherwise it is destroyed, a new one is created with
 * lexSpecCreate and its specification is loaded with readFileSpec.
 * The reader callbacks and handles from `p` are then installed in the
 * spec and parsing starts in the context named "main".
 *
 *   p - read request: type, data1 handle (dh), I/O callbacks (readf,
 *       seekf, endf), file handle (fh), memory handle (mem), offset
 *
 * Returns the result of lexRoot.
 */
2023 data1_node *grs_read_regx (struct grs_read_info *p)
2026 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2027 struct lexSpec **curLexSpec = &specs->spec;
2030 logf (LOG_DEBUG, "grs_read_regx");
/* (re)build the spec when there is none yet or it is for another type */
2032 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2035 lexSpecDestroy (curLexSpec);
2036 *curLexSpec = lexSpecCreate (p->type, p->dh);
2037 res = readFileSpec (*curLexSpec);
/* presumably reached only when readFileSpec reported failure */
2040 lexSpecDestroy (curLexSpec);
2044 (*curLexSpec)->dh = p->dh;
/* start with an empty input window and bind it to the caller's I/O */
2047 (*curLexSpec)->f_win_start = 0;
2048 (*curLexSpec)->f_win_end = 0;
2049 (*curLexSpec)->f_win_rf = p->readf;
2050 (*curLexSpec)->f_win_sf = p->seekf;
2051 (*curLexSpec)->f_win_fh = p->fh;
2052 (*curLexSpec)->f_win_ef = p->endf;
2053 (*curLexSpec)->f_win_size = 500000;
2055 (*curLexSpec)->m = p->mem;
2056 return lexRoot (*curLexSpec, p->offset, "main");
/* File-local descriptor for the regx filter. */
2059 static struct recTypeGrs regx_type = {
/* Exported handle for the regx filter: the address of regx_type
   (mirrors recTypeGrs_tcl, which points at tcl_type). */
2066 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: parse input with the Tcl filter.
 *
 * Works like grs_read_regx, except that a freshly created lexSpec also
 * gets a Tcl interpreter (stored in its tcl_interp member) in which the
 * commands "begin", "end", "data" and "unread" are registered before
 * the specification is loaded with readFileSpec.
 *
 *   p - read request: type, data1 handle (dh), I/O callbacks (readf,
 *       seekf, endf), file handle (fh), memory handle (mem), offset
 *
 * Returns the result of lexRoot, started in the context named "main".
 */
2069 data1_node *grs_read_tcl (struct grs_read_info *p)
2072 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2073 struct lexSpec **curLexSpec = &specs->spec;
2076 logf (LOG_DEBUG, "grs_read_tcl");
/* (re)build the spec when there is none yet or it is for another type */
2078 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2080 Tcl_Interp *tcl_interp;
2082 lexSpecDestroy (curLexSpec);
2083 *curLexSpec = lexSpecCreate (p->type, p->dh);
/* one interpreter per spec; each command gets the spec as client data */
2084 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
2085 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2086 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2087 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
2088 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2090 res = readFileSpec (*curLexSpec);
/* presumably reached only when readFileSpec reported failure */
2093 lexSpecDestroy (curLexSpec);
2097 (*curLexSpec)->dh = p->dh;
/* start with an empty input window and bind it to the caller's I/O */
2100 (*curLexSpec)->f_win_start = 0;
2101 (*curLexSpec)->f_win_end = 0;
2102 (*curLexSpec)->f_win_rf = p->readf;
2103 (*curLexSpec)->f_win_sf = p->seekf;
2104 (*curLexSpec)->f_win_fh = p->fh;
2105 (*curLexSpec)->f_win_ef = p->endf;
2106 (*curLexSpec)->f_win_size = 500000;
2108 (*curLexSpec)->m = p->mem;
2109 return lexRoot (*curLexSpec, p->offset, "main");
/* File-local descriptor for the Tcl filter. */
2112 static struct recTypeGrs tcl_type = {
/* Exported handle for the Tcl filter: the address of tcl_type. */
2119 RecTypeGrs recTypeGrs_tcl = &tcl_type;