2 * Copyright (C) 1994-2001, Index Data
6 * Revision 1.37 2001-05-29 08:51:59 adam
7 * More fixes for character encodings.
9 * Revision 1.36 2001/05/22 21:02:26 adam
10 * Fixes for Tcl UTF8 character handling.
12 * Revision 1.35 2001/03/29 21:31:31 adam
13 * Fixed "record begin" for Tcl filter.
15 * Revision 1.34 2000/11/29 14:24:01 adam
16 * Script configure uses yaz pthreads options. Added locking for
17 * zebra_register_{lock,unlock}.
19 * Revision 1.33 1999/11/30 13:48:04 adam
20 * Improved installation. Updated for inclusion of YAZ header files.
22 * Revision 1.32 1999/09/07 07:19:21 adam
23 * Work on character mapping. Implemented replace rules.
25 * Revision 1.31 1999/07/14 13:05:29 adam
26 * Tcl filter works with objects when TCL is version 8 or later; filter
27 * works with strings otherwise (slow).
29 * Revision 1.30 1999/07/14 10:55:28 adam
32 * Revision 1.29 1999/07/12 07:27:54 adam
33 * Improved speed of Tcl processing. Fixed one memory leak.
35 * Revision 1.28 1999/07/06 12:26:04 adam
36 * Fixed filters so that MS-DOS CR is ignored.
38 * Revision 1.27 1999/06/28 13:25:40 quinn
39 * Improved diagnostics for Tcl
41 * Revision 1.26 1999/05/26 07:49:14 adam
44 * Revision 1.25 1999/05/25 12:33:32 adam
45 * Fixed bug in Tcl filter.
47 * Revision 1.24 1999/05/21 11:08:46 adam
48 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
49 * script so that it reads uninstalled Tcl source.
51 * Revision 1.23 1999/05/20 12:57:18 adam
52 * Implemented TCL filter. Updated recctrl system.
54 * Revision 1.22 1998/11/03 16:07:13 adam
57 * Revision 1.21 1998/11/03 15:43:39 adam
58 * Fixed bug introduced by previous commit.
60 * Revision 1.20 1998/11/03 14:51:28 adam
61 * Changed code so that it creates as few data1 nodes as possible.
63 * Revision 1.19 1998/11/03 10:22:39 adam
64 * Fixed memory leak that could occur when large data1 nodes were
65 * concatenated. Data-type data1_nodes may have multiple nodes.
67 * Revision 1.18 1998/10/15 13:11:47 adam
68 * Added support for option -record for "end element". When specified
69 * end element will mark end-of-record when at outer-level.
71 * Revision 1.17 1998/07/01 10:13:51 adam
74 * Revision 1.16 1998/06/30 15:15:09 adam
75 * Tags are trimmed: white space removed before and after the tag.
77 * Revision 1.15 1998/06/30 12:55:45 adam
80 * Revision 1.14 1998/03/05 08:41:00 adam
81 * Implemented rule contexts.
83 * Revision 1.13 1997/12/12 06:33:58 adam
84 * Fixed bug that showed up when multiple filters were used.
85 * Made one routine thread-safe.
87 * Revision 1.12 1997/11/18 10:03:24 adam
88 * Member num_children removed from data1_node.
90 * Revision 1.11 1997/11/06 11:41:01 adam
91 * Implemented "begin variant" for the sgml.regx filter.
93 * Revision 1.10 1997/10/31 12:36:12 adam
94 * Minor change that avoids compiler warning.
96 * Revision 1.9 1997/09/29 09:02:49 adam
97 * Fixed small bug (introduced by previous commit).
99 * Revision 1.8 1997/09/17 12:19:22 adam
100 * Zebra version corresponds to YAZ version 1.4.
101 * Changed Zebra server so that it doesn't depend on global common_resource.
103 * Revision 1.7 1997/07/15 16:33:07 adam
104 * Check for zero length in execData.
106 * Revision 1.6 1997/02/24 10:41:51 adam
107 * Cleanup of code and commented out the "end element-end-record" code.
109 * Revision 1.5 1997/02/19 16:22:33 adam
110 * Fixed "end element" to terminate record in outer-most level.
112 * Revision 1.4 1997/02/12 20:42:58 adam
113 * Changed some log messages.
115 * Revision 1.3 1996/11/08 14:05:33 adam
116 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
118 * Revision 1.2 1996/10/29 14:02:09 adam
119 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
120 * data1_get_tabpath is used.
122 * Revision 1.1 1996/10/11 10:57:30 adam
123 * New module recctrl. Used to manage records (extract/retrieval).
125 * Revision 1.24 1996/06/17 14:25:31 adam
126 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
128 * Revision 1.23 1996/06/04 10:19:00 adam
129 * Minor changes - removed include of ctype.h.
131 * Revision 1.22 1996/06/03 15:23:13 adam
132 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
134 * Revision 1.21 1996/05/14 16:58:38 adam
137 * Revision 1.20 1996/05/01 13:46:36 adam
138 * First work on multiple records in one file.
139 * New option, -offset, to the "unread" command in the filter module.
141 * Revision 1.19 1996/02/12 16:18:20 adam
142 * Yet another bug fix in implementation of unread command.
144 * Revision 1.18 1996/02/12 16:07:54 adam
145 * Bug fix in new unread command.
147 * Revision 1.17 1996/02/12 15:56:11 adam
148 * New code command: unread.
150 * Revision 1.16 1996/01/17 14:57:51 adam
151 * Prototype changed for reader functions in extract/retrieve. File
152 * is identified by 'void *' instead of 'int'.
154 * Revision 1.15 1996/01/08 19:15:47 adam
155 * New input filter that works!
157 * Revision 1.14 1996/01/08 09:10:38 adam
158 * Yet another complete rework on this module.
160 * Revision 1.13 1995/12/15 17:21:50 adam
161 * This version is able to set data.formatted_text in data1-nodes.
163 * Revision 1.12 1995/12/15 16:20:10 adam
164 * The filter files (*.flt) are read from the path given by data1_tabpath.
166 * Revision 1.11 1995/12/15 12:35:16 adam
169 * Revision 1.10 1995/12/15 10:35:36 adam
172 * Revision 1.9 1995/12/14 16:38:48 adam
173 * Completely new attempt at regular expression parsing.
175 * Revision 1.8 1995/12/13 17:16:59 adam
178 * Revision 1.7 1995/12/13 16:51:58 adam
179 * Modified to set last_child in data1_nodes.
180 * Uses destroy handler to free up data text nodes.
182 * Revision 1.6 1995/12/13 13:45:37 quinn
183 * Changed data1 to use nmem.
185 * Revision 1.5 1995/12/11 09:12:52 adam
186 * The rec_get function returns NULL if the record doesn't exist - this will
187 * happen in the server if the result set records have been deleted since
188 * the creation of the set (i.e. the search).
189 * The server saves a result temporarily if it is 'volatile', i.e. the
190 * set is register dependent.
192 * Revision 1.4 1995/12/05 16:57:40 adam
193 * More work on regular patterns.
195 * Revision 1.3 1995/12/05 09:37:09 adam
196 * One malloc was renamed to xmalloc.
198 * Revision 1.2 1995/12/04 17:59:24 adam
199 * More work on regular expression conversion.
201 * Revision 1.1 1995/12/04 14:25:30 adam
202 * Started work on regular expression parsed input to structured records.
210 #include <yaz/tpath.h>
211 #include <zebrautl.h>
218 #if MAJOR_VERSION >= 8
219 #define HAVE_TCL_OBJECTS
225 #define F_WIN_EOF 2000000000
229 #define REGX_PATTERN 1
234 #define REGX_CONTEXT 6
244 struct lexRuleAction {
248 struct DFA *dfa; /* REGX_PATTERN */
251 struct regxCode *code; /* REGX_CODE */
253 struct lexRuleAction *next;
258 struct lexRuleAction *actionList;
262 struct lexRuleInfo info;
263 struct lexRule *next;
269 struct lexRule *rules;
270 struct lexRuleInfo **fastRule;
274 struct lexRuleAction *beginActionList;
275 struct lexRuleAction *endActionList;
276 struct lexRuleAction *initActionList;
277 struct lexContext *next;
280 struct lexConcatBuf {
287 struct lexContext *context;
289 struct lexContext **context_stack;
290 int context_stack_size;
291 int context_stack_top;
297 Tcl_Interp *tcl_interp;
300 void (*f_win_ef)(void *, off_t);
302 int f_win_start; /* first byte of buffer is this file offset */
303 int f_win_end; /* last byte of buffer is this offset - 1 */
304 int f_win_size; /* size of buffer */
305 char *f_win_buf; /* buffer itself */
306 int (*f_win_rf)(void *, char *, size_t);
307 off_t (*f_win_sf)(void *, off_t);
309 struct lexConcatBuf *concatBuf;
311 data1_node **d1_stack;
322 struct lexSpec *spec;
325 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
328 int i, r, off = start_pos - spec->f_win_start;
330 if (off >= 0 && end_pos <= spec->f_win_end)
332 *size = end_pos - start_pos;
333 return spec->f_win_buf + off;
335 if (off < 0 || start_pos >= spec->f_win_end)
337 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
338 spec->f_win_start = start_pos;
340 if (!spec->f_win_buf)
341 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
342 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
344 spec->f_win_end = spec->f_win_start + *size;
346 if (*size > end_pos - start_pos)
347 *size = end_pos - start_pos;
348 return spec->f_win_buf;
350 for (i = 0; i<spec->f_win_end - start_pos; i++)
351 spec->f_win_buf[i] = spec->f_win_buf[i + off];
352 r = (*spec->f_win_rf)(spec->f_win_fh,
354 spec->f_win_size - i);
355 spec->f_win_start = start_pos;
356 spec->f_win_end += r;
358 if (*size > end_pos - start_pos)
359 *size = end_pos - start_pos;
360 return spec->f_win_buf;
363 static int f_win_advance (struct lexSpec *spec, int *pos)
368 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
369 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
370 if (*pos == F_WIN_EOF)
372 buf = f_win_get (spec, *pos, *pos+1, &size);
382 static void regxCodeDel (struct regxCode **pp)
384 struct regxCode *p = *pp;
389 Tcl_DecrRefCount (p->tcl_obj);
397 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
401 p = (struct regxCode *) xmalloc (sizeof(*p));
402 p->str = (char *) xmalloc (len+1);
403 memcpy (p->str, buf, len);
406 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
408 Tcl_IncrRefCount (p->tcl_obj);
413 static struct DFA *lexSpecDFA (void)
418 dfa_parse_cmap_del (dfa, ' ');
419 dfa_parse_cmap_del (dfa, '\t');
420 dfa_parse_cmap_add (dfa, '/', 0);
424 static void actionListDel (struct lexRuleAction **rap)
426 struct lexRuleAction *ra1, *ra;
428 for (ra = *rap; ra; ra = ra1)
434 dfa_delete (&ra->u.pattern.dfa);
437 regxCodeDel (&ra->u.code);
445 static struct lexContext *lexContextCreate (const char *name)
447 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
449 p->name = xstrdup (name);
452 p->dfa = lexSpecDFA ();
455 p->beginActionList = NULL;
456 p->endActionList = NULL;
457 p->initActionList = NULL;
462 static void lexContextDestroy (struct lexContext *p)
464 struct lexRule *rp, *rp1;
466 dfa_delete (&p->dfa);
468 for (rp = p->rules; rp; rp = rp1)
471 actionListDel (&rp->info.actionList);
474 actionListDel (&p->beginActionList);
475 actionListDel (&p->endActionList);
476 actionListDel (&p->initActionList);
481 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
486 p = (struct lexSpec *) xmalloc (sizeof(*p));
487 p->name = (char *) xmalloc (strlen(name)+1);
488 strcpy (p->name, name);
495 p->context_stack_size = 100;
496 p->context_stack = (struct lexContext **)
497 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
501 p->concatBuf = (struct lexConcatBuf *)
502 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
503 for (i = 0; i < p->maxLevel; i++)
505 p->concatBuf[i].max = 0;
506 p->concatBuf[i].buf = 0;
508 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
513 static void lexSpecDestroy (struct lexSpec **pp)
516 struct lexContext *lt;
524 for (i = 0; i < p->maxLevel; i++)
525 xfree (p->concatBuf[i].buf);
526 xfree (p->concatBuf);
531 struct lexContext *lt_next = lt->next;
532 lexContextDestroy (lt);
537 Tcl_DeleteInterp (p->tcl_interp);
540 xfree (p->f_win_buf);
541 xfree (p->context_stack);
547 static int readParseToken (const char **cpp, int *len)
549 const char *cp = *cpp;
553 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
582 if (*cp >= 'a' && *cp <= 'z')
584 else if (*cp >= 'A' && *cp <= 'Z')
585 cmd[i] = *cp + 'a' - 'A';
588 if (i < (int) sizeof(cmd)-2)
595 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
597 while (*cp && *cp != ' ' && *cp != '\t' &&
598 *cp != '\n' && *cp != '\r')
604 if (!strcmp (cmd, "begin"))
606 else if (!strcmp (cmd, "end"))
608 else if (!strcmp (cmd, "body"))
610 else if (!strcmp (cmd, "context"))
612 else if (!strcmp (cmd, "init"))
616 logf (LOG_WARN, "bad command %s", cmd);
622 static int actionListMk (struct lexSpec *spec, const char *s,
623 struct lexRuleAction **ap)
629 while ((tok = readParseToken (&s, &len)))
637 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
639 regxCodeMk (&(*ap)->u.code, s, len);
643 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
645 (*ap)->u.pattern.body = bodyMark;
647 (*ap)->u.pattern.dfa = lexSpecDFA ();
649 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
654 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
657 dfa_mkstate ((*ap)->u.pattern.dfa);
661 logf (LOG_WARN, "cannot use BEGIN here");
664 logf (LOG_WARN, "cannot use INIT here");
667 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
677 int readOneSpec (struct lexSpec *spec, const char *s)
681 struct lexContext *lc;
683 tok = readParseToken (&s, &len);
684 if (tok == REGX_CONTEXT)
686 char context_name[32];
687 tok = readParseToken (&s, &len);
688 if (tok != REGX_CODE)
690 logf (LOG_WARN, "missing name after CONTEXT keyword");
695 memcpy (context_name, s, len);
696 context_name[len] = '\0';
697 lc = lexContextCreate (context_name);
698 lc->next = spec->context;
703 spec->context = lexContextCreate ("main");
708 actionListDel (&spec->context->beginActionList);
709 actionListMk (spec, s, &spec->context->beginActionList);
712 actionListDel (&spec->context->endActionList);
713 actionListMk (spec, s, &spec->context->endActionList);
716 actionListDel (&spec->context->initActionList);
717 actionListMk (spec, s, &spec->context->initActionList);
721 logf (LOG_LOG, "rule %d %s", spec->context->ruleNo, s);
723 r = dfa_parse (spec->context->dfa, &s);
726 logf (LOG_WARN, "regular expression error. r=%d", r);
731 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
735 rp = (struct lexRule *) xmalloc (sizeof(*rp));
736 rp->info.no = spec->context->ruleNo++;
737 rp->next = spec->context->rules;
738 spec->context->rules = rp;
739 actionListMk (spec, s, &rp->info.actionList);
744 int readFileSpec (struct lexSpec *spec)
746 struct lexContext *lc;
747 int c, i, errors = 0;
753 if (spec->tcl_interp)
755 sprintf (fname, "%s.tflt", spec->name);
756 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
761 sprintf (fname, "%s.flt", spec->name);
762 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
766 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
769 logf (LOG_LOG, "reading regx filter %s", fname);
771 if (spec->tcl_interp)
772 logf (LOG_LOG, "Tcl enabled");
774 lineBuf = wrbuf_alloc();
779 wrbuf_rewind (lineBuf);
780 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
782 while (c != '\n' && c != EOF)
795 wrbuf_putc(lineBuf, c);
803 if (c != ' ' && c != '\t')
808 wrbuf_putc(lineBuf, '\0');
809 readOneSpec (spec, wrbuf_buf(lineBuf));
810 spec->lineNo += addLine;
814 wrbuf_free(lineBuf, 1);
819 debug_dfa_followpos = 1;
822 for (lc = spec->context; lc; lc = lc->next)
825 lc->fastRule = (struct lexRuleInfo **)
826 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
827 for (i = 0; i < lc->ruleNo; i++)
828 lc->fastRule[i] = NULL;
829 for (rp = lc->rules; rp; rp = rp->next)
830 lc->fastRule[rp->info.no] = &rp->info;
831 dfa_mkstate (lc->dfa);
840 static struct lexSpec *curLexSpec = NULL;
843 static void execData (struct lexSpec *spec,
844 const char *ebuf, int elen, int formatted_text)
846 struct data1_node *res, *parent;
849 if (elen == 0) /* shouldn't happen, but it does! */
853 logf (LOG_LOG, "data (%d bytes) %.15s ... %.*s", elen,
854 ebuf, 15, ebuf + elen-15);
856 logf (LOG_LOG, "data (%d bytes) %.*s", elen, elen, ebuf);
858 logf (LOG_LOG, "data (%d bytes)", elen);
861 if (spec->d1_level <= 1)
864 parent = spec->d1_stack[spec->d1_level -1];
867 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
868 org_len = res->u.data.len;
873 res = data1_mk_node (spec->dh, spec->m);
874 res->parent = parent;
875 res->which = DATA1N_data;
876 res->u.data.what = DATA1I_text;
878 res->u.data.formatted_text = formatted_text;
880 if (elen > DATA1_LOCALDATA)
881 res->u.data.data = nmem_malloc (spec->m, elen);
883 res->u.data.data = res->lbuf;
884 memcpy (res->u.data.data, ebuf, elen);
886 res->u.data.data = 0;
888 res->root = parent->root;
890 parent->last_child = res;
891 if (spec->d1_stack[spec->d1_level])
892 spec->d1_stack[spec->d1_level]->next = res;
895 spec->d1_stack[spec->d1_level] = res;
897 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
899 char *old_buf, *new_buf;
901 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
902 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
903 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
905 memcpy (new_buf, old_buf, org_len);
908 spec->concatBuf[spec->d1_level].buf = new_buf;
910 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
911 res->u.data.len += elen;
914 static void execDataP (struct lexSpec *spec,
915 const char *ebuf, int elen, int formatted_text)
917 execData (spec, ebuf, elen, formatted_text);
920 static void tagDataRelease (struct lexSpec *spec)
924 if ((res = spec->d1_stack[spec->d1_level]) &&
925 res->which == DATA1N_data &&
926 res->u.data.what == DATA1I_text)
928 assert (!res->u.data.data);
929 assert (res->u.data.len > 0);
930 if (res->u.data.len > DATA1_LOCALDATA)
931 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
933 res->u.data.data = res->lbuf;
934 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
939 static void variantBegin (struct lexSpec *spec,
940 const char *class_str, int class_len,
941 const char *type_str, int type_len,
942 const char *value_str, int value_len)
944 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
945 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
950 if (spec->d1_level == 0)
952 logf (LOG_WARN, "in variant begin. No record type defined");
955 if (class_len >= DATA1_MAX_SYMBOL)
956 class_len = DATA1_MAX_SYMBOL-1;
957 memcpy (tclass, class_str, class_len);
958 tclass[class_len] = '\0';
960 if (type_len >= DATA1_MAX_SYMBOL)
961 type_len = DATA1_MAX_SYMBOL-1;
962 memcpy (ttype, type_str, type_len);
963 ttype[type_len] = '\0';
966 logf (LOG_LOG, "variant begin %s %s (%d)", tclass, ttype,
971 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
975 if (parent->which != DATA1N_variant)
977 res = data1_mk_node (spec->dh, spec->m);
978 res->parent = parent;
979 res->which = DATA1N_variant;
980 res->u.variant.type = 0;
981 res->u.variant.value = 0;
982 res->root = parent->root;
984 parent->last_child = res;
985 if (spec->d1_stack[spec->d1_level])
987 tagDataRelease (spec);
988 spec->d1_stack[spec->d1_level]->next = res;
992 spec->d1_stack[spec->d1_level] = res;
993 spec->d1_stack[++(spec->d1_level)] = NULL;
995 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
996 if (spec->d1_stack[i]->u.variant.type == tp)
1003 logf (LOG_LOG, "variant node (%d)", spec->d1_level);
1005 parent = spec->d1_stack[spec->d1_level-1];
1006 res = data1_mk_node (spec->dh, spec->m);
1007 res->parent = parent;
1008 res->which = DATA1N_variant;
1009 res->root = parent->root;
1010 res->u.variant.type = tp;
1012 if (value_len >= DATA1_LOCALDATA)
1013 value_len =DATA1_LOCALDATA-1;
1014 memcpy (res->lbuf, value_str, value_len);
1015 res->lbuf[value_len] = '\0';
1017 res->u.variant.value = res->lbuf;
1019 parent->last_child = res;
1020 if (spec->d1_stack[spec->d1_level])
1022 tagDataRelease (spec);
1023 spec->d1_stack[spec->d1_level]->next = res;
1026 parent->child = res;
1027 spec->d1_stack[spec->d1_level] = res;
1028 spec->d1_stack[++(spec->d1_level)] = NULL;
1031 static void tagStrip (const char **tag, int *len)
1035 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
1038 for (i = 0; i < *len && isspace((*tag)[i]); i++)
1044 static void tagBegin (struct lexSpec *spec,
1045 const char *tag, int len)
1047 struct data1_node *parent;
1048 data1_element *elem = NULL;
1051 data1_element *e = NULL;
1054 if (spec->d1_level == 0)
1056 logf (LOG_WARN, "in element begin. No record type defined");
1059 tagStrip (&tag, &len);
1061 parent = spec->d1_stack[spec->d1_level -1];
1062 partag = get_parent_tag(spec->dh, parent);
1064 res = data1_mk_node_type (spec->dh, spec->m, DATA1N_tag);
1065 res->parent = parent;
1067 if (len >= DATA1_LOCALDATA)
1068 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
1070 res->u.tag.tag = res->lbuf;
1072 memcpy (res->u.tag.tag, tag, len);
1073 res->u.tag.tag[len] = '\0';
1076 logf (LOG_LOG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1078 if (parent->which == DATA1N_variant)
1081 if (!(e = partag->u.tag.element))
1084 elem = data1_getelementbytagname (spec->dh,
1085 spec->d1_stack[0]->u.root.absyn,
1087 res->u.tag.element = elem;
1088 res->root = parent->root;
1090 parent->last_child = res;
1091 if (spec->d1_stack[spec->d1_level])
1093 tagDataRelease (spec);
1094 spec->d1_stack[spec->d1_level]->next = res;
1097 parent->child = res;
1098 spec->d1_stack[spec->d1_level] = res;
1099 spec->d1_stack[++(spec->d1_level)] = NULL;
1102 static void tagEnd (struct lexSpec *spec, int min_level,
1103 const char *tag, int len)
1105 tagStrip (&tag, &len);
1106 while (spec->d1_level > min_level)
1108 tagDataRelease (spec);
1110 if (spec->d1_level == 0)
1112 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1114 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1116 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1120 logf (LOG_LOG, "end tag (%d)", spec->d1_level);
1125 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1128 struct DFA_state *state = dfa->states[0];
1131 unsigned char c_prev = 0;
1132 int ptr = *pptr; /* current pointer */
1133 int start_ptr = *pptr; /* first char of match */
1134 int last_ptr = 0; /* last char of match */
1135 int last_rule = 0; /* rule number of current match */
1140 c = f_win_advance (spec, &ptr);
1141 if (ptr == F_WIN_EOF)
1158 *mptr = start_ptr; /* match starts here */
1159 *pptr = last_ptr; /* match end here (+1) */
1162 state = dfa->states[0];
1167 else if (c >= t->ch[0] && c <= t->ch[1])
1169 state = dfa->states[t->to];
1174 last_rule = state->rule_no;
1179 last_rule = state->rule_nno;
1191 static int execTok (struct lexSpec *spec, const char **src,
1192 const char **tokBuf, int *tokLen)
1194 const char *s = *src;
1196 while (*s == ' ' || *s == '\t')
1200 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1204 while (*s >= '0' && *s <= '9')
1205 n = n*10 + (*s++ -'0');
1206 if (spec->arg_no == 0)
1213 if (n >= spec->arg_no)
1215 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1219 else if (*s == '\"')
1222 while (*s && *s != '\"')
1224 *tokLen = s - *tokBuf;
1229 else if (*s == '\n' || *s == ';')
1237 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1240 *tokLen = s - *tokBuf;
1247 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1250 *tokLen = s - *tokBuf;
1256 static char *regxStrz (const char *src, int len, char *str)
1260 memcpy (str, src, len);
1266 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1267 int argc, char **argv)
1269 struct lexSpec *spec = (struct lexSpec *) clientData;
1272 if (!strcmp(argv[1], "record") && argc == 3)
1274 char *absynName = argv[2];
1278 logf (LOG_LOG, "begin record %s", absynName);
1280 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1281 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1286 res = data1_mk_node (spec->dh, spec->m);
1287 res->which = DATA1N_root;
1289 data1_insert_string(spec->dh, res, spec->m, absynName);
1290 res->u.root.absyn = absyn;
1293 spec->d1_stack[spec->d1_level] = res;
1294 spec->d1_stack[++(spec->d1_level)] = NULL;
1297 else if (!strcmp(argv[1], "element") && argc == 3)
1299 tagBegin (spec, argv[2], strlen(argv[2]));
1301 else if (!strcmp (argv[1], "variant") && argc == 5)
1303 variantBegin (spec, argv[2], strlen(argv[2]),
1304 argv[3], strlen(argv[3]),
1305 argv[4], strlen(argv[4]));
1307 else if (!strcmp (argv[1], "context") && argc == 3)
1309 struct lexContext *lc = spec->context;
1311 logf (LOG_LOG, "begin context %s",argv[2]);
1313 while (lc && strcmp (argv[2], lc->name))
1317 spec->context_stack[++(spec->context_stack_top)] = lc;
1320 logf (LOG_WARN, "unknown context %s", argv[2]);
1327 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1328 int argc, char **argv)
1330 struct lexSpec *spec = (struct lexSpec *) clientData;
1334 if (!strcmp (argv[1], "record"))
1336 while (spec->d1_level)
1338 tagDataRelease (spec);
1342 logf (LOG_LOG, "end record");
1344 spec->stop_flag = 1;
1346 else if (!strcmp (argv[1], "element"))
1350 if (argc >= 3 && !strcmp(argv[2], "-record"))
1359 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1360 if (spec->d1_level == 0)
1363 logf (LOG_LOG, "end element end records");
1365 spec->stop_flag = 1;
1368 else if (!strcmp (argv[1], "context"))
1371 logf (LOG_LOG, "end context");
1373 if (spec->context_stack_top)
1374 (spec->context_stack_top)--;
1381 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1382 int argc, char **argv)
1386 const char *element = 0;
1387 struct lexSpec *spec = (struct lexSpec *) clientData;
1391 if (!strcmp("-text", argv[argi]))
1396 else if (!strcmp("-element", argv[argi]))
1400 element = argv[argi++];
1406 tagBegin (spec, element, strlen(element));
1410 #if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0)
1412 char *native = Tcl_UtfToExternalDString(0, argv[argi], -1, &ds);
1413 execData (spec, native, strlen(native), textFlag);
1414 Tcl_DStringFree (&ds);
1416 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1421 tagEnd (spec, 1, NULL, 0);
1425 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1426 int argc, char **argv)
1428 struct lexSpec *spec = (struct lexSpec *) clientData;
1435 if (!strcmp("-offset", argv[argi]))
1440 offset = atoi(argv[argi]);
1449 no = atoi(argv[argi]);
1450 if (no >= spec->arg_no)
1451 no = spec->arg_no - 1;
1452 spec->ptr = spec->arg_start[no] + offset;
1456 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1460 for (i = 0; i < spec->arg_no; i++)
1462 char var_name[10], *var_buf;
1465 sprintf (var_name, "%d", i);
1466 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
1470 ch = var_buf[var_len];
1471 var_buf[var_len] = '\0';
1472 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1473 var_buf[var_len] = ch;
1476 #if HAVE_TCL_OBJECTS
1477 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1479 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1483 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1484 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1485 spec->tcl_interp->errorLine,
1486 spec->tcl_interp->result,
1487 err ? err : "[NO ERRORINFO]");
1493 static void execCode (struct lexSpec *spec, struct regxCode *code)
1495 const char *s = code->str;
1497 const char *cmd_str;
1499 r = execTok (spec, &s, &cmd_str, &cmd_len);
1506 r = execTok (spec, &s, &cmd_str, &cmd_len);
1509 p = regxStrz (cmd_str, cmd_len, ptmp);
1510 if (!strcmp (p, "begin"))
1512 r = execTok (spec, &s, &cmd_str, &cmd_len);
1515 logf (LOG_WARN, "missing keyword after 'begin'");
1518 p = regxStrz (cmd_str, cmd_len, ptmp);
1519 if (!strcmp (p, "record"))
1521 r = execTok (spec, &s, &cmd_str, &cmd_len);
1524 if (spec->d1_level == 0)
1526 static char absynName[64];
1531 memcpy (absynName, cmd_str, cmd_len);
1532 absynName[cmd_len] = '\0';
1535 logf (LOG_LOG, "begin record %s", absynName);
1537 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1538 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1543 res = data1_mk_node (spec->dh, spec->m);
1544 res->which = DATA1N_root;
1545 res->u.root.type = absynName;
1546 res->u.root.absyn = absyn;
1549 spec->d1_stack[spec->d1_level] = res;
1550 spec->d1_stack[++(spec->d1_level)] = NULL;
1553 r = execTok (spec, &s, &cmd_str, &cmd_len);
1555 else if (!strcmp (p, "element"))
1557 r = execTok (spec, &s, &cmd_str, &cmd_len);
1560 tagBegin (spec, cmd_str, cmd_len);
1561 r = execTok (spec, &s, &cmd_str, &cmd_len);
1563 else if (!strcmp (p, "variant"))
1566 const char *class_str = NULL;
1568 const char *type_str = NULL;
1570 const char *value_str = NULL;
1571 r = execTok (spec, &s, &cmd_str, &cmd_len);
1574 class_str = cmd_str;
1575 class_len = cmd_len;
1576 r = execTok (spec, &s, &cmd_str, &cmd_len);
1582 r = execTok (spec, &s, &cmd_str, &cmd_len);
1585 value_str = cmd_str;
1586 value_len = cmd_len;
1588 variantBegin (spec, class_str, class_len,
1589 type_str, type_len, value_str, value_len);
1592 r = execTok (spec, &s, &cmd_str, &cmd_len);
1594 else if (!strcmp (p, "context"))
1598 struct lexContext *lc = spec->context;
1599 r = execTok (spec, &s, &cmd_str, &cmd_len);
1600 p = regxStrz (cmd_str, cmd_len, ptmp);
1602 logf (LOG_LOG, "begin context %s", p);
1604 while (lc && strcmp (p, lc->name))
1607 spec->context_stack[++(spec->context_stack_top)] = lc;
1609 logf (LOG_WARN, "unknown context %s", p);
1612 r = execTok (spec, &s, &cmd_str, &cmd_len);
1616 logf (LOG_WARN, "bad keyword '%s' after begin", p);
1619 else if (!strcmp (p, "end"))
1621 r = execTok (spec, &s, &cmd_str, &cmd_len);
1624 logf (LOG_WARN, "missing keyword after 'end'");
1627 p = regxStrz (cmd_str, cmd_len, ptmp);
1628 if (!strcmp (p, "record"))
1630 while (spec->d1_level)
1632 tagDataRelease (spec);
1635 r = execTok (spec, &s, &cmd_str, &cmd_len);
1637 logf (LOG_LOG, "end record");
1639 spec->stop_flag = 1;
1641 else if (!strcmp (p, "element"))
1644 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1646 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1651 tagEnd (spec, min_level, cmd_str, cmd_len);
1652 r = execTok (spec, &s, &cmd_str, &cmd_len);
1655 tagEnd (spec, min_level, NULL, 0);
1656 if (spec->d1_level == 0)
1659 logf (LOG_LOG, "end element end records");
1661 spec->stop_flag = 1;
1665 else if (!strcmp (p, "context"))
1668 logf (LOG_LOG, "end context");
1670 if (spec->context_stack_top)
1671 (spec->context_stack_top)--;
1672 r = execTok (spec, &s, &cmd_str, &cmd_len);
1675 logf (LOG_WARN, "bad keyword '%s' after end", p);
1677 else if (!strcmp (p, "data"))
1681 const char *element_str = NULL;
1683 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1685 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1687 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1689 r = execTok (spec, &s, &element_str, &element_len);
1694 logf (LOG_WARN, "bad data option: %.*s",
1699 logf (LOG_WARN, "missing data item after data");
1703 tagBegin (spec, element_str, element_len);
1706 execData (spec, cmd_str, cmd_len,textFlag);
1707 r = execTok (spec, &s, &cmd_str, &cmd_len);
1710 tagEnd (spec, 1, NULL, 0);
1712 else if (!strcmp (p, "unread"))
1715 r = execTok (spec, &s, &cmd_str, &cmd_len);
1716 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1718 r = execTok (spec, &s, &cmd_str, &cmd_len);
1721 logf (LOG_WARN, "missing number after -offset");
1724 p = regxStrz (cmd_str, cmd_len, ptmp);
1726 r = execTok (spec, &s, &cmd_str, &cmd_len);
1732 logf (LOG_WARN, "missing index after unread command");
1735 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1737 logf (LOG_WARN, "bad index after unread command");
1742 no = *cmd_str - '0';
1743 if (no >= spec->arg_no)
1744 no = spec->arg_no - 1;
1745 spec->ptr = spec->arg_start[no] + offset;
1747 r = execTok (spec, &s, &cmd_str, &cmd_len);
1749 else if (!strcmp (p, "context"))
1753 struct lexContext *lc = spec->context;
1754 r = execTok (spec, &s, &cmd_str, &cmd_len);
1755 p = regxStrz (cmd_str, cmd_len, ptmp);
1757 while (lc && strcmp (p, lc->name))
1760 spec->context_stack[spec->context_stack_top] = lc;
1762 logf (LOG_WARN, "unknown context %s", p);
1765 r = execTok (spec, &s, &cmd_str, &cmd_len);
1769 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1770 r = execTok (spec, &s, &cmd_str, &cmd_len);
1775 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1777 r = execTok (spec, &s, &cmd_str, &cmd_len);
/*
 * execAction: run the action list `ap` for a match that begins at
 * start_ptr.  *pptr is the current input position and is handed to
 * tryMatch for pattern actions.  Offsets of the matched pieces are
 * collected in arg_start[]/arg_end[] and published on spec, together
 * with the argument count, before any code action is executed.
 * NOTE(review): braces, declarations, the loop/switch framing and the
 * return statements are not visible in this excerpt, so the return
 * value convention is not described here.
 */
1784 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1785 int start_ptr, int *pptr)
/* argument 0 begins where the triggering match began */
1794 arg_start[0] = start_ptr;
/* expose the offset tables through spec for the code actions */
1796 spec->arg_start = arg_start;
1797 spec->arg_end = arg_end;
/* pattern action flagged as "body" */
1804 if (ap->u.pattern.body)
1806 arg_start[arg_no] = *pptr;
1807 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
/* no match: the affected arguments are marked with F_WIN_EOF */
1809 arg_end[arg_no] = F_WIN_EOF;
1811 arg_start[arg_no] = F_WIN_EOF;
1812 arg_end[arg_no] = F_WIN_EOF;
/* match: one argument ends at sptr, another spans sptr .. *pptr
   (presumably arg_no is advanced in between -- confirm) */
1817 arg_end[arg_no] = sptr;
1819 arg_start[arg_no] = sptr;
1820 arg_end[arg_no] = *pptr;
/* pattern action without the body flag */
1825 arg_start[arg_no] = *pptr;
1826 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
/* the match is compared against the position recorded above */
1828 if (sptr != arg_start[arg_no])
1830 arg_end[arg_no] = *pptr;
/* code action: make the argument count available, then run the code */
1835 spec->arg_no = arg_no;
/* use the Tcl interpreter when one is attached to spec */
1838 if (spec->tcl_interp)
1839 execTcl(spec, ap->u.code);
/* NOTE(review): the two execCode calls presumably sit in different
   branches (else / preprocessor) -- the framing is not visible */
1841 execCode (spec, ap->u.code);
1843 execCode (spec, ap->u.code);
/* stop_flag is checked after the code has run */
1846 if (spec->stop_flag)
/* argument running from the current position, with F_WIN_EOF as end */
1850 arg_start[arg_no] = *pptr;
1851 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: execute rule number ruleNo of `context` by passing the
 * action list of context->fastRule[ruleNo] to execAction and returning
 * its result.  start_ptr and pptr are the match start and the current
 * input position.
 * NOTE(review): the remaining arguments of the execAction call are on a
 * line that is not visible in this excerpt.
 */
1860 static int execRule (struct lexSpec *spec, struct lexContext *context,
1861 int ruleNo, int start_ptr, int *pptr)
/* trace which rule is being run */
1864 logf (LOG_LOG, "exec rule %d", ruleNo);
1866 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: scan the input starting at *ptr using the DFA of the context
 * on top of spec's context stack.  Characters are fetched with
 * f_win_advance.  When a rule has matched, input that was skipped before
 * the match is handed to execDataP and the rule is run through execRule.
 * NOTE(review): loop framing, several conditions and all return
 * statements are not visible in this excerpt; the returned node is
 * therefore not described here.
 */
1870 data1_node *lexNode (struct lexSpec *spec, int *ptr)
/* start in the initial state of the current context's DFA */
1872 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1873 struct DFA_state *state = context->dfa->states[0];
/* previous character starts out as a newline -- presumably so that the
   first character is treated as beginning a line; confirm */
1876 unsigned char c_prev = '\n';
1878 int last_rule = 0; /* rule number of current match */
1879 int last_ptr = *ptr; /* last char of match */
1880 int start_ptr = *ptr; /* first char of match */
1881 int skip_ptr = *ptr; /* first char of run */
/* fetch the next input character; *ptr is updated by f_win_advance */
1885 c = f_win_advance (spec, ptr);
1886 if (*ptr == F_WIN_EOF)
1888 /* end of file met */
1891 /* there was a match */
1892 if (skip_ptr < start_ptr)
1894 /* deal with chars that didn't match */
1897 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1898 execDataP (spec, buf, size, 0);
1900 /* restore pointer */
1903 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1905 /* restore skip pointer */
/* no pending match, but unmatched input remains before *ptr */
1909 else if (skip_ptr < *ptr)
1911 /* deal with chars that didn't match */
1914 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1915 execDataP (spec, buf, size, 0);
1917 if (*ptr == F_WIN_EOF)
1924 { /* no transition for character c ... */
1927 if (skip_ptr < start_ptr)
1929 /* deal with chars that didn't match */
1932 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1933 execDataP (spec, buf, size, 0);
1935 /* restore pointer */
1937 if (!execRule (spec, context, last_rule, start_ptr, ptr))
/* when the f_win_ef callback is installed and input remains, pass it
   the current position */
1939 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1942 logf (LOG_LOG, "regx: endf ptr=%d", *ptr);
1944 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* re-read the context: the stack top is fetched again after the rule ran */
1948 context = spec->context_stack[spec->context_stack_top];
/* begin a new match attempt at the current position */
1951 last_ptr = start_ptr = *ptr;
/* advance the match start by one character, remembering it as c_prev */
1955 c_prev = f_win_advance (spec, &start_ptr);
1960 c_prev = f_win_advance (spec, &start_ptr);
/* back to the initial DFA state */
1963 state = context->dfa->states[0];
/* c lies inside the character range of transition t */
1966 else if (c >= t->ch[0] && c <= t->ch[1])
1967 { /* transition ... */
1968 state = context->dfa->states[t->to];
/* remember the rule attached to the new state; rule_nno is used when the
   preceding condition (not visible here) does not hold */
1973 last_rule = state->rule_no;
1976 else if (state->rule_nno)
1978 last_rule = state->rule_nno;
/*
 * lexRoot: process one record with the context called context_name.
 * The context list starting at spec->context is searched by name; a
 * warning is logged when the name cannot be found.  The init and begin
 * action lists are executed, lexNode scans the input, any elements still
 * open are released with tagDataRelease, the end action list is executed
 * and spec->d1_stack[0] is returned.
 * NOTE(review): how `offset` seeds the local position `ptr`, and the
 * conditions guarding the individual action lists, are on lines that are
 * not visible in this excerpt.
 */
1990 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1991 const char *context_name)
1993 struct lexContext *lt = spec->context;
/* reset per-record state */
1996 spec->stop_flag = 0;
1998 spec->context_stack_top = 0;
/* look for the context with the requested name */
2001 if (!strcmp (lt->name, context_name))
2007 logf (LOG_WARN, "cannot find context %s", context_name);
/* make the chosen context the current one and clear the node slot */
2010 spec->context_stack[spec->context_stack_top] = lt;
2011 spec->d1_stack[spec->d1_level] = NULL;
2016 execAction (spec, lt->initActionList, ptr, &ptr);
2019 execAction (spec, lt->beginActionList, ptr, &ptr);
2020 lexNode (spec, &ptr);
/* release data for every level that is still open */
2021 while (spec->d1_level)
2023 tagDataRelease (spec);
2026 execAction (spec, lt->endActionList, ptr, &ptr);
2027 return spec->d1_stack[0];
/*
 * grs_destroy: tear down the filter state passed as clientData, which is
 * treated as a struct lexSpecs, by destroying the lexSpec it holds.
 * NOTE(review): any guard around the call and the release of the
 * container itself are not visible in this excerpt.
 */
2030 void grs_destroy(void *clientData)
2032 struct lexSpecs *specs = (struct lexSpecs *) clientData;
2035 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocate a struct lexSpecs with xmalloc.
 * NOTE(review): initialisation of the members and the return statement
 * are not visible in this excerpt; presumably the new object is what is
 * later passed back as clientData -- confirm.
 */
2040 void *grs_init(void)
2042 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/*
 * grs_read_regx: read one record with the regx filter.  The lexSpec kept
 * in p->clientData is reused as long as its name equals p->type;
 * otherwise it is destroyed, created again and loaded with readFileSpec.
 * The input window is then bound to the reader callbacks in p and the
 * record is parsed with lexRoot from p->offset in context "main".
 * NOTE(review): the check on `res` and the corresponding early return
 * are not visible in this excerpt.
 */
2047 data1_node *grs_read_regx (struct grs_read_info *p)
2050 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2051 struct lexSpec **curLexSpec = &specs->spec;
2054 logf (LOG_LOG, "grs_read_regx");
/* no cached spec, or the cached one is for a different type */
2056 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2059 lexSpecDestroy (curLexSpec);
2060 *curLexSpec = lexSpecCreate (p->type, p->dh);
2061 res = readFileSpec (*curLexSpec);
/* discard the spec again (presumably when readFileSpec failed) */
2064 lexSpecDestroy (curLexSpec);
2068 (*curLexSpec)->dh = p->dh;
/* start with an empty window and hook up the reader's callbacks */
2071 (*curLexSpec)->f_win_start = 0;
2072 (*curLexSpec)->f_win_end = 0;
2073 (*curLexSpec)->f_win_rf = p->readf;
2074 (*curLexSpec)->f_win_sf = p->seekf;
2075 (*curLexSpec)->f_win_fh = p->fh;
2076 (*curLexSpec)->f_win_ef = p->endf;
2077 (*curLexSpec)->f_win_size = 500000;
2079 (*curLexSpec)->m = p->mem;
2080 return lexRoot (*curLexSpec, p->offset, "main");
/*
 * Record-type descriptor for the regx reader and the exported handle
 * that points at it.
 * NOTE(review): the initializer members of regx_type are not visible in
 * this excerpt.
 */
2083 static struct recTypeGrs regx_type = {
2090 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: read one record with the Tcl variant of the filter.
 * Works like grs_read_regx, except that a newly created lexSpec also
 * gets a Tcl interpreter in which the commands "begin", "end", "data"
 * and "unread" are registered with the lexSpec as their client data.
 * The record is parsed with lexRoot from p->offset in context "main".
 * NOTE(review): the check on `res` and the corresponding early return
 * are not visible in this excerpt.
 */
2093 data1_node *grs_read_tcl (struct grs_read_info *p)
2096 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2097 struct lexSpec **curLexSpec = &specs->spec;
2100 logf (LOG_LOG, "grs_read_tcl");
/* no cached spec, or the cached one is for a different type */
2102 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2104 Tcl_Interp *tcl_interp;
2106 lexSpecDestroy (curLexSpec);
2107 *curLexSpec = lexSpecCreate (p->type, p->dh);
/* set up the interpreter and keep it on the spec */
2108 Tcl_FindExecutable("");
2109 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
/* NOTE(review): the result of Tcl_Init is discarded here */
2110 Tcl_Init(tcl_interp);
/* filter commands callable from the Tcl code */
2111 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2112 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2113 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
2114 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2116 res = readFileSpec (*curLexSpec);
/* discard the spec again (presumably when readFileSpec failed) */
2119 lexSpecDestroy (curLexSpec);
2123 (*curLexSpec)->dh = p->dh;
/* start with an empty window and hook up the reader's callbacks */
2126 (*curLexSpec)->f_win_start = 0;
2127 (*curLexSpec)->f_win_end = 0;
2128 (*curLexSpec)->f_win_rf = p->readf;
2129 (*curLexSpec)->f_win_sf = p->seekf;
2130 (*curLexSpec)->f_win_fh = p->fh;
2131 (*curLexSpec)->f_win_ef = p->endf;
2132 (*curLexSpec)->f_win_size = 500000;
2134 (*curLexSpec)->m = p->mem;
2135 return lexRoot (*curLexSpec, p->offset, "main");
/*
 * Record-type descriptor for the Tcl reader and the exported handle
 * that points at it.
 * NOTE(review): the initializer members of tcl_type are not visible in
 * this excerpt.
 */
2138 static struct recTypeGrs tcl_type = {
2145 RecTypeGrs recTypeGrs_tcl = &tcl_type;