2 * Copyright (C) 1994-2001, Index Data
6 * Revision 1.38 2002-04-04 20:50:37 adam
7 * Multi register works with record paths and data1 profile path
9 * Revision 1.37 2001/05/29 08:51:59 adam
10 * More fixes for character encodings.
12 * Revision 1.36 2001/05/22 21:02:26 adam
13 * Fixes for Tcl UTF8 character handling.
15 * Revision 1.35 2001/03/29 21:31:31 adam
16 * Fixed "record begin" for Tcl filter.
18 * Revision 1.34 2000/11/29 14:24:01 adam
19 * Script configure uses yaz pthreads options. Added locking for
20 * zebra_register_{lock,unlock}.
22 * Revision 1.33 1999/11/30 13:48:04 adam
23 * Improved installation. Updated for inclusion of YAZ header files.
25 * Revision 1.32 1999/09/07 07:19:21 adam
26 * Work on character mapping. Implemented replace rules.
28 * Revision 1.31 1999/07/14 13:05:29 adam
29 * Tcl filter works with objects when TCL is version 8 or later; filter
30 * works with strings otherwise (slow).
32 * Revision 1.30 1999/07/14 10:55:28 adam
35 * Revision 1.29 1999/07/12 07:27:54 adam
36 * Improved speed of Tcl processing. Fixed one memory leak.
38 * Revision 1.28 1999/07/06 12:26:04 adam
39 * Fixed filters so that MS-DOS CR is ignored.
41 * Revision 1.27 1999/06/28 13:25:40 quinn
42 * Improved diagnostics for Tcl
44 * Revision 1.26 1999/05/26 07:49:14 adam
47 * Revision 1.25 1999/05/25 12:33:32 adam
48 * Fixed bug in Tcl filter.
50 * Revision 1.24 1999/05/21 11:08:46 adam
51 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
52 * script so that it reads uninstalled Tcl source.
54 * Revision 1.23 1999/05/20 12:57:18 adam
55 * Implemented TCL filter. Updated recctrl system.
57 * Revision 1.22 1998/11/03 16:07:13 adam
60 * Revision 1.21 1998/11/03 15:43:39 adam
61 * Fixed bug introduced by previous commit.
63 * Revision 1.20 1998/11/03 14:51:28 adam
64 * Changed code so that it creates as few data1 nodes as possible.
66 * Revision 1.19 1998/11/03 10:22:39 adam
67 * Fixed memory leak that could occur when large data1 nodes were
68 * concatenated. Data-type data1_nodes may have multiple nodes.
70 * Revision 1.18 1998/10/15 13:11:47 adam
71 * Added support for option -record for "end element". When specified
72 * end element will mark end-of-record when at outer-level.
74 * Revision 1.17 1998/07/01 10:13:51 adam
77 * Revision 1.16 1998/06/30 15:15:09 adam
78 * Tags are trimmed: white space is removed before and after the tag.
80 * Revision 1.15 1998/06/30 12:55:45 adam
83 * Revision 1.14 1998/03/05 08:41:00 adam
84 * Implemented rule contexts.
86 * Revision 1.13 1997/12/12 06:33:58 adam
87 * Fixed bug that showed up when multiple filters were used.
88 * Made one routine thread-safe.
90 * Revision 1.12 1997/11/18 10:03:24 adam
91 * Member num_children removed from data1_node.
93 * Revision 1.11 1997/11/06 11:41:01 adam
94 * Implemented "begin variant" for the sgml.regx filter.
96 * Revision 1.10 1997/10/31 12:36:12 adam
97 * Minor change that avoids compiler warning.
99 * Revision 1.9 1997/09/29 09:02:49 adam
100 * Fixed small bug (introduced by previous commit).
102 * Revision 1.8 1997/09/17 12:19:22 adam
103 * Zebra version corresponds to YAZ version 1.4.
104 * Changed Zebra server so that it doesn't depend on global common_resource.
106 * Revision 1.7 1997/07/15 16:33:07 adam
107 * Check for zero length in execData.
109 * Revision 1.6 1997/02/24 10:41:51 adam
110 * Cleanup of code and commented out the "end element-end-record" code.
112 * Revision 1.5 1997/02/19 16:22:33 adam
113 * Fixed "end element" to terminate record in outer-most level.
115 * Revision 1.4 1997/02/12 20:42:58 adam
116 * Changed some log messages.
118 * Revision 1.3 1996/11/08 14:05:33 adam
119 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
121 * Revision 1.2 1996/10/29 14:02:09 adam
122 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
123 * data1_get_tabpath is used.
125 * Revision 1.1 1996/10/11 10:57:30 adam
126 * New module recctrl. Used to manage records (extract/retrieval).
128 * Revision 1.24 1996/06/17 14:25:31 adam
129 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
131 * Revision 1.23 1996/06/04 10:19:00 adam
132 * Minor changes - removed include of ctype.h.
134 * Revision 1.22 1996/06/03 15:23:13 adam
135 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
137 * Revision 1.21 1996/05/14 16:58:38 adam
140 * Revision 1.20 1996/05/01 13:46:36 adam
141 * First work on multiple records in one file.
142 * New option, -offset, to the "unread" command in the filter module.
144 * Revision 1.19 1996/02/12 16:18:20 adam
145 * Yet another bug fix in implementation of unread command.
147 * Revision 1.18 1996/02/12 16:07:54 adam
148 * Bug fix in new unread command.
150 * Revision 1.17 1996/02/12 15:56:11 adam
151 * New code command: unread.
153 * Revision 1.16 1996/01/17 14:57:51 adam
154 * Prototype changed for reader functions in extract/retrieve. File
155 * is identified by 'void *' instead of 'int'.
157 * Revision 1.15 1996/01/08 19:15:47 adam
158 * New input filter that works!
160 * Revision 1.14 1996/01/08 09:10:38 adam
161 * Yet another complete rework on this module.
163 * Revision 1.13 1995/12/15 17:21:50 adam
164 * This version is able to set data.formatted_text in data1-nodes.
166 * Revision 1.12 1995/12/15 16:20:10 adam
167 * The filter files (*.flt) are read from the path given by data1_tabpath.
169 * Revision 1.11 1995/12/15 12:35:16 adam
172 * Revision 1.10 1995/12/15 10:35:36 adam
175 * Revision 1.9 1995/12/14 16:38:48 adam
176 * Completely new attempt to make regular expression parsing.
178 * Revision 1.8 1995/12/13 17:16:59 adam
181 * Revision 1.7 1995/12/13 16:51:58 adam
182 * Modified to set last_child in data1_nodes.
183 * Uses destroy handler to free up data text nodes.
185 * Revision 1.6 1995/12/13 13:45:37 quinn
186 * Changed data1 to use nmem.
188 * Revision 1.5 1995/12/11 09:12:52 adam
189 * The rec_get function returns NULL if record doesn't exist - will
190 * happen in the server if the result set records have been deleted since
191 * the creation of the set (i.e. the search).
192 * The server saves a result temporarily if it is 'volatile', i.e. the
193 * set is register dependent.
195 * Revision 1.4 1995/12/05 16:57:40 adam
196 * More work on regular patterns.
198 * Revision 1.3 1995/12/05 09:37:09 adam
199 * One malloc was renamed to xmalloc.
201 * Revision 1.2 1995/12/04 17:59:24 adam
202 * More work on regular expression conversion.
204 * Revision 1.1 1995/12/04 14:25:30 adam
205 * Started work on regular expression parsed input to structured records.
213 #include <yaz/tpath.h>
214 #include <zebrautl.h>
221 #if MAJOR_VERSION >= 8
222 #define HAVE_TCL_OBJECTS
228 #define F_WIN_EOF 2000000000
232 #define REGX_PATTERN 1
237 #define REGX_CONTEXT 6
247 struct lexRuleAction {
251 struct DFA *dfa; /* REGX_PATTERN */
254 struct regxCode *code; /* REGX_CODE */
256 struct lexRuleAction *next;
261 struct lexRuleAction *actionList;
265 struct lexRuleInfo info;
266 struct lexRule *next;
272 struct lexRule *rules;
273 struct lexRuleInfo **fastRule;
277 struct lexRuleAction *beginActionList;
278 struct lexRuleAction *endActionList;
279 struct lexRuleAction *initActionList;
280 struct lexContext *next;
283 struct lexConcatBuf {
290 struct lexContext *context;
292 struct lexContext **context_stack;
293 int context_stack_size;
294 int context_stack_top;
300 Tcl_Interp *tcl_interp;
303 void (*f_win_ef)(void *, off_t);
305 int f_win_start; /* first byte of buffer is this file offset */
306 int f_win_end; /* last byte of buffer is this offset - 1 */
307 int f_win_size; /* size of buffer */
308 char *f_win_buf; /* buffer itself */
309 int (*f_win_rf)(void *, char *, size_t);
310 off_t (*f_win_sf)(void *, off_t);
312 struct lexConcatBuf *concatBuf;
314 data1_node **d1_stack;
325 struct lexSpec *spec;
328 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
331 int i, r, off = start_pos - spec->f_win_start;
333 if (off >= 0 && end_pos <= spec->f_win_end)
335 *size = end_pos - start_pos;
336 return spec->f_win_buf + off;
338 if (off < 0 || start_pos >= spec->f_win_end)
340 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
341 spec->f_win_start = start_pos;
343 if (!spec->f_win_buf)
344 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
345 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
347 spec->f_win_end = spec->f_win_start + *size;
349 if (*size > end_pos - start_pos)
350 *size = end_pos - start_pos;
351 return spec->f_win_buf;
353 for (i = 0; i<spec->f_win_end - start_pos; i++)
354 spec->f_win_buf[i] = spec->f_win_buf[i + off];
355 r = (*spec->f_win_rf)(spec->f_win_fh,
357 spec->f_win_size - i);
358 spec->f_win_start = start_pos;
359 spec->f_win_end += r;
361 if (*size > end_pos - start_pos)
362 *size = end_pos - start_pos;
363 return spec->f_win_buf;
366 static int f_win_advance (struct lexSpec *spec, int *pos)
371 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
372 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
373 if (*pos == F_WIN_EOF)
375 buf = f_win_get (spec, *pos, *pos+1, &size);
385 static void regxCodeDel (struct regxCode **pp)
387 struct regxCode *p = *pp;
392 Tcl_DecrRefCount (p->tcl_obj);
400 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
404 p = (struct regxCode *) xmalloc (sizeof(*p));
405 p->str = (char *) xmalloc (len+1);
406 memcpy (p->str, buf, len);
409 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
411 Tcl_IncrRefCount (p->tcl_obj);
416 static struct DFA *lexSpecDFA (void)
421 dfa_parse_cmap_del (dfa, ' ');
422 dfa_parse_cmap_del (dfa, '\t');
423 dfa_parse_cmap_add (dfa, '/', 0);
427 static void actionListDel (struct lexRuleAction **rap)
429 struct lexRuleAction *ra1, *ra;
431 for (ra = *rap; ra; ra = ra1)
437 dfa_delete (&ra->u.pattern.dfa);
440 regxCodeDel (&ra->u.code);
448 static struct lexContext *lexContextCreate (const char *name)
450 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
452 p->name = xstrdup (name);
455 p->dfa = lexSpecDFA ();
458 p->beginActionList = NULL;
459 p->endActionList = NULL;
460 p->initActionList = NULL;
465 static void lexContextDestroy (struct lexContext *p)
467 struct lexRule *rp, *rp1;
469 dfa_delete (&p->dfa);
471 for (rp = p->rules; rp; rp = rp1)
474 actionListDel (&rp->info.actionList);
477 actionListDel (&p->beginActionList);
478 actionListDel (&p->endActionList);
479 actionListDel (&p->initActionList);
484 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
489 p = (struct lexSpec *) xmalloc (sizeof(*p));
490 p->name = (char *) xmalloc (strlen(name)+1);
491 strcpy (p->name, name);
498 p->context_stack_size = 100;
499 p->context_stack = (struct lexContext **)
500 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
504 p->concatBuf = (struct lexConcatBuf *)
505 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
506 for (i = 0; i < p->maxLevel; i++)
508 p->concatBuf[i].max = 0;
509 p->concatBuf[i].buf = 0;
511 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
516 static void lexSpecDestroy (struct lexSpec **pp)
519 struct lexContext *lt;
527 for (i = 0; i < p->maxLevel; i++)
528 xfree (p->concatBuf[i].buf);
529 xfree (p->concatBuf);
534 struct lexContext *lt_next = lt->next;
535 lexContextDestroy (lt);
540 Tcl_DeleteInterp (p->tcl_interp);
543 xfree (p->f_win_buf);
544 xfree (p->context_stack);
550 static int readParseToken (const char **cpp, int *len)
552 const char *cp = *cpp;
556 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
585 if (*cp >= 'a' && *cp <= 'z')
587 else if (*cp >= 'A' && *cp <= 'Z')
588 cmd[i] = *cp + 'a' - 'A';
591 if (i < (int) sizeof(cmd)-2)
598 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
600 while (*cp && *cp != ' ' && *cp != '\t' &&
601 *cp != '\n' && *cp != '\r')
607 if (!strcmp (cmd, "begin"))
609 else if (!strcmp (cmd, "end"))
611 else if (!strcmp (cmd, "body"))
613 else if (!strcmp (cmd, "context"))
615 else if (!strcmp (cmd, "init"))
619 logf (LOG_WARN, "bad command %s", cmd);
625 static int actionListMk (struct lexSpec *spec, const char *s,
626 struct lexRuleAction **ap)
632 while ((tok = readParseToken (&s, &len)))
640 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
642 regxCodeMk (&(*ap)->u.code, s, len);
646 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
648 (*ap)->u.pattern.body = bodyMark;
650 (*ap)->u.pattern.dfa = lexSpecDFA ();
652 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
657 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
660 dfa_mkstate ((*ap)->u.pattern.dfa);
664 logf (LOG_WARN, "cannot use BEGIN here");
667 logf (LOG_WARN, "cannot use INIT here");
670 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
680 int readOneSpec (struct lexSpec *spec, const char *s)
684 struct lexContext *lc;
686 tok = readParseToken (&s, &len);
687 if (tok == REGX_CONTEXT)
689 char context_name[32];
690 tok = readParseToken (&s, &len);
691 if (tok != REGX_CODE)
693 logf (LOG_WARN, "missing name after CONTEXT keyword");
698 memcpy (context_name, s, len);
699 context_name[len] = '\0';
700 lc = lexContextCreate (context_name);
701 lc->next = spec->context;
706 spec->context = lexContextCreate ("main");
711 actionListDel (&spec->context->beginActionList);
712 actionListMk (spec, s, &spec->context->beginActionList);
715 actionListDel (&spec->context->endActionList);
716 actionListMk (spec, s, &spec->context->endActionList);
719 actionListDel (&spec->context->initActionList);
720 actionListMk (spec, s, &spec->context->initActionList);
724 logf (LOG_LOG, "rule %d %s", spec->context->ruleNo, s);
726 r = dfa_parse (spec->context->dfa, &s);
729 logf (LOG_WARN, "regular expression error. r=%d", r);
734 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
738 rp = (struct lexRule *) xmalloc (sizeof(*rp));
739 rp->info.no = spec->context->ruleNo++;
740 rp->next = spec->context->rules;
741 spec->context->rules = rp;
742 actionListMk (spec, s, &rp->info.actionList);
747 int readFileSpec (struct lexSpec *spec)
749 struct lexContext *lc;
750 int c, i, errors = 0;
756 if (spec->tcl_interp)
758 sprintf (fname, "%s.tflt", spec->name);
759 spec_inf = data1_path_fopen (spec->dh, fname, "r");
764 sprintf (fname, "%s.flt", spec->name);
765 spec_inf = data1_path_fopen (spec->dh, fname, "r");
769 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
772 logf (LOG_LOG, "reading regx filter %s", fname);
774 if (spec->tcl_interp)
775 logf (LOG_LOG, "Tcl enabled");
777 lineBuf = wrbuf_alloc();
782 wrbuf_rewind (lineBuf);
783 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
785 while (c != '\n' && c != EOF)
798 wrbuf_putc(lineBuf, c);
806 if (c != ' ' && c != '\t')
811 wrbuf_putc(lineBuf, '\0');
812 readOneSpec (spec, wrbuf_buf(lineBuf));
813 spec->lineNo += addLine;
817 wrbuf_free(lineBuf, 1);
822 debug_dfa_followpos = 1;
825 for (lc = spec->context; lc; lc = lc->next)
828 lc->fastRule = (struct lexRuleInfo **)
829 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
830 for (i = 0; i < lc->ruleNo; i++)
831 lc->fastRule[i] = NULL;
832 for (rp = lc->rules; rp; rp = rp->next)
833 lc->fastRule[rp->info.no] = &rp->info;
834 dfa_mkstate (lc->dfa);
843 static struct lexSpec *curLexSpec = NULL;
846 static void execData (struct lexSpec *spec,
847 const char *ebuf, int elen, int formatted_text)
849 struct data1_node *res, *parent;
852 if (elen == 0) /* shouldn't happen, but it does! */
856 logf (LOG_LOG, "data (%d bytes) %.15s ... %.*s", elen,
857 ebuf, 15, ebuf + elen-15);
859 logf (LOG_LOG, "data (%d bytes) %.*s", elen, elen, ebuf);
861 logf (LOG_LOG, "data (%d bytes)", elen);
864 if (spec->d1_level <= 1)
867 parent = spec->d1_stack[spec->d1_level -1];
870 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
871 org_len = res->u.data.len;
876 res = data1_mk_node (spec->dh, spec->m);
877 res->parent = parent;
878 res->which = DATA1N_data;
879 res->u.data.what = DATA1I_text;
881 res->u.data.formatted_text = formatted_text;
883 if (elen > DATA1_LOCALDATA)
884 res->u.data.data = nmem_malloc (spec->m, elen);
886 res->u.data.data = res->lbuf;
887 memcpy (res->u.data.data, ebuf, elen);
889 res->u.data.data = 0;
891 res->root = parent->root;
893 parent->last_child = res;
894 if (spec->d1_stack[spec->d1_level])
895 spec->d1_stack[spec->d1_level]->next = res;
898 spec->d1_stack[spec->d1_level] = res;
900 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
902 char *old_buf, *new_buf;
904 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
905 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
906 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
908 memcpy (new_buf, old_buf, org_len);
911 spec->concatBuf[spec->d1_level].buf = new_buf;
913 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
914 res->u.data.len += elen;
917 static void execDataP (struct lexSpec *spec,
918 const char *ebuf, int elen, int formatted_text)
920 execData (spec, ebuf, elen, formatted_text);
923 static void tagDataRelease (struct lexSpec *spec)
927 if ((res = spec->d1_stack[spec->d1_level]) &&
928 res->which == DATA1N_data &&
929 res->u.data.what == DATA1I_text)
931 assert (!res->u.data.data);
932 assert (res->u.data.len > 0);
933 if (res->u.data.len > DATA1_LOCALDATA)
934 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
936 res->u.data.data = res->lbuf;
937 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
942 static void variantBegin (struct lexSpec *spec,
943 const char *class_str, int class_len,
944 const char *type_str, int type_len,
945 const char *value_str, int value_len)
947 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
948 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
953 if (spec->d1_level == 0)
955 logf (LOG_WARN, "in variant begin. No record type defined");
958 if (class_len >= DATA1_MAX_SYMBOL)
959 class_len = DATA1_MAX_SYMBOL-1;
960 memcpy (tclass, class_str, class_len);
961 tclass[class_len] = '\0';
963 if (type_len >= DATA1_MAX_SYMBOL)
964 type_len = DATA1_MAX_SYMBOL-1;
965 memcpy (ttype, type_str, type_len);
966 ttype[type_len] = '\0';
969 logf (LOG_LOG, "variant begin %s %s (%d)", tclass, ttype,
974 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
978 if (parent->which != DATA1N_variant)
980 res = data1_mk_node (spec->dh, spec->m);
981 res->parent = parent;
982 res->which = DATA1N_variant;
983 res->u.variant.type = 0;
984 res->u.variant.value = 0;
985 res->root = parent->root;
987 parent->last_child = res;
988 if (spec->d1_stack[spec->d1_level])
990 tagDataRelease (spec);
991 spec->d1_stack[spec->d1_level]->next = res;
995 spec->d1_stack[spec->d1_level] = res;
996 spec->d1_stack[++(spec->d1_level)] = NULL;
998 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
999 if (spec->d1_stack[i]->u.variant.type == tp)
1006 logf (LOG_LOG, "variant node (%d)", spec->d1_level);
1008 parent = spec->d1_stack[spec->d1_level-1];
1009 res = data1_mk_node (spec->dh, spec->m);
1010 res->parent = parent;
1011 res->which = DATA1N_variant;
1012 res->root = parent->root;
1013 res->u.variant.type = tp;
1015 if (value_len >= DATA1_LOCALDATA)
1016 value_len =DATA1_LOCALDATA-1;
1017 memcpy (res->lbuf, value_str, value_len);
1018 res->lbuf[value_len] = '\0';
1020 res->u.variant.value = res->lbuf;
1022 parent->last_child = res;
1023 if (spec->d1_stack[spec->d1_level])
1025 tagDataRelease (spec);
1026 spec->d1_stack[spec->d1_level]->next = res;
1029 parent->child = res;
1030 spec->d1_stack[spec->d1_level] = res;
1031 spec->d1_stack[++(spec->d1_level)] = NULL;
1034 static void tagStrip (const char **tag, int *len)
1038 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
1041 for (i = 0; i < *len && isspace((*tag)[i]); i++)
1047 static void tagBegin (struct lexSpec *spec,
1048 const char *tag, int len)
1050 struct data1_node *parent;
1051 data1_element *elem = NULL;
1054 data1_element *e = NULL;
1057 if (spec->d1_level == 0)
1059 logf (LOG_WARN, "in element begin. No record type defined");
1062 tagStrip (&tag, &len);
1064 parent = spec->d1_stack[spec->d1_level -1];
1065 partag = get_parent_tag(spec->dh, parent);
1067 res = data1_mk_node_type (spec->dh, spec->m, DATA1N_tag);
1068 res->parent = parent;
1070 if (len >= DATA1_LOCALDATA)
1071 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
1073 res->u.tag.tag = res->lbuf;
1075 memcpy (res->u.tag.tag, tag, len);
1076 res->u.tag.tag[len] = '\0';
1079 logf (LOG_LOG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1081 if (parent->which == DATA1N_variant)
1084 if (!(e = partag->u.tag.element))
1087 elem = data1_getelementbytagname (spec->dh,
1088 spec->d1_stack[0]->u.root.absyn,
1090 res->u.tag.element = elem;
1091 res->root = parent->root;
1093 parent->last_child = res;
1094 if (spec->d1_stack[spec->d1_level])
1096 tagDataRelease (spec);
1097 spec->d1_stack[spec->d1_level]->next = res;
1100 parent->child = res;
1101 spec->d1_stack[spec->d1_level] = res;
1102 spec->d1_stack[++(spec->d1_level)] = NULL;
1105 static void tagEnd (struct lexSpec *spec, int min_level,
1106 const char *tag, int len)
1108 tagStrip (&tag, &len);
1109 while (spec->d1_level > min_level)
1111 tagDataRelease (spec);
1113 if (spec->d1_level == 0)
1115 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1117 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1119 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1123 logf (LOG_LOG, "end tag (%d)", spec->d1_level);
1128 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1131 struct DFA_state *state = dfa->states[0];
1134 unsigned char c_prev = 0;
1135 int ptr = *pptr; /* current pointer */
1136 int start_ptr = *pptr; /* first char of match */
1137 int last_ptr = 0; /* last char of match */
1138 int last_rule = 0; /* rule number of current match */
1143 c = f_win_advance (spec, &ptr);
1144 if (ptr == F_WIN_EOF)
1161 *mptr = start_ptr; /* match starts here */
1162 *pptr = last_ptr; /* match end here (+1) */
1165 state = dfa->states[0];
1170 else if (c >= t->ch[0] && c <= t->ch[1])
1172 state = dfa->states[t->to];
1177 last_rule = state->rule_no;
1182 last_rule = state->rule_nno;
1194 static int execTok (struct lexSpec *spec, const char **src,
1195 const char **tokBuf, int *tokLen)
1197 const char *s = *src;
1199 while (*s == ' ' || *s == '\t')
1203 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1207 while (*s >= '0' && *s <= '9')
1208 n = n*10 + (*s++ -'0');
1209 if (spec->arg_no == 0)
1216 if (n >= spec->arg_no)
1218 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1222 else if (*s == '\"')
1225 while (*s && *s != '\"')
1227 *tokLen = s - *tokBuf;
1232 else if (*s == '\n' || *s == ';')
1240 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1243 *tokLen = s - *tokBuf;
1250 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1253 *tokLen = s - *tokBuf;
1259 static char *regxStrz (const char *src, int len, char *str)
1263 memcpy (str, src, len);
1269 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1270 int argc, char **argv)
1272 struct lexSpec *spec = (struct lexSpec *) clientData;
1275 if (!strcmp(argv[1], "record") && argc == 3)
1277 char *absynName = argv[2];
1281 logf (LOG_LOG, "begin record %s", absynName);
1283 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1284 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1289 res = data1_mk_node (spec->dh, spec->m);
1290 res->which = DATA1N_root;
1292 data1_insert_string(spec->dh, res, spec->m, absynName);
1293 res->u.root.absyn = absyn;
1296 spec->d1_stack[spec->d1_level] = res;
1297 spec->d1_stack[++(spec->d1_level)] = NULL;
1300 else if (!strcmp(argv[1], "element") && argc == 3)
1302 tagBegin (spec, argv[2], strlen(argv[2]));
1304 else if (!strcmp (argv[1], "variant") && argc == 5)
1306 variantBegin (spec, argv[2], strlen(argv[2]),
1307 argv[3], strlen(argv[3]),
1308 argv[4], strlen(argv[4]));
1310 else if (!strcmp (argv[1], "context") && argc == 3)
1312 struct lexContext *lc = spec->context;
1314 logf (LOG_LOG, "begin context %s",argv[2]);
1316 while (lc && strcmp (argv[2], lc->name))
1320 spec->context_stack[++(spec->context_stack_top)] = lc;
1323 logf (LOG_WARN, "unknown context %s", argv[2]);
1330 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1331 int argc, char **argv)
1333 struct lexSpec *spec = (struct lexSpec *) clientData;
1337 if (!strcmp (argv[1], "record"))
1339 while (spec->d1_level)
1341 tagDataRelease (spec);
1345 logf (LOG_LOG, "end record");
1347 spec->stop_flag = 1;
1349 else if (!strcmp (argv[1], "element"))
1353 if (argc >= 3 && !strcmp(argv[2], "-record"))
1362 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1363 if (spec->d1_level == 0)
1366 logf (LOG_LOG, "end element end records");
1368 spec->stop_flag = 1;
1371 else if (!strcmp (argv[1], "context"))
1374 logf (LOG_LOG, "end context");
1376 if (spec->context_stack_top)
1377 (spec->context_stack_top)--;
1384 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1385 int argc, char **argv)
1389 const char *element = 0;
1390 struct lexSpec *spec = (struct lexSpec *) clientData;
1394 if (!strcmp("-text", argv[argi]))
1399 else if (!strcmp("-element", argv[argi]))
1403 element = argv[argi++];
1409 tagBegin (spec, element, strlen(element));
1413 #if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0)
1415 char *native = Tcl_UtfToExternalDString(0, argv[argi], -1, &ds);
1416 execData (spec, native, strlen(native), textFlag);
1417 Tcl_DStringFree (&ds);
1419 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1424 tagEnd (spec, 1, NULL, 0);
1428 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1429 int argc, char **argv)
1431 struct lexSpec *spec = (struct lexSpec *) clientData;
1438 if (!strcmp("-offset", argv[argi]))
1443 offset = atoi(argv[argi]);
1452 no = atoi(argv[argi]);
1453 if (no >= spec->arg_no)
1454 no = spec->arg_no - 1;
1455 spec->ptr = spec->arg_start[no] + offset;
1459 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1463 for (i = 0; i < spec->arg_no; i++)
1465 char var_name[10], *var_buf;
1468 sprintf (var_name, "%d", i);
1469 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
1473 ch = var_buf[var_len];
1474 var_buf[var_len] = '\0';
1475 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1476 var_buf[var_len] = ch;
1479 #if HAVE_TCL_OBJECTS
1480 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1482 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1486 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1487 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1488 spec->tcl_interp->errorLine,
1489 spec->tcl_interp->result,
1490 err ? err : "[NO ERRORINFO]");
1496 static void execCode (struct lexSpec *spec, struct regxCode *code)
1498 const char *s = code->str;
1500 const char *cmd_str;
1502 r = execTok (spec, &s, &cmd_str, &cmd_len);
1509 r = execTok (spec, &s, &cmd_str, &cmd_len);
1512 p = regxStrz (cmd_str, cmd_len, ptmp);
1513 if (!strcmp (p, "begin"))
1515 r = execTok (spec, &s, &cmd_str, &cmd_len);
1518 logf (LOG_WARN, "missing keyword after 'begin'");
1521 p = regxStrz (cmd_str, cmd_len, ptmp);
1522 if (!strcmp (p, "record"))
1524 r = execTok (spec, &s, &cmd_str, &cmd_len);
1527 if (spec->d1_level == 0)
1529 static char absynName[64];
1534 memcpy (absynName, cmd_str, cmd_len);
1535 absynName[cmd_len] = '\0';
1538 logf (LOG_LOG, "begin record %s", absynName);
1540 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1541 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1546 res = data1_mk_node (spec->dh, spec->m);
1547 res->which = DATA1N_root;
1548 res->u.root.type = absynName;
1549 res->u.root.absyn = absyn;
1552 spec->d1_stack[spec->d1_level] = res;
1553 spec->d1_stack[++(spec->d1_level)] = NULL;
1556 r = execTok (spec, &s, &cmd_str, &cmd_len);
1558 else if (!strcmp (p, "element"))
1560 r = execTok (spec, &s, &cmd_str, &cmd_len);
1563 tagBegin (spec, cmd_str, cmd_len);
1564 r = execTok (spec, &s, &cmd_str, &cmd_len);
1566 else if (!strcmp (p, "variant"))
1569 const char *class_str = NULL;
1571 const char *type_str = NULL;
1573 const char *value_str = NULL;
1574 r = execTok (spec, &s, &cmd_str, &cmd_len);
1577 class_str = cmd_str;
1578 class_len = cmd_len;
1579 r = execTok (spec, &s, &cmd_str, &cmd_len);
1585 r = execTok (spec, &s, &cmd_str, &cmd_len);
1588 value_str = cmd_str;
1589 value_len = cmd_len;
1591 variantBegin (spec, class_str, class_len,
1592 type_str, type_len, value_str, value_len);
1595 r = execTok (spec, &s, &cmd_str, &cmd_len);
1597 else if (!strcmp (p, "context"))
1601 struct lexContext *lc = spec->context;
1602 r = execTok (spec, &s, &cmd_str, &cmd_len);
1603 p = regxStrz (cmd_str, cmd_len, ptmp);
1605 logf (LOG_LOG, "begin context %s", p);
1607 while (lc && strcmp (p, lc->name))
1610 spec->context_stack[++(spec->context_stack_top)] = lc;
1612 logf (LOG_WARN, "unknown context %s", p);
1615 r = execTok (spec, &s, &cmd_str, &cmd_len);
1619 logf (LOG_WARN, "bad keyword '%s' after begin", p);
1622 else if (!strcmp (p, "end"))
1624 r = execTok (spec, &s, &cmd_str, &cmd_len);
1627 logf (LOG_WARN, "missing keyword after 'end'");
1630 p = regxStrz (cmd_str, cmd_len, ptmp);
1631 if (!strcmp (p, "record"))
1633 while (spec->d1_level)
1635 tagDataRelease (spec);
1638 r = execTok (spec, &s, &cmd_str, &cmd_len);
1640 logf (LOG_LOG, "end record");
1642 spec->stop_flag = 1;
1644 else if (!strcmp (p, "element"))
1647 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1649 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1654 tagEnd (spec, min_level, cmd_str, cmd_len);
1655 r = execTok (spec, &s, &cmd_str, &cmd_len);
1658 tagEnd (spec, min_level, NULL, 0);
1659 if (spec->d1_level == 0)
1662 logf (LOG_LOG, "end element end records");
1664 spec->stop_flag = 1;
1668 else if (!strcmp (p, "context"))
1671 logf (LOG_LOG, "end context");
1673 if (spec->context_stack_top)
1674 (spec->context_stack_top)--;
1675 r = execTok (spec, &s, &cmd_str, &cmd_len);
1678 logf (LOG_WARN, "bad keyword '%s' after end", p);
1680 else if (!strcmp (p, "data"))
1684 const char *element_str = NULL;
1686 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1688 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1690 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1692 r = execTok (spec, &s, &element_str, &element_len);
1697 logf (LOG_WARN, "bad data option: %.*s",
1702 logf (LOG_WARN, "missing data item after data");
1706 tagBegin (spec, element_str, element_len);
1709 execData (spec, cmd_str, cmd_len,textFlag);
1710 r = execTok (spec, &s, &cmd_str, &cmd_len);
1713 tagEnd (spec, 1, NULL, 0);
1715 else if (!strcmp (p, "unread"))
1718 r = execTok (spec, &s, &cmd_str, &cmd_len);
1719 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1721 r = execTok (spec, &s, &cmd_str, &cmd_len);
1724 logf (LOG_WARN, "missing number after -offset");
1727 p = regxStrz (cmd_str, cmd_len, ptmp);
1729 r = execTok (spec, &s, &cmd_str, &cmd_len);
1735 logf (LOG_WARN, "missing index after unread command");
1738 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1740 logf (LOG_WARN, "bad index after unread command");
1745 no = *cmd_str - '0';
1746 if (no >= spec->arg_no)
1747 no = spec->arg_no - 1;
1748 spec->ptr = spec->arg_start[no] + offset;
1750 r = execTok (spec, &s, &cmd_str, &cmd_len);
1752 else if (!strcmp (p, "context"))
1756 struct lexContext *lc = spec->context;
1757 r = execTok (spec, &s, &cmd_str, &cmd_len);
1758 p = regxStrz (cmd_str, cmd_len, ptmp);
1760 while (lc && strcmp (p, lc->name))
1763 spec->context_stack[spec->context_stack_top] = lc;
1765 logf (LOG_WARN, "unknown context %s", p);
1768 r = execTok (spec, &s, &cmd_str, &cmd_len);
1772 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1773 r = execTok (spec, &s, &cmd_str, &cmd_len);
1778 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1780 r = execTok (spec, &s, &cmd_str, &cmd_len);
/*
 * execAction: process the rule-action list `ap` for a match that starts at
 * start_ptr, with *pptr as the current read position.
 *
 * What the visible lines establish:
 *  - argument 0 starts at start_ptr;
 *  - the local arg_start / arg_end arrays are published to the rest of the
 *    filter through spec->arg_start / spec->arg_end;
 *  - pattern actions are matched with tryMatch() against
 *    ap->u.pattern.dfa, and the resulting start/end positions are stored
 *    in arg_start[arg_no] / arg_end[arg_no] (F_WIN_EOF is stored when the
 *    match fails);
 *  - spec->arg_no is set from arg_no right before the code-execution lines;
 *  - code actions are run with execTcl() when spec->tcl_interp is set and
 *    with execCode() otherwise.
 *
 * NOTE(review): the local declarations, the loop/switch over action types,
 * the arg_no increments and the return statements are on lines that are not
 * shown in this excerpt, so the exact return contract cannot be stated here.
 */
1787 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1788 int start_ptr, int *pptr)
1797 arg_start[0] = start_ptr;
/* Make the argument ranges reachable from spec for the code executed below. */
1799 spec->arg_start = arg_start;
1800 spec->arg_end = arg_end;
/* Pattern action: the branch taken when ap->u.pattern.body is non-zero. */
1807 if (ap->u.pattern.body)
1809 arg_start[arg_no] = *pptr;
1810 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
/* tryMatch() returned 0: the ranges are marked with F_WIN_EOF. */
1812 arg_end[arg_no] = F_WIN_EOF;
1814 arg_start[arg_no] = F_WIN_EOF;
1815 arg_end[arg_no] = F_WIN_EOF;
/* sptr (filled in by tryMatch) splits the text into two argument ranges. */
1820 arg_end[arg_no] = sptr;
1822 arg_start[arg_no] = sptr;
1823 arg_end[arg_no] = *pptr;
/* Second tryMatch() site; presumably the branch without a body -- confirm. */
1828 arg_start[arg_no] = *pptr;
1829 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1831 if (sptr != arg_start[arg_no])
1833 arg_end[arg_no] = *pptr;
/* Publish the number of arguments before running the action's code. */
1838 spec->arg_no = arg_no;
/*
 * Use the Tcl interpreter when one is attached to the spec.
 * NOTE(review): the second execCode() call below presumably belongs to a
 * build without Tcl support; the preprocessor lines are not shown.
 */
1841 if (spec->tcl_interp)
1842 execTcl(spec, ap->u.code);
1844 execCode (spec, ap->u.code);
1846 execCode (spec, ap->u.code);
/* spec->stop_flag is checked after the code has run. */
1849 if (spec->stop_flag)
1853 arg_start[arg_no] = *pptr;
1854 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: log the rule number and run the action list of rule `ruleNo`
 * of `context` (context->fastRule[ruleNo]->actionList) through execAction(),
 * returning execAction()'s result.
 * NOTE(review): the remaining arguments of the execAction() call are on a
 * line not shown in this excerpt.
 */
1863 static int execRule (struct lexSpec *spec, struct lexContext *context,
1864 int ruleNo, int start_ptr, int *pptr)
1867 logf (LOG_LOG, "exec rule %d", ruleNo);
1869 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: scan the input starting at *ptr with the DFA of the context that
 * is on top of spec->context_stack.
 *
 * What the visible lines establish:
 *  - characters are fetched with f_win_advance(), which also moves *ptr;
 *    *ptr == F_WIN_EOF is treated as end of file;
 *  - last_rule holds the rule number of the current match (0 initially),
 *    taken from state->rule_no or state->rule_nno of the DFA state reached;
 *  - text between skip_ptr and the start of a match (or the current
 *    position) is fetched with f_win_get() and passed to execDataP();
 *  - a matched rule is run with execRule();
 *  - after a rule has run, the context is re-read from the context stack
 *    and the DFA restarts from states[0];
 *  - when spec->f_win_ef is set and *ptr is not F_WIN_EOF, the callback is
 *    invoked with spec->f_win_fh and *ptr.
 *
 * NOTE(review): the loop construct, several conditions, the transition
 * lookup and all return statements are on lines not shown in this excerpt.
 */
1873 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1875 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1876 struct DFA_state *state = context->dfa->states[0];
1879 unsigned char c_prev = '\n';
1881 int last_rule = 0; /* rule number of current match */
1882 int last_ptr = *ptr; /* last char of match */
1883 int start_ptr = *ptr; /* first char of match */
1884 int skip_ptr = *ptr; /* first char of run */
/* Read the next character; *ptr is advanced by f_win_advance(). */
1888 c = f_win_advance (spec, ptr);
1889 if (*ptr == F_WIN_EOF)
1891 /* end of file met */
1894 /* there was a match */
1895 if (skip_ptr < start_ptr)
1897 /* deal with chars that didn't match */
1900 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1901 execDataP (spec, buf, size, 0);
1903 /* restore pointer */
1906 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1908 /* restore skip pointer */
1912 else if (skip_ptr < *ptr)
1914 /* deal with chars that didn't match */
1917 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1918 execDataP (spec, buf, size, 0);
1920 if (*ptr == F_WIN_EOF)
1927 { /* no transition for character c ... */
1930 if (skip_ptr < start_ptr)
1932 /* deal with chars that didn't match */
1935 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1936 execDataP (spec, buf, size, 0);
1938 /* restore pointer */
1940 if (!execRule (spec, context, last_rule, start_ptr, ptr))
/* Call the f_win_ef callback, if any, with the current position. */
1942 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1945 logf (LOG_LOG, "regx: endf ptr=%d", *ptr);
1947 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* Re-read the context from the context stack after the rule has run. */
1951 context = spec->context_stack[spec->context_stack_top];
1954 last_ptr = start_ptr = *ptr;
/* c_prev is refreshed from f_win_advance() on start_ptr. */
1958 c_prev = f_win_advance (spec, &start_ptr);
1963 c_prev = f_win_advance (spec, &start_ptr);
/* Restart the DFA from its initial state. */
1966 state = context->dfa->states[0];
1969 else if (c >= t->ch[0] && c <= t->ch[1])
1970 { /* transition ... */
1971 state = context->dfa->states[t->to];
1976 last_rule = state->rule_no;
1979 else if (state->rule_nno)
1981 last_rule = state->rule_nno;
/*
 * lexRoot: top-level driver for parsing one record.
 *
 * Visible steps:
 *  - clear spec->stop_flag and reset spec->context_stack_top to 0;
 *  - compare context names against context_name, starting from
 *    spec->context; "cannot find context" is logged as a warning
 *    (presumably when no name matches -- the lookup loop is not shown);
 *  - store the chosen context in the context stack and NULL in
 *    spec->d1_stack[spec->d1_level];
 *  - run the context's init and begin action lists with execAction();
 *  - call lexNode() to scan the input;
 *  - call tagDataRelease() while spec->d1_level is non-zero;
 *  - run the context's end action list;
 *  - return spec->d1_stack[0].
 *
 * NOTE(review): how `offset` is used, the declaration of `ptr`, and any
 * conditions guarding the execAction() calls are on lines not shown here.
 */
1993 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1994 const char *context_name)
1996 struct lexContext *lt = spec->context;
1999 spec->stop_flag = 0;
2001 spec->context_stack_top = 0;
2004 if (!strcmp (lt->name, context_name))
2010 logf (LOG_WARN, "cannot find context %s", context_name);
2013 spec->context_stack[spec->context_stack_top] = lt;
2014 spec->d1_stack[spec->d1_level] = NULL;
2019 execAction (spec, lt->initActionList, ptr, &ptr);
2022 execAction (spec, lt->beginActionList, ptr, &ptr);
2023 lexNode (spec, &ptr);
2024 while (spec->d1_level)
2026 tagDataRelease (spec);
2029 execAction (spec, lt->endActionList, ptr, &ptr);
2030 return spec->d1_stack[0];
/*
 * grs_destroy: tear down the filter state passed as clientData.
 * clientData is treated as a struct lexSpecs; its `spec` member is released
 * with lexSpecDestroy().
 * NOTE(review): whether `specs` itself is freed is on lines not shown here.
 */
2033 void grs_destroy(void *clientData)
2035 struct lexSpecs *specs = (struct lexSpecs *) clientData;
2038 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocate a struct lexSpecs with xmalloc().
 * NOTE(review): the initialisation of its members and the return statement
 * are on lines not shown in this excerpt.
 */
2043 void *grs_init(void)
2045 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/*
 * grs_read_regx: reader entry point for the regx filter.
 *
 * The lexSpec kept in p->clientData is reused when it exists and its name
 * equals p->type.  Otherwise the old one is destroyed, a new one is created
 * with lexSpecCreate(p->type, p->dh) and loaded with readFileSpec().
 * The second lexSpecDestroy() call is presumably the error path for a
 * failed readFileSpec() -- the condition is on a line not shown.
 *
 * Before parsing, the spec is pointed at the caller's data1 handle, read /
 * seek / end callbacks, file handle and memory handle, its window start and
 * end are reset to 0 and f_win_size is set to 500000.  Parsing then starts
 * at p->offset in the context named "main" via lexRoot(), whose result is
 * returned.
 */
2050 data1_node *grs_read_regx (struct grs_read_info *p)
2053 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2054 struct lexSpec **curLexSpec = &specs->spec;
2057 logf (LOG_LOG, "grs_read_regx");
/* (Re)build the spec when there is none or it was made for another type. */
2059 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2062 lexSpecDestroy (curLexSpec);
2063 *curLexSpec = lexSpecCreate (p->type, p->dh);
2064 res = readFileSpec (*curLexSpec);
2067 lexSpecDestroy (curLexSpec);
2071 (*curLexSpec)->dh = p->dh;
/* Reset the input window and hook up the caller's I/O callbacks. */
2074 (*curLexSpec)->f_win_start = 0;
2075 (*curLexSpec)->f_win_end = 0;
2076 (*curLexSpec)->f_win_rf = p->readf;
2077 (*curLexSpec)->f_win_sf = p->seekf;
2078 (*curLexSpec)->f_win_fh = p->fh;
2079 (*curLexSpec)->f_win_ef = p->endf;
2080 (*curLexSpec)->f_win_size = 500000;
2082 (*curLexSpec)->m = p->mem;
2083 return lexRoot (*curLexSpec, p->offset, "main");
/*
 * Static recTypeGrs instance for the regx reader.
 * NOTE(review): its initializer fields and closing brace are on lines not
 * shown in this excerpt.
 */
2086 static struct recTypeGrs regx_type = {
/*
 * Exported handle for the regx record type: the address of regx_type,
 * in the same way recTypeGrs_tcl is the address of tcl_type.
 * (The text previously read "(R)x_type" with a registered-trademark sign:
 * the "&reg" at the start of "&regx_type" had been decoded as an HTML
 * character reference.)
 */
2093 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: reader entry point for the Tcl filter.
 *
 * The lexSpec kept in p->clientData is reused when it exists and its name
 * equals p->type.  Otherwise the old one is destroyed and a new one is
 * created with lexSpecCreate(p->type, p->dh); a Tcl interpreter is created,
 * stored in the spec's tcl_interp member and initialised with Tcl_Init();
 * the Tcl commands "begin", "end", "data" and "unread" are registered; and
 * the filter specification is loaded with readFileSpec().
 * The second lexSpecDestroy() call is presumably the error path for a
 * failed readFileSpec() -- the condition is on a line not shown.
 *
 * Before parsing, the spec is pointed at the caller's data1 handle, read /
 * seek / end callbacks, file handle and memory handle, its window start and
 * end are reset to 0 and f_win_size is set to 500000.  Parsing then starts
 * at p->offset in the context named "main" via lexRoot(), whose result is
 * returned.
 */
2096 data1_node *grs_read_tcl (struct grs_read_info *p)
2099 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2100 struct lexSpec **curLexSpec = &specs->spec;
2103 logf (LOG_LOG, "grs_read_tcl");
/* (Re)build the spec when there is none or it was made for another type. */
2105 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2107 Tcl_Interp *tcl_interp;
2109 lexSpecDestroy (curLexSpec);
2110 *curLexSpec = lexSpecCreate (p->type, p->dh);
2111 Tcl_FindExecutable("");
2112 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
2113 Tcl_Init(tcl_interp);
/* Register the filter's Tcl commands; the spec is passed as client data. */
2114 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2115 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2116 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
2117 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2119 res = readFileSpec (*curLexSpec);
2122 lexSpecDestroy (curLexSpec);
2126 (*curLexSpec)->dh = p->dh;
/* Reset the input window and hook up the caller's I/O callbacks. */
2129 (*curLexSpec)->f_win_start = 0;
2130 (*curLexSpec)->f_win_end = 0;
2131 (*curLexSpec)->f_win_rf = p->readf;
2132 (*curLexSpec)->f_win_sf = p->seekf;
2133 (*curLexSpec)->f_win_fh = p->fh;
2134 (*curLexSpec)->f_win_ef = p->endf;
2135 (*curLexSpec)->f_win_size = 500000;
2137 (*curLexSpec)->m = p->mem;
2138 return lexRoot (*curLexSpec, p->offset, "main");
/*
 * Static recTypeGrs instance for the Tcl reader.
 * NOTE(review): its initializer fields and closing brace are on lines not
 * shown in this excerpt.
 */
2141 static struct recTypeGrs tcl_type = {
/* Exported handle for the Tcl record type: the address of tcl_type. */
2148 RecTypeGrs recTypeGrs_tcl = &tcl_type;