2 * Copyright (C) 1994-1999, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.34 2000-11-29 14:24:01 adam
8 * Script configure uses yaz pthreads options. Added locking for
9 * zebra_register_{lock,unlock}.
11 * Revision 1.33 1999/11/30 13:48:04 adam
12 * Improved installation. Updated for inclusion of YAZ header files.
14 * Revision 1.32 1999/09/07 07:19:21 adam
15 * Work on character mapping. Implemented replace rules.
17 * Revision 1.31 1999/07/14 13:05:29 adam
18 * Tcl filter works with objects when TCL is version 8 or later; filter
19 * works with strings otherwise (slow).
21 * Revision 1.30 1999/07/14 10:55:28 adam
24 * Revision 1.29 1999/07/12 07:27:54 adam
25 * Improved speed of Tcl processing. Fixed one memory leak.
27 * Revision 1.28 1999/07/06 12:26:04 adam
28 * Fixed filters so that MS-DOS CR is ignored.
30 * Revision 1.27 1999/06/28 13:25:40 quinn
31 * Improved diagnostics for Tcl
33 * Revision 1.26 1999/05/26 07:49:14 adam
36 * Revision 1.25 1999/05/25 12:33:32 adam
37 * Fixed bug in Tcl filter.
39 * Revision 1.24 1999/05/21 11:08:46 adam
40 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
41 * script so that it reads uninstalled Tcl source.
43 * Revision 1.23 1999/05/20 12:57:18 adam
44 * Implemented TCL filter. Updated recctrl system.
46 * Revision 1.22 1998/11/03 16:07:13 adam
49 * Revision 1.21 1998/11/03 15:43:39 adam
50 * Fixed bug introduced by previous commit.
52 * Revision 1.20 1998/11/03 14:51:28 adam
53 * Changed code so that it creates as few data1 nodes as possible.
55 * Revision 1.19 1998/11/03 10:22:39 adam
56 * Fixed memory leak that could occur when large data1 nodes were
57 * concatenated. Data-type data1_nodes may have multiple nodes.
59 * Revision 1.18 1998/10/15 13:11:47 adam
60 * Added support for option -record for "end element". When specified
61 * end element will mark end-of-record when at outer-level.
63 * Revision 1.17 1998/07/01 10:13:51 adam
66 * Revision 1.16 1998/06/30 15:15:09 adam
67 * Tags are trimmed: white space removed before and after the tag.
69 * Revision 1.15 1998/06/30 12:55:45 adam
72 * Revision 1.14 1998/03/05 08:41:00 adam
73 * Implemented rule contexts.
75 * Revision 1.13 1997/12/12 06:33:58 adam
76 * Fixed bug that showed up when multiple filters were used.
77 * Made one routine thread-safe.
79 * Revision 1.12 1997/11/18 10:03:24 adam
80 * Member num_children removed from data1_node.
82 * Revision 1.11 1997/11/06 11:41:01 adam
83 * Implemented "begin variant" for the sgml.regx filter.
85 * Revision 1.10 1997/10/31 12:36:12 adam
86 * Minor change that avoids compiler warning.
88 * Revision 1.9 1997/09/29 09:02:49 adam
89 * Fixed small bug (introduced by previous commit).
91 * Revision 1.8 1997/09/17 12:19:22 adam
92 * Zebra version corresponds to YAZ version 1.4.
93 * Changed Zebra server so that it doesn't depend on global common_resource.
95 * Revision 1.7 1997/07/15 16:33:07 adam
96 * Check for zero length in execData.
98 * Revision 1.6 1997/02/24 10:41:51 adam
99 * Cleanup of code and commented out the "end element-end-record" code.
101 * Revision 1.5 1997/02/19 16:22:33 adam
102 * Fixed "end element" to terminate record in outer-most level.
104 * Revision 1.4 1997/02/12 20:42:58 adam
105 * Changed some log messages.
107 * Revision 1.3 1996/11/08 14:05:33 adam
108 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
110 * Revision 1.2 1996/10/29 14:02:09 adam
111 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
112 * data1_get_tabpath is used.
114 * Revision 1.1 1996/10/11 10:57:30 adam
115 * New module recctrl. Used to manage records (extract/retrieval).
117 * Revision 1.24 1996/06/17 14:25:31 adam
118 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
120 * Revision 1.23 1996/06/04 10:19:00 adam
121 * Minor changes - removed include of ctype.h.
123 * Revision 1.22 1996/06/03 15:23:13 adam
124 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
126 * Revision 1.21 1996/05/14 16:58:38 adam
129 * Revision 1.20 1996/05/01 13:46:36 adam
130 * First work on multiple records in one file.
131 * New option, -offset, to the "unread" command in the filter module.
133 * Revision 1.19 1996/02/12 16:18:20 adam
134 * Yet another bug fix in implementation of unread command.
136 * Revision 1.18 1996/02/12 16:07:54 adam
137 * Bug fix in new unread command.
139 * Revision 1.17 1996/02/12 15:56:11 adam
140 * New code command: unread.
142 * Revision 1.16 1996/01/17 14:57:51 adam
143 * Prototype changed for reader functions in extract/retrieve. File
144 * is identified by 'void *' instead of 'int'.
146 * Revision 1.15 1996/01/08 19:15:47 adam
147 * New input filter that works!
149 * Revision 1.14 1996/01/08 09:10:38 adam
150 * Yet another complete rework on this module.
152 * Revision 1.13 1995/12/15 17:21:50 adam
153 * This version is able to set data.formatted_text in data1-nodes.
155 * Revision 1.12 1995/12/15 16:20:10 adam
156 * The filter files (*.flt) are read from the path given by data1_tabpath.
158 * Revision 1.11 1995/12/15 12:35:16 adam
161 * Revision 1.10 1995/12/15 10:35:36 adam
164 * Revision 1.9 1995/12/14 16:38:48 adam
165 * Completely new attempt to make regular expression parsing.
167 * Revision 1.8 1995/12/13 17:16:59 adam
170 * Revision 1.7 1995/12/13 16:51:58 adam
171 * Modified to set last_child in data1_nodes.
172 * Uses destroy handler to free up data text nodes.
174 * Revision 1.6 1995/12/13 13:45:37 quinn
175 * Changed data1 to use nmem.
177 * Revision 1.5 1995/12/11 09:12:52 adam
178 * The rec_get function returns NULL if record doesn't exist - will
179 * happen in the server if the result set records have been deleted since
180 * the creation of the set (i.e. the search).
181 * The server saves a result temporarily if it is 'volatile', i.e. the
182 * set is register dependent.
184 * Revision 1.4 1995/12/05 16:57:40 adam
185 * More work on regular patterns.
187 * Revision 1.3 1995/12/05 09:37:09 adam
188 * One malloc was renamed to xmalloc.
190 * Revision 1.2 1995/12/04 17:59:24 adam
191 * More work on regular expression conversion.
193 * Revision 1.1 1995/12/04 14:25:30 adam
194 * Started work on regular expression parsed input to structured records.
202 #include <yaz/tpath.h>
203 #include <zebrautl.h>
210 #if MAJOR_VERSION >= 8
211 #define HAVE_TCL_OBJECTS
217 #define F_WIN_EOF 2000000000
221 #define REGX_PATTERN 1
226 #define REGX_CONTEXT 6
236 struct lexRuleAction {
240 struct DFA *dfa; /* REGX_PATTERN */
243 struct regxCode *code; /* REGX_CODE */
245 struct lexRuleAction *next;
250 struct lexRuleAction *actionList;
254 struct lexRuleInfo info;
255 struct lexRule *next;
261 struct lexRule *rules;
262 struct lexRuleInfo **fastRule;
266 struct lexRuleAction *beginActionList;
267 struct lexRuleAction *endActionList;
268 struct lexRuleAction *initActionList;
269 struct lexContext *next;
272 struct lexConcatBuf {
279 struct lexContext *context;
281 struct lexContext **context_stack;
282 int context_stack_size;
283 int context_stack_top;
289 Tcl_Interp *tcl_interp;
292 void (*f_win_ef)(void *, off_t);
294 int f_win_start; /* first byte of buffer is this file offset */
295 int f_win_end; /* last byte of buffer is this offset - 1 */
296 int f_win_size; /* size of buffer */
297 char *f_win_buf; /* buffer itself */
298 int (*f_win_rf)(void *, char *, size_t);
299 off_t (*f_win_sf)(void *, off_t);
301 struct lexConcatBuf *concatBuf;
303 data1_node **d1_stack;
314 struct lexSpec *spec;
317 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
320 int i, r, off = start_pos - spec->f_win_start;
322 if (off >= 0 && end_pos <= spec->f_win_end)
324 *size = end_pos - start_pos;
325 return spec->f_win_buf + off;
327 if (off < 0 || start_pos >= spec->f_win_end)
329 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
330 spec->f_win_start = start_pos;
332 if (!spec->f_win_buf)
333 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
334 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
336 spec->f_win_end = spec->f_win_start + *size;
338 if (*size > end_pos - start_pos)
339 *size = end_pos - start_pos;
340 return spec->f_win_buf;
342 for (i = 0; i<spec->f_win_end - start_pos; i++)
343 spec->f_win_buf[i] = spec->f_win_buf[i + off];
344 r = (*spec->f_win_rf)(spec->f_win_fh,
346 spec->f_win_size - i);
347 spec->f_win_start = start_pos;
348 spec->f_win_end += r;
350 if (*size > end_pos - start_pos)
351 *size = end_pos - start_pos;
352 return spec->f_win_buf;
355 static int f_win_advance (struct lexSpec *spec, int *pos)
360 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
361 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
362 if (*pos == F_WIN_EOF)
364 buf = f_win_get (spec, *pos, *pos+1, &size);
374 static void regxCodeDel (struct regxCode **pp)
376 struct regxCode *p = *pp;
381 Tcl_DecrRefCount (p->tcl_obj);
389 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
393 p = (struct regxCode *) xmalloc (sizeof(*p));
394 p->str = (char *) xmalloc (len+1);
395 memcpy (p->str, buf, len);
398 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
400 Tcl_IncrRefCount (p->tcl_obj);
405 static struct DFA *lexSpecDFA (void)
410 dfa_parse_cmap_del (dfa, ' ');
411 dfa_parse_cmap_del (dfa, '\t');
412 dfa_parse_cmap_add (dfa, '/', 0);
416 static void actionListDel (struct lexRuleAction **rap)
418 struct lexRuleAction *ra1, *ra;
420 for (ra = *rap; ra; ra = ra1)
426 dfa_delete (&ra->u.pattern.dfa);
429 regxCodeDel (&ra->u.code);
437 static struct lexContext *lexContextCreate (const char *name)
439 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
441 p->name = xstrdup (name);
444 p->dfa = lexSpecDFA ();
447 p->beginActionList = NULL;
448 p->endActionList = NULL;
449 p->initActionList = NULL;
454 static void lexContextDestroy (struct lexContext *p)
456 struct lexRule *rp, *rp1;
458 dfa_delete (&p->dfa);
460 for (rp = p->rules; rp; rp = rp1)
463 actionListDel (&rp->info.actionList);
466 actionListDel (&p->beginActionList);
467 actionListDel (&p->endActionList);
468 actionListDel (&p->initActionList);
473 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
478 p = (struct lexSpec *) xmalloc (sizeof(*p));
479 p->name = (char *) xmalloc (strlen(name)+1);
480 strcpy (p->name, name);
487 p->context_stack_size = 100;
488 p->context_stack = (struct lexContext **)
489 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
493 p->concatBuf = (struct lexConcatBuf *)
494 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
495 for (i = 0; i < p->maxLevel; i++)
497 p->concatBuf[i].max = 0;
498 p->concatBuf[i].buf = 0;
500 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
505 static void lexSpecDestroy (struct lexSpec **pp)
508 struct lexContext *lt;
516 for (i = 0; i < p->maxLevel; i++)
517 xfree (p->concatBuf[i].buf);
518 xfree (p->concatBuf);
523 struct lexContext *lt_next = lt->next;
524 lexContextDestroy (lt);
529 Tcl_DeleteInterp (p->tcl_interp);
532 xfree (p->f_win_buf);
533 xfree (p->context_stack);
539 static int readParseToken (const char **cpp, int *len)
541 const char *cp = *cpp;
545 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
574 if (*cp >= 'a' && *cp <= 'z')
576 else if (*cp >= 'A' && *cp <= 'Z')
577 cmd[i] = *cp + 'a' - 'A';
580 if (i < (int) sizeof(cmd)-2)
587 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
589 while (*cp && *cp != ' ' && *cp != '\t' &&
590 *cp != '\n' && *cp != '\r')
596 if (!strcmp (cmd, "begin"))
598 else if (!strcmp (cmd, "end"))
600 else if (!strcmp (cmd, "body"))
602 else if (!strcmp (cmd, "context"))
604 else if (!strcmp (cmd, "init"))
608 logf (LOG_WARN, "bad command %s", cmd);
614 static int actionListMk (struct lexSpec *spec, const char *s,
615 struct lexRuleAction **ap)
621 while ((tok = readParseToken (&s, &len)))
629 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
631 regxCodeMk (&(*ap)->u.code, s, len);
635 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
637 (*ap)->u.pattern.body = bodyMark;
639 (*ap)->u.pattern.dfa = lexSpecDFA ();
641 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
646 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
649 dfa_mkstate ((*ap)->u.pattern.dfa);
653 logf (LOG_WARN, "cannot use BEGIN here");
656 logf (LOG_WARN, "cannot use INIT here");
659 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
669 int readOneSpec (struct lexSpec *spec, const char *s)
673 struct lexContext *lc;
675 tok = readParseToken (&s, &len);
676 if (tok == REGX_CONTEXT)
678 char context_name[32];
679 tok = readParseToken (&s, &len);
680 if (tok != REGX_CODE)
682 logf (LOG_WARN, "missing name after CONTEXT keyword");
687 memcpy (context_name, s, len);
688 context_name[len] = '\0';
689 lc = lexContextCreate (context_name);
690 lc->next = spec->context;
695 spec->context = lexContextCreate ("main");
700 actionListDel (&spec->context->beginActionList);
701 actionListMk (spec, s, &spec->context->beginActionList);
704 actionListDel (&spec->context->endActionList);
705 actionListMk (spec, s, &spec->context->endActionList);
708 actionListDel (&spec->context->initActionList);
709 actionListMk (spec, s, &spec->context->initActionList);
713 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
715 r = dfa_parse (spec->context->dfa, &s);
718 logf (LOG_WARN, "regular expression error. r=%d", r);
723 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
727 rp = (struct lexRule *) xmalloc (sizeof(*rp));
728 rp->info.no = spec->context->ruleNo++;
729 rp->next = spec->context->rules;
730 spec->context->rules = rp;
731 actionListMk (spec, s, &rp->info.actionList);
736 int readFileSpec (struct lexSpec *spec)
738 struct lexContext *lc;
739 int c, i, errors = 0;
745 if (spec->tcl_interp)
747 sprintf (fname, "%s.tflt", spec->name);
748 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
753 sprintf (fname, "%s.flt", spec->name);
754 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
758 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
761 logf (LOG_LOG, "reading regx filter %s", fname);
763 if (spec->tcl_interp)
764 logf (LOG_LOG, "Tcl enabled");
766 lineBuf = wrbuf_alloc();
771 wrbuf_rewind (lineBuf);
772 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
774 while (c != '\n' && c != EOF)
787 wrbuf_putc(lineBuf, c);
795 if (c != ' ' && c != '\t')
800 wrbuf_putc(lineBuf, '\0');
801 readOneSpec (spec, wrbuf_buf(lineBuf));
802 spec->lineNo += addLine;
806 wrbuf_free(lineBuf, 1);
811 debug_dfa_followpos = 1;
814 for (lc = spec->context; lc; lc = lc->next)
817 lc->fastRule = (struct lexRuleInfo **)
818 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
819 for (i = 0; i < lc->ruleNo; i++)
820 lc->fastRule[i] = NULL;
821 for (rp = lc->rules; rp; rp = rp->next)
822 lc->fastRule[rp->info.no] = &rp->info;
823 dfa_mkstate (lc->dfa);
832 static struct lexSpec *curLexSpec = NULL;
835 static void execData (struct lexSpec *spec,
836 const char *ebuf, int elen, int formatted_text)
838 struct data1_node *res, *parent;
841 if (elen == 0) /* shouldn't happen, but it does! */
845 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
846 ebuf, 15, ebuf + elen-15);
848 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
850 logf (LOG_DEBUG, "data (%d bytes)", elen);
853 if (spec->d1_level <= 1)
856 parent = spec->d1_stack[spec->d1_level -1];
859 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
860 org_len = res->u.data.len;
865 res = data1_mk_node (spec->dh, spec->m);
866 res->parent = parent;
867 res->which = DATA1N_data;
868 res->u.data.what = DATA1I_text;
870 res->u.data.formatted_text = formatted_text;
872 if (elen > DATA1_LOCALDATA)
873 res->u.data.data = nmem_malloc (spec->m, elen);
875 res->u.data.data = res->lbuf;
876 memcpy (res->u.data.data, ebuf, elen);
878 res->u.data.data = 0;
880 res->root = parent->root;
882 parent->last_child = res;
883 if (spec->d1_stack[spec->d1_level])
884 spec->d1_stack[spec->d1_level]->next = res;
887 spec->d1_stack[spec->d1_level] = res;
889 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
891 char *old_buf, *new_buf;
893 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
894 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
895 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
897 memcpy (new_buf, old_buf, org_len);
900 spec->concatBuf[spec->d1_level].buf = new_buf;
902 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
903 res->u.data.len += elen;
906 static void execDataP (struct lexSpec *spec,
907 const char *ebuf, int elen, int formatted_text)
909 execData (spec, ebuf, elen, formatted_text);
912 static void tagDataRelease (struct lexSpec *spec)
916 if ((res = spec->d1_stack[spec->d1_level]) &&
917 res->which == DATA1N_data &&
918 res->u.data.what == DATA1I_text)
920 assert (!res->u.data.data);
921 assert (res->u.data.len > 0);
922 if (res->u.data.len > DATA1_LOCALDATA)
923 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
925 res->u.data.data = res->lbuf;
926 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
931 static void variantBegin (struct lexSpec *spec,
932 const char *class_str, int class_len,
933 const char *type_str, int type_len,
934 const char *value_str, int value_len)
936 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
937 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
942 if (spec->d1_level == 0)
944 logf (LOG_WARN, "in variant begin. No record type defined");
947 if (class_len >= DATA1_MAX_SYMBOL)
948 class_len = DATA1_MAX_SYMBOL-1;
949 memcpy (tclass, class_str, class_len);
950 tclass[class_len] = '\0';
952 if (type_len >= DATA1_MAX_SYMBOL)
953 type_len = DATA1_MAX_SYMBOL-1;
954 memcpy (ttype, type_str, type_len);
955 ttype[type_len] = '\0';
958 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype,
963 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
967 if (parent->which != DATA1N_variant)
969 res = data1_mk_node (spec->dh, spec->m);
970 res->parent = parent;
971 res->which = DATA1N_variant;
972 res->u.variant.type = 0;
973 res->u.variant.value = 0;
974 res->root = parent->root;
976 parent->last_child = res;
977 if (spec->d1_stack[spec->d1_level])
979 tagDataRelease (spec);
980 spec->d1_stack[spec->d1_level]->next = res;
984 spec->d1_stack[spec->d1_level] = res;
985 spec->d1_stack[++(spec->d1_level)] = NULL;
987 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
988 if (spec->d1_stack[i]->u.variant.type == tp)
995 logf (LOG_DEBUG, "variant node (%d)", spec->d1_level);
997 parent = spec->d1_stack[spec->d1_level-1];
998 res = data1_mk_node (spec->dh, spec->m);
999 res->parent = parent;
1000 res->which = DATA1N_variant;
1001 res->root = parent->root;
1002 res->u.variant.type = tp;
1004 if (value_len >= DATA1_LOCALDATA)
1005 value_len =DATA1_LOCALDATA-1;
1006 memcpy (res->lbuf, value_str, value_len);
1007 res->lbuf[value_len] = '\0';
1009 res->u.variant.value = res->lbuf;
1011 parent->last_child = res;
1012 if (spec->d1_stack[spec->d1_level])
1014 tagDataRelease (spec);
1015 spec->d1_stack[spec->d1_level]->next = res;
1018 parent->child = res;
1019 spec->d1_stack[spec->d1_level] = res;
1020 spec->d1_stack[++(spec->d1_level)] = NULL;
1023 static void tagStrip (const char **tag, int *len)
1027 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
1030 for (i = 0; i < *len && isspace((*tag)[i]); i++)
1036 static void tagBegin (struct lexSpec *spec,
1037 const char *tag, int len)
1039 struct data1_node *parent;
1040 data1_element *elem = NULL;
1043 data1_element *e = NULL;
1046 if (spec->d1_level == 0)
1048 logf (LOG_WARN, "in element begin. No record type defined");
1051 tagStrip (&tag, &len);
1053 parent = spec->d1_stack[spec->d1_level -1];
1054 partag = get_parent_tag(spec->dh, parent);
1056 res = data1_mk_node_type (spec->dh, spec->m, DATA1N_tag);
1057 res->parent = parent;
1059 if (len >= DATA1_LOCALDATA)
1060 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
1062 res->u.tag.tag = res->lbuf;
1064 memcpy (res->u.tag.tag, tag, len);
1065 res->u.tag.tag[len] = '\0';
1068 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1070 if (parent->which == DATA1N_variant)
1073 if (!(e = partag->u.tag.element))
1076 elem = data1_getelementbytagname (spec->dh,
1077 spec->d1_stack[0]->u.root.absyn,
1079 res->u.tag.element = elem;
1080 res->root = parent->root;
1082 parent->last_child = res;
1083 if (spec->d1_stack[spec->d1_level])
1085 tagDataRelease (spec);
1086 spec->d1_stack[spec->d1_level]->next = res;
1089 parent->child = res;
1090 spec->d1_stack[spec->d1_level] = res;
1091 spec->d1_stack[++(spec->d1_level)] = NULL;
1094 static void tagEnd (struct lexSpec *spec, int min_level,
1095 const char *tag, int len)
1097 tagStrip (&tag, &len);
1098 while (spec->d1_level > min_level)
1100 tagDataRelease (spec);
1102 if (spec->d1_level == 0)
1104 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1106 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1108 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1112 logf (LOG_DEBUG, "end tag (%d)", spec->d1_level);
1117 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1120 struct DFA_state *state = dfa->states[0];
1123 unsigned char c_prev = 0;
1124 int ptr = *pptr; /* current pointer */
1125 int start_ptr = *pptr; /* first char of match */
1126 int last_ptr = 0; /* last char of match */
1127 int last_rule = 0; /* rule number of current match */
1132 c = f_win_advance (spec, &ptr);
1133 if (ptr == F_WIN_EOF)
1150 *mptr = start_ptr; /* match starts here */
1151 *pptr = last_ptr; /* match end here (+1) */
1154 state = dfa->states[0];
1159 else if (c >= t->ch[0] && c <= t->ch[1])
1161 state = dfa->states[t->to];
1166 last_rule = state->rule_no;
1171 last_rule = state->rule_nno;
1183 static int execTok (struct lexSpec *spec, const char **src,
1184 const char **tokBuf, int *tokLen)
1186 const char *s = *src;
1188 while (*s == ' ' || *s == '\t')
1192 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1196 while (*s >= '0' && *s <= '9')
1197 n = n*10 + (*s++ -'0');
1198 if (spec->arg_no == 0)
1205 if (n >= spec->arg_no)
1207 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1211 else if (*s == '\"')
1214 while (*s && *s != '\"')
1216 *tokLen = s - *tokBuf;
1221 else if (*s == '\n' || *s == ';')
1229 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1232 *tokLen = s - *tokBuf;
1239 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1242 *tokLen = s - *tokBuf;
1248 static char *regxStrz (const char *src, int len, char *str)
1252 memcpy (str, src, len);
1258 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1259 int argc, char **argv)
1261 struct lexSpec *spec = (struct lexSpec *) clientData;
1264 if (!strcmp(argv[1], "record") && argc == 3)
1266 char *absynName = argv[2];
1270 logf (LOG_DEBUG, "begin record %s", absynName);
1272 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1273 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1278 res = data1_mk_node (spec->dh, spec->m);
1279 res->which = DATA1N_root;
1280 res->u.root.type = absynName;
1281 res->u.root.absyn = absyn;
1284 spec->d1_stack[spec->d1_level] = res;
1285 spec->d1_stack[++(spec->d1_level)] = NULL;
1288 else if (!strcmp(argv[1], "element") && argc == 3)
1290 tagBegin (spec, argv[2], strlen(argv[2]));
1292 else if (!strcmp (argv[1], "variant") && argc == 5)
1294 variantBegin (spec, argv[2], strlen(argv[2]),
1295 argv[3], strlen(argv[3]),
1296 argv[4], strlen(argv[4]));
1298 else if (!strcmp (argv[1], "context") && argc == 3)
1300 struct lexContext *lc = spec->context;
1302 logf (LOG_DEBUG, "begin context %s",argv[2]);
1304 while (lc && strcmp (argv[2], lc->name))
1308 spec->context_stack[++(spec->context_stack_top)] = lc;
1311 logf (LOG_WARN, "unknown context %s", argv[2]);
1318 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1319 int argc, char **argv)
1321 struct lexSpec *spec = (struct lexSpec *) clientData;
1325 if (!strcmp (argv[1], "record"))
1327 while (spec->d1_level)
1329 tagDataRelease (spec);
1333 logf (LOG_DEBUG, "end record");
1335 spec->stop_flag = 1;
1337 else if (!strcmp (argv[1], "element"))
1341 if (argc >= 3 && !strcmp(argv[2], "-record"))
1350 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1351 if (spec->d1_level == 0)
1354 logf (LOG_DEBUG, "end element end records");
1356 spec->stop_flag = 1;
1359 else if (!strcmp (argv[1], "context"))
1362 logf (LOG_DEBUG, "end context");
1364 if (spec->context_stack_top)
1365 (spec->context_stack_top)--;
1372 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1373 int argc, char **argv)
1377 const char *element = 0;
1378 struct lexSpec *spec = (struct lexSpec *) clientData;
1382 if (!strcmp("-text", argv[argi]))
1387 else if (!strcmp("-element", argv[argi]))
1391 element = argv[argi++];
1397 tagBegin (spec, element, strlen(element));
1401 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1405 tagEnd (spec, 1, NULL, 0);
1409 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1410 int argc, char **argv)
1412 struct lexSpec *spec = (struct lexSpec *) clientData;
1419 if (!strcmp("-offset", argv[argi]))
1424 offset = atoi(argv[argi]);
1433 no = atoi(argv[argi]);
1434 if (no >= spec->arg_no)
1435 no = spec->arg_no - 1;
1436 spec->ptr = spec->arg_start[no] + offset;
1440 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1444 for (i = 0; i < spec->arg_no; i++)
1446 char var_name[10], *var_buf;
1449 sprintf (var_name, "%d", i);
1450 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
1454 ch = var_buf[var_len];
1455 var_buf[var_len] = '\0';
1456 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1457 var_buf[var_len] = ch;
1460 #if HAVE_TCL_OBJECTS
1461 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1463 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1467 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1468 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1469 spec->tcl_interp->errorLine,
1470 spec->tcl_interp->result,
1471 err ? err : "[NO ERRORINFO]");
1477 static void execCode (struct lexSpec *spec, struct regxCode *code)
1479 const char *s = code->str;
1481 const char *cmd_str;
1483 r = execTok (spec, &s, &cmd_str, &cmd_len);
1490 r = execTok (spec, &s, &cmd_str, &cmd_len);
1493 p = regxStrz (cmd_str, cmd_len, ptmp);
1494 if (!strcmp (p, "begin"))
1496 r = execTok (spec, &s, &cmd_str, &cmd_len);
1499 logf (LOG_WARN, "missing keyword after 'begin'");
1502 p = regxStrz (cmd_str, cmd_len, ptmp);
1503 if (!strcmp (p, "record"))
1505 r = execTok (spec, &s, &cmd_str, &cmd_len);
1508 if (spec->d1_level == 0)
1510 static char absynName[64];
1515 memcpy (absynName, cmd_str, cmd_len);
1516 absynName[cmd_len] = '\0';
1519 logf (LOG_DEBUG, "begin record %s", absynName);
1521 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1522 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1527 res = data1_mk_node (spec->dh, spec->m);
1528 res->which = DATA1N_root;
1529 res->u.root.type = absynName;
1530 res->u.root.absyn = absyn;
1533 spec->d1_stack[spec->d1_level] = res;
1534 spec->d1_stack[++(spec->d1_level)] = NULL;
1537 r = execTok (spec, &s, &cmd_str, &cmd_len);
1539 else if (!strcmp (p, "element"))
1541 r = execTok (spec, &s, &cmd_str, &cmd_len);
1544 tagBegin (spec, cmd_str, cmd_len);
1545 r = execTok (spec, &s, &cmd_str, &cmd_len);
1547 else if (!strcmp (p, "variant"))
1550 const char *class_str = NULL;
1552 const char *type_str = NULL;
1554 const char *value_str = NULL;
1555 r = execTok (spec, &s, &cmd_str, &cmd_len);
1558 class_str = cmd_str;
1559 class_len = cmd_len;
1560 r = execTok (spec, &s, &cmd_str, &cmd_len);
1566 r = execTok (spec, &s, &cmd_str, &cmd_len);
1569 value_str = cmd_str;
1570 value_len = cmd_len;
1572 variantBegin (spec, class_str, class_len,
1573 type_str, type_len, value_str, value_len);
1576 r = execTok (spec, &s, &cmd_str, &cmd_len);
1578 else if (!strcmp (p, "context"))
1582 struct lexContext *lc = spec->context;
1583 r = execTok (spec, &s, &cmd_str, &cmd_len);
1584 p = regxStrz (cmd_str, cmd_len, ptmp);
1586 logf (LOG_DEBUG, "begin context %s", p);
1588 while (lc && strcmp (p, lc->name))
1591 spec->context_stack[++(spec->context_stack_top)] = lc;
1593 logf (LOG_WARN, "unknown context %s", p);
1596 r = execTok (spec, &s, &cmd_str, &cmd_len);
1600 logf (LOG_WARN, "bad keyword '%s' after begin", p);
1603 else if (!strcmp (p, "end"))
1605 r = execTok (spec, &s, &cmd_str, &cmd_len);
1608 logf (LOG_WARN, "missing keyword after 'end'");
1611 p = regxStrz (cmd_str, cmd_len, ptmp);
1612 if (!strcmp (p, "record"))
1614 while (spec->d1_level)
1616 tagDataRelease (spec);
1619 r = execTok (spec, &s, &cmd_str, &cmd_len);
1621 logf (LOG_DEBUG, "end record");
1623 spec->stop_flag = 1;
1625 else if (!strcmp (p, "element"))
1628 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1630 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1635 tagEnd (spec, min_level, cmd_str, cmd_len);
1636 r = execTok (spec, &s, &cmd_str, &cmd_len);
1639 tagEnd (spec, min_level, NULL, 0);
1640 if (spec->d1_level == 0)
1643 logf (LOG_DEBUG, "end element end records");
1645 spec->stop_flag = 1;
1649 else if (!strcmp (p, "context"))
1652 logf (LOG_DEBUG, "end context");
1654 if (spec->context_stack_top)
1655 (spec->context_stack_top)--;
1656 r = execTok (spec, &s, &cmd_str, &cmd_len);
1659 logf (LOG_WARN, "bad keyword '%s' after end", p);
1661 else if (!strcmp (p, "data"))
1665 const char *element_str = NULL;
1667 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1669 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1671 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1673 r = execTok (spec, &s, &element_str, &element_len);
1678 logf (LOG_WARN, "bad data option: %.*s",
1683 logf (LOG_WARN, "missing data item after data");
1687 tagBegin (spec, element_str, element_len);
1690 execData (spec, cmd_str, cmd_len,textFlag);
1691 r = execTok (spec, &s, &cmd_str, &cmd_len);
1694 tagEnd (spec, 1, NULL, 0);
1696 else if (!strcmp (p, "unread"))
1699 r = execTok (spec, &s, &cmd_str, &cmd_len);
1700 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1702 r = execTok (spec, &s, &cmd_str, &cmd_len);
1705 logf (LOG_WARN, "missing number after -offset");
1708 p = regxStrz (cmd_str, cmd_len, ptmp);
1710 r = execTok (spec, &s, &cmd_str, &cmd_len);
1716 logf (LOG_WARN, "missing index after unread command");
1719 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1721 logf (LOG_WARN, "bad index after unread command");
1726 no = *cmd_str - '0';
1727 if (no >= spec->arg_no)
1728 no = spec->arg_no - 1;
1729 spec->ptr = spec->arg_start[no] + offset;
1731 r = execTok (spec, &s, &cmd_str, &cmd_len);
1733 else if (!strcmp (p, "context"))
1737 struct lexContext *lc = spec->context;
1738 r = execTok (spec, &s, &cmd_str, &cmd_len);
1739 p = regxStrz (cmd_str, cmd_len, ptmp);
1741 while (lc && strcmp (p, lc->name))
1744 spec->context_stack[spec->context_stack_top] = lc;
1746 logf (LOG_WARN, "unknown context %s", p);
1749 r = execTok (spec, &s, &cmd_str, &cmd_len);
1753 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1754 r = execTok (spec, &s, &cmd_str, &cmd_len);
1759 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1761 r = execTok (spec, &s, &cmd_str, &cmd_len);
/*
 * execAction: process the action(s) reachable from ap. The callers visible
 * in this file pass the head of an action list (a rule's actionList, or a
 * context's init/begin/end list).
 *
 *   spec      - lexer state; receives the argument boundary arrays and
 *               the argument count before code is executed.
 *   ap        - action to process.
 *   start_ptr - input position stored as the start of argument 0.
 *   pptr      - current input position; handed by address to tryMatch.
 *
 * NOTE(review): the statement that selects between pattern, code and end
 * handling, the loop over the list and the return statements are on lines
 * that are not part of this excerpt - confirm against the full file.
 */
1768 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1769 int start_ptr, int *pptr)
/* Argument 0 begins at the caller-supplied start position. */
1778 arg_start[0] = start_ptr;
/* Publish the local boundary arrays through spec. */
1780 spec->arg_start = arg_start;
1781 spec->arg_end = arg_end;
/* Pattern action, case where ap->u.pattern.body is set: when tryMatch
 * fails the slots are filled with F_WIN_EOF, otherwise they are filled
 * from sptr and *pptr (exact branch layout is not visible here). */
1788 if (ap->u.pattern.body)
1790 arg_start[arg_no] = *pptr;
1791 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1793 arg_end[arg_no] = F_WIN_EOF;
1795 arg_start[arg_no] = F_WIN_EOF;
1796 arg_end[arg_no] = F_WIN_EOF;
1801 arg_end[arg_no] = sptr;
1803 arg_start[arg_no] = sptr;
1804 arg_end[arg_no] = *pptr;
/* Pattern action, other case: the argument starts at the current
 * position and ends at *pptr after tryMatch. */
1809 arg_start[arg_no] = *pptr;
1810 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1812 if (sptr != arg_start[arg_no])
1814 arg_end[arg_no] = *pptr;
/* Code action: expose the argument count, then run the code through the
 * Tcl interpreter when one is attached to spec, else through execCode.
 * NOTE(review): two execCode calls appear; presumably they sit in
 * different preprocessor/else branches that are not visible here. */
1819 spec->arg_no = arg_no;
1822 if (spec->tcl_interp)
1823 execTcl(spec, ap->u.code);
1825 execCode (spec, ap->u.code);
1827 execCode (spec, ap->u.code);
/* spec->stop_flag is checked after the code has run. */
1830 if (spec->stop_flag)
/* The argument starts at the current position and its end is F_WIN_EOF. */
1834 arg_start[arg_no] = *pptr;
1835 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: log the rule number at debug level and hand the action list of
 * context->fastRule[ruleNo] to execAction, returning its result.
 * NOTE(review): the remaining arguments of the execAction call are on a
 * line that is not part of this excerpt.
 */
1844 static int execRule (struct lexSpec *spec, struct lexContext *context,
1845 int ruleNo, int start_ptr, int *pptr)
1848 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1850 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: scan the input from *ptr using the DFA of the context found at
 * the top of spec->context_stack. Characters are fetched one at a time
 * with f_win_advance. When a rule has matched, the unmatched characters
 * preceding the match (skip_ptr up to start_ptr) are passed to execDataP
 * and the rule is run through execRule.
 *
 * NOTE(review): the loop header, the return statements and several
 * conditions are on lines not included in this excerpt; the comments below
 * describe only what the visible statements do.
 */
1854 data1_node *lexNode (struct lexSpec *spec, int *ptr)
/* Start in state 0 of the active context's DFA. */
1856 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1857 struct DFA_state *state = context->dfa->states[0];
/* The previous character is initialised to a newline. */
1860 unsigned char c_prev = '\n';
1862 int last_rule = 0; /* rule number of current match */
1863 int last_ptr = *ptr; /* last char of match */
1864 int start_ptr = *ptr; /* first char of match */
1865 int skip_ptr = *ptr; /* first char of run */
1869 c = f_win_advance (spec, ptr);
1870 if (*ptr == F_WIN_EOF)
1872 /* end of file met */
1875 /* there was a match */
1876 if (skip_ptr < start_ptr)
1878 /* deal with chars that didn't match */
1881 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1882 execDataP (spec, buf, size, 0);
1884 /* restore pointer */
1887 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1889 /* restore skip pointer */
/* No match pending at end of file: flush the remaining run as data. */
1893 else if (skip_ptr < *ptr)
1895 /* deal with chars that didn't match */
1898 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1899 execDataP (spec, buf, size, 0);
1901 if (*ptr == F_WIN_EOF)
1908 { /* no transition for character c ... */
1911 if (skip_ptr < start_ptr)
1913 /* deal with chars that didn't match */
1916 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1917 execDataP (spec, buf, size, 0);
1919 /* restore pointer */
1921 if (!execRule (spec, context, last_rule, start_ptr, ptr))
/* When an end callback is installed and input is not exhausted, report
 * the current position to it. */
1923 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1926 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1928 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* Re-read the active context from the stack top after the rule ran. */
1932 context = spec->context_stack[spec->context_stack_top];
/* Restart the match bookkeeping at the current position. */
1935 last_ptr = start_ptr = *ptr;
/* Advancing start_ptr yields the character remembered as c_prev. */
1939 c_prev = f_win_advance (spec, &start_ptr);
1944 c_prev = f_win_advance (spec, &start_ptr);
/* Back to the DFA start state. */
1947 state = context->dfa->states[0];
/* c lies within the inclusive range [t->ch[0], t->ch[1]] of transition t. */
1950 else if (c >= t->ch[0] && c <= t->ch[1])
1951 { /* transition ... */
1952 state = context->dfa->states[t->to];
/* Remember the rule attached to the new state (rule_no, or rule_nno when
 * that branch is taken; the distinguishing condition is not visible). */
1957 last_rule = state->rule_no;
1960 else if (state->rule_nno)
1962 last_rule = state->rule_nno;
/*
 * lexRoot: run the lexer for one record using the context named
 * context_name and return the node at spec->d1_stack[0].
 *
 * Visible steps: clear stop_flag, reset the context stack to depth 0,
 * look the context up by name starting from spec->context (a warning is
 * logged with "cannot find context" when the lookup fails), install it as
 * the bottom stack entry, run its init and begin action lists, call
 * lexNode, release tag data while spec->d1_level is non-zero, and finally
 * run the end action list.
 *
 * NOTE(review): how `offset` initialises `ptr`, the lookup loop itself and
 * the early-return paths are on lines not included in this excerpt.
 */
1974 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1975 const char *context_name)
1977 struct lexContext *lt = spec->context;
1980 spec->stop_flag = 0;
1982 spec->context_stack_top = 0;
1985 if (!strcmp (lt->name, context_name))
1991 logf (LOG_WARN, "cannot find context %s", context_name);
1994 spec->context_stack[spec->context_stack_top] = lt;
1995 spec->d1_stack[spec->d1_level] = NULL;
2000 execAction (spec, lt->initActionList, ptr, &ptr);
2003 execAction (spec, lt->beginActionList, ptr, &ptr);
2004 lexNode (spec, &ptr);
2005 while (spec->d1_level)
2007 tagDataRelease (spec);
2010 execAction (spec, lt->endActionList, ptr, &ptr);
2011 return spec->d1_stack[0];
/*
 * grs_destroy: tear down the filter's client data. clientData is treated
 * as a struct lexSpecs and its spec member is destroyed via lexSpecDestroy.
 * NOTE(review): release of the lexSpecs container itself is not visible in
 * this excerpt.
 */
2014 void grs_destroy(void *clientData)
2016 struct lexSpecs *specs = (struct lexSpecs *) clientData;
2019 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocate the filter's client data, a struct lexSpecs, with
 * xmalloc.
 * NOTE(review): the initialisation of its members and the return statement
 * are on lines not included in this excerpt.
 */
2024 void *grs_init(void)
2026 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/*
 * grs_read_regx: read one record with the regx filter.
 *
 * The lexSpec kept in the client data is reused as long as its name equals
 * p->type; otherwise it is destroyed, recreated for p->type and loaded with
 * readFileSpec. The input window is then reset and bound to the caller's
 * read/seek/end callbacks and file handle, and lexing starts in the
 * context named "main" at p->offset.
 *
 * NOTE(review): the test on `res` that leads to the second lexSpecDestroy
 * (presumably the load-failure path) and its return are not visible here.
 */
2031 data1_node *grs_read_regx (struct grs_read_info *p)
2034 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2035 struct lexSpec **curLexSpec = &specs->spec;
2038 logf (LOG_DEBUG, "grs_read_regx");
/* (Re)build the spec when none is cached or the cached one is for another type. */
2040 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2043 lexSpecDestroy (curLexSpec);
2044 *curLexSpec = lexSpecCreate (p->type, p->dh);
2045 res = readFileSpec (*curLexSpec);
2048 lexSpecDestroy (curLexSpec);
2052 (*curLexSpec)->dh = p->dh;
/* Reset the input window and attach this record's I/O callbacks. */
2055 (*curLexSpec)->f_win_start = 0;
2056 (*curLexSpec)->f_win_end = 0;
2057 (*curLexSpec)->f_win_rf = p->readf;
2058 (*curLexSpec)->f_win_sf = p->seekf;
2059 (*curLexSpec)->f_win_fh = p->fh;
2060 (*curLexSpec)->f_win_ef = p->endf;
2061 (*curLexSpec)->f_win_size = 500000;
2063 (*curLexSpec)->m = p->mem;
2064 return lexRoot (*curLexSpec, p->offset, "main");
/*
 * Record-type descriptor for the regx filter and the exported handle that
 * points at it. The initializer entries of regx_type are on lines not
 * included in this excerpt.
 */
2067 static struct recTypeGrs regx_type = {
2074 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: read one record with the Tcl-enabled variant of the filter.
 *
 * Same flow as the regx reader, except that a freshly created lexSpec also
 * gets a new Tcl interpreter in which the commands "begin", "end", "data"
 * and "unread" are registered with the lexSpec as their client data.
 * The cached lexSpec is reused as long as its name equals p->type.
 *
 * NOTE(review): the trailing arguments of the "unread" registration and
 * the test on `res` that leads to the second lexSpecDestroy are on lines
 * not included in this excerpt.
 */
2077 data1_node *grs_read_tcl (struct grs_read_info *p)
2080 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2081 struct lexSpec **curLexSpec = &specs->spec;
2084 logf (LOG_DEBUG, "grs_read_tcl");
/* (Re)build the spec when none is cached or the cached one is for another type. */
2086 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2088 Tcl_Interp *tcl_interp;
2090 lexSpecDestroy (curLexSpec);
2091 *curLexSpec = lexSpecCreate (p->type, p->dh);
/* The interpreter is stored in the spec so execAction can route code to execTcl. */
2092 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
2093 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2094 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2095 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
2096 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2098 res = readFileSpec (*curLexSpec);
2101 lexSpecDestroy (curLexSpec);
2105 (*curLexSpec)->dh = p->dh;
/* Reset the input window and attach this record's I/O callbacks. */
2108 (*curLexSpec)->f_win_start = 0;
2109 (*curLexSpec)->f_win_end = 0;
2110 (*curLexSpec)->f_win_rf = p->readf;
2111 (*curLexSpec)->f_win_sf = p->seekf;
2112 (*curLexSpec)->f_win_fh = p->fh;
2113 (*curLexSpec)->f_win_ef = p->endf;
2114 (*curLexSpec)->f_win_size = 500000;
2116 (*curLexSpec)->m = p->mem;
2117 return lexRoot (*curLexSpec, p->offset, "main");
/*
 * Record-type descriptor for the Tcl filter and the exported handle that
 * points at it. The initializer entries of tcl_type are on lines not
 * included in this excerpt.
 */
2120 static struct recTypeGrs tcl_type = {
2127 RecTypeGrs recTypeGrs_tcl = &tcl_type;