2 * Copyright (C) 1994-1999, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.33 1999-11-30 13:48:04 adam
8 * Improved installation. Updated for inclusion of YAZ header files.
10 * Revision 1.32 1999/09/07 07:19:21 adam
11 * Work on character mapping. Implemented replace rules.
13 * Revision 1.31 1999/07/14 13:05:29 adam
14 * Tcl filter works with objects when TCL is version 8 or later; filter
15 * works with strings otherwise (slow).
17 * Revision 1.30 1999/07/14 10:55:28 adam
20 * Revision 1.29 1999/07/12 07:27:54 adam
21 * Improved speed of Tcl processing. Fixed one memory leak.
23 * Revision 1.28 1999/07/06 12:26:04 adam
24 * Fixed filters so that MS-DOS CR is ignored.
26 * Revision 1.27 1999/06/28 13:25:40 quinn
27 * Improved diagnostics for Tcl
29 * Revision 1.26 1999/05/26 07:49:14 adam
32 * Revision 1.25 1999/05/25 12:33:32 adam
33 * Fixed bug in Tcl filter.
35 * Revision 1.24 1999/05/21 11:08:46 adam
36 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
37 * script so that it reads uninstalled Tcl source.
39 * Revision 1.23 1999/05/20 12:57:18 adam
40 * Implemented TCL filter. Updated recctrl system.
42 * Revision 1.22 1998/11/03 16:07:13 adam
45 * Revision 1.21 1998/11/03 15:43:39 adam
46 * Fixed bug introduced by previous commit.
48 * Revision 1.20 1998/11/03 14:51:28 adam
49 * Changed code so that it creates as few data1 nodes as possible.
51 * Revision 1.19 1998/11/03 10:22:39 adam
52 * Fixed memory leak that could occur when large data1 nodes were
53 * concatenated. Data-type data1_nodes may have multiple nodes.
55 * Revision 1.18 1998/10/15 13:11:47 adam
56 * Added support for option -record for "end element". When specified
57 * end element will mark end-of-record when at outer-level.
59 * Revision 1.17 1998/07/01 10:13:51 adam
62 * Revision 1.16 1998/06/30 15:15:09 adam
63 * Tags are trimmed: white space is removed before and after the tag.
65 * Revision 1.15 1998/06/30 12:55:45 adam
68 * Revision 1.14 1998/03/05 08:41:00 adam
69 * Implemented rule contexts.
71 * Revision 1.13 1997/12/12 06:33:58 adam
72 * Fixed bug that showed up when multiple filters were used.
73 * Made one routine thread-safe.
75 * Revision 1.12 1997/11/18 10:03:24 adam
76 * Member num_children removed from data1_node.
78 * Revision 1.11 1997/11/06 11:41:01 adam
79 * Implemented "begin variant" for the sgml.regx filter.
81 * Revision 1.10 1997/10/31 12:36:12 adam
82 * Minor change that avoids compiler warning.
84 * Revision 1.9 1997/09/29 09:02:49 adam
85 * Fixed small bug (introduced by previous commit).
87 * Revision 1.8 1997/09/17 12:19:22 adam
88 * Zebra version corresponds to YAZ version 1.4.
89 * Changed Zebra server so that it doesn't depend on global common_resource.
91 * Revision 1.7 1997/07/15 16:33:07 adam
92 * Check for zero length in execData.
94 * Revision 1.6 1997/02/24 10:41:51 adam
95 * Cleanup of code and commented out the "end element-end-record" code.
97 * Revision 1.5 1997/02/19 16:22:33 adam
98 * Fixed "end element" to terminate record in outer-most level.
100 * Revision 1.4 1997/02/12 20:42:58 adam
101 * Changed some log messages.
103 * Revision 1.3 1996/11/08 14:05:33 adam
104 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
106 * Revision 1.2 1996/10/29 14:02:09 adam
107 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
108 * data1_get_tabpath is used.
110 * Revision 1.1 1996/10/11 10:57:30 adam
111 * New module recctrl. Used to manage records (extract/retrieval).
113 * Revision 1.24 1996/06/17 14:25:31 adam
114 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
116 * Revision 1.23 1996/06/04 10:19:00 adam
117 * Minor changes - removed include of ctype.h.
119 * Revision 1.22 1996/06/03 15:23:13 adam
120 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
122 * Revision 1.21 1996/05/14 16:58:38 adam
125 * Revision 1.20 1996/05/01 13:46:36 adam
126 * First work on multiple records in one file.
127 * New option, -offset, to the "unread" command in the filter module.
129 * Revision 1.19 1996/02/12 16:18:20 adam
130 * Yet another bug fix in implementation of unread command.
132 * Revision 1.18 1996/02/12 16:07:54 adam
133 * Bug fix in new unread command.
135 * Revision 1.17 1996/02/12 15:56:11 adam
136 * New code command: unread.
138 * Revision 1.16 1996/01/17 14:57:51 adam
139 * Prototype changed for reader functions in extract/retrieve. File
140 * is identified by 'void *' instead of 'int'.
142 * Revision 1.15 1996/01/08 19:15:47 adam
143 * New input filter that works!
145 * Revision 1.14 1996/01/08 09:10:38 adam
146 * Yet another complete rework on this module.
148 * Revision 1.13 1995/12/15 17:21:50 adam
149 * This version is able to set data.formatted_text in data1-nodes.
151 * Revision 1.12 1995/12/15 16:20:10 adam
152 * The filter files (*.flt) are read from the path given by data1_tabpath.
154 * Revision 1.11 1995/12/15 12:35:16 adam
157 * Revision 1.10 1995/12/15 10:35:36 adam
160 * Revision 1.9 1995/12/14 16:38:48 adam
161 * Completely new attempt to make regular expression parsing.
163 * Revision 1.8 1995/12/13 17:16:59 adam
166 * Revision 1.7 1995/12/13 16:51:58 adam
167 * Modified to set last_child in data1_nodes.
168 * Uses destroy handler to free up data text nodes.
170 * Revision 1.6 1995/12/13 13:45:37 quinn
171 * Changed data1 to use nmem.
173 * Revision 1.5 1995/12/11 09:12:52 adam
174 * The rec_get function returns NULL if record doesn't exist - will
175 * happen in the server if the result set records have been deleted since
176 * the creation of the set (i.e. the search).
177 * The server saves a result temporarily if it is 'volatile', i.e. the
178 * set is register dependent.
180 * Revision 1.4 1995/12/05 16:57:40 adam
181 * More work on regular patterns.
183 * Revision 1.3 1995/12/05 09:37:09 adam
184 * One malloc was renamed to xmalloc.
186 * Revision 1.2 1995/12/04 17:59:24 adam
187 * More work on regular expression conversion.
189 * Revision 1.1 1995/12/04 14:25:30 adam
190 * Started work on regular expression parsed input to structured records.
198 #include <yaz/tpath.h>
199 #include <zebrautl.h>
206 #if MAJOR_VERSION >= 8
207 #define HAVE_TCL_OBJECTS
213 #define F_WIN_EOF 2000000000
217 #define REGX_PATTERN 1
222 #define REGX_CONTEXT 6
232 struct lexRuleAction {
236 struct DFA *dfa; /* REGX_PATTERN */
239 struct regxCode *code; /* REGX_CODE */
241 struct lexRuleAction *next;
246 struct lexRuleAction *actionList;
250 struct lexRuleInfo info;
251 struct lexRule *next;
257 struct lexRule *rules;
258 struct lexRuleInfo **fastRule;
262 struct lexRuleAction *beginActionList;
263 struct lexRuleAction *endActionList;
264 struct lexRuleAction *initActionList;
265 struct lexContext *next;
268 struct lexConcatBuf {
275 struct lexContext *context;
277 struct lexContext **context_stack;
278 int context_stack_size;
279 int context_stack_top;
285 Tcl_Interp *tcl_interp;
288 void (*f_win_ef)(void *, off_t);
290 int f_win_start; /* first byte of buffer is this file offset */
291 int f_win_end; /* last byte of buffer is this offset - 1 */
292 int f_win_size; /* size of buffer */
293 char *f_win_buf; /* buffer itself */
294 int (*f_win_rf)(void *, char *, size_t);
295 off_t (*f_win_sf)(void *, off_t);
297 struct lexConcatBuf *concatBuf;
299 data1_node **d1_stack;
310 struct lexSpec *spec;
/*
 * f_win_get: return a pointer to the buffered file data that begins at
 * file offset start_pos, and store in *size how many bytes are usable
 * there (never more than end_pos - start_pos).
 *
 * Three cases are handled:
 *  - the requested range lies inside the current window: a pointer into
 *    the existing buffer is returned without any I/O;
 *  - the range starts outside the window: the file is repositioned with
 *    f_win_sf, the buffer is allocated on first use and filled with
 *    f_win_rf;
 *  - the range starts inside the window but extends past it: the bytes
 *    from start_pos onwards are moved to the front of the buffer and the
 *    rest of the buffer is refilled with f_win_rf.
 *
 * The returned pointer refers to spec->f_win_buf, so a later call may
 * overwrite or move the data it points to.
 */
313 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
316 int i, r, off = start_pos - spec->f_win_start;
/* case 1: everything requested is already buffered */
318 if (off >= 0 && end_pos <= spec->f_win_end)
320 *size = end_pos - start_pos;
321 return spec->f_win_buf + off;
/* case 2: start_pos is not in the window at all - seek and refill */
323 if (off < 0 || start_pos >= spec->f_win_end)
325 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
326 spec->f_win_start = start_pos;
328 if (!spec->f_win_buf)
329 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
330 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
332 spec->f_win_end = spec->f_win_start + *size;
/* never report more than the caller asked for */
334 if (*size > end_pos - start_pos)
335 *size = end_pos - start_pos;
336 return spec->f_win_buf;
/* case 3: partial overlap - slide the still valid tail to the front */
338 for (i = 0; i<spec->f_win_end - start_pos; i++)
339 spec->f_win_buf[i] = spec->f_win_buf[i + off];
340 r = (*spec->f_win_rf)(spec->f_win_fh,
342 spec->f_win_size - i);
343 spec->f_win_start = start_pos;
344 spec->f_win_end += r;
346 if (*size > end_pos - start_pos)
347 *size = end_pos - start_pos;
348 return spec->f_win_buf;
/*
 * f_win_advance: fetch the character at file offset *pos.
 *
 * When *pos lies inside the buffered window the byte is returned directly
 * and *pos is incremented.  Otherwise one byte is requested through
 * f_win_get.  *pos == F_WIN_EOF is tested separately.
 * NOTE(review): the actions taken for the F_WIN_EOF test and after the
 * f_win_get call are not visible here - confirm.
 */
351 static int f_win_advance (struct lexSpec *spec, int *pos)
356 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
357 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
358 if (*pos == F_WIN_EOF)
360 buf = f_win_get (spec, *pos, *pos+1, &size);
/*
 * regxCodeDel: dispose of the regxCode referenced by *pp.
 * The reference taken on the Tcl object is dropped here.
 * NOTE(review): release of the other members and of the struct itself
 * is not visible here - confirm.
 */
370 static void regxCodeDel (struct regxCode **pp)
372 struct regxCode *p = *pp;
377 Tcl_DecrRefCount (p->tcl_obj);
/*
 * regxCodeMk: build a regxCode from the len bytes at buf.
 *
 * A private copy of the bytes is stored in p->str (len+1 bytes are
 * allocated) and a Tcl string object is created from the same bytes;
 * its reference count is incremented so that it stays alive until
 * Tcl_DecrRefCount is called on it.
 * NOTE(review): the terminator write and the store into *pp are not
 * visible here - confirm.
 */
385 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
389 p = (struct regxCode *) xmalloc (sizeof(*p));
390 p->str = (char *) xmalloc (len+1);
391 memcpy (p->str, buf, len);
394 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
396 Tcl_IncrRefCount (p->tcl_obj);
/*
 * lexSpecDFA: return a DFA whose pattern-parser character map has been
 * adjusted for filter specifications: the entries for space and tab are
 * removed and an entry for '/' with value 0 is added.
 * NOTE(review): presumably this makes blanks literal and lets '/' end a
 * pattern - verify against the dfa module.
 */
401 static struct DFA *lexSpecDFA (void)
406 dfa_parse_cmap_del (dfa, ' ');
407 dfa_parse_cmap_del (dfa, '\t');
408 dfa_parse_cmap_add (dfa, '/', 0);
/*
 * actionListDel: walk the action list starting at *rap and release what
 * each entry owns: the DFA of a pattern action (dfa_delete) or the code
 * of a code action (regxCodeDel).  ra1 holds the successor while the
 * current entry is being torn down.
 */
412 static void actionListDel (struct lexRuleAction **rap)
414 struct lexRuleAction *ra1, *ra;
416 for (ra = *rap; ra; ra = ra1)
422 dfa_delete (&ra->u.pattern.dfa);
425 regxCodeDel (&ra->u.code);
/*
 * lexContextCreate: allocate a lexing context called name.
 * The name is duplicated, a fresh DFA is obtained from lexSpecDFA and
 * the begin/end/init action lists start out empty.
 */
433 static struct lexContext *lexContextCreate (const char *name)
435 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
437 p->name = xstrdup (name);
440 p->dfa = lexSpecDFA ();
443 p->beginActionList = NULL;
444 p->endActionList = NULL;
445 p->initActionList = NULL;
/*
 * lexContextDestroy: release a context created by lexContextCreate:
 * its DFA, the action list of every rule, and the begin, end and init
 * action lists.
 */
450 static void lexContextDestroy (struct lexContext *p)
452 struct lexRule *rp, *rp1;
454 dfa_delete (&p->dfa);
/* rp1 keeps the next rule while the current one is processed */
456 for (rp = p->rules; rp; rp = rp1)
459 actionListDel (&rp->info.actionList);
462 actionListDel (&p->beginActionList);
463 actionListDel (&p->endActionList);
464 actionListDel (&p->initActionList);
/*
 * lexSpecCreate: allocate a filter specification called name.
 *
 * The name is copied, a context stack with room for 100 entries is
 * allocated, and two arrays with maxLevel entries each are set up: the
 * per-level concatenation buffers (initially empty) and the data1 node
 * stack.
 */
469 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
474 p = (struct lexSpec *) xmalloc (sizeof(*p));
475 p->name = (char *) xmalloc (strlen(name)+1);
476 strcpy (p->name, name);
483 p->context_stack_size = 100;
484 p->context_stack = (struct lexContext **)
485 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
489 p->concatBuf = (struct lexConcatBuf *)
490 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
/* buffers are allocated lazily; start with none */
491 for (i = 0; i < p->maxLevel; i++)
493 p->concatBuf[i].max = 0;
494 p->concatBuf[i].buf = 0;
496 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/*
 * lexSpecDestroy: tear down a specification.
 * Frees every per-level concatenation buffer and the array holding them,
 * destroys each context in the list, deletes the Tcl interpreter and
 * frees the file window buffer and the context stack.
 */
501 static void lexSpecDestroy (struct lexSpec **pp)
504 struct lexContext *lt;
512 for (i = 0; i < p->maxLevel; i++)
513 xfree (p->concatBuf[i].buf);
514 xfree (p->concatBuf);
/* remember the successor before the context is destroyed */
519 struct lexContext *lt_next = lt->next;
520 lexContextDestroy (lt);
525 Tcl_DeleteInterp (p->tcl_interp);
528 xfree (p->f_win_buf);
529 xfree (p->context_stack);
/*
 * readParseToken: scan the next token of a filter specification line
 * starting at *cpp.
 *
 * Leading blanks, tabs, newlines and carriage returns are skipped.
 * Keyword letters are collected in cmd with upper case folded to lower
 * case, and the result is compared against "begin", "end", "body",
 * "context" and "init".  Unexpected characters and unknown keywords are
 * reported with LOG_WARN.
 * NOTE(review): the return value for each token kind and the use of *len
 * are not visible here - confirm.
 */
535 static int readParseToken (const char **cpp, int *len)
537 const char *cp = *cpp;
541 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
570 if (*cp >= 'a' && *cp <= 'z')
572 else if (*cp >= 'A' && *cp <= 'Z')
573 cmd[i] = *cp + 'a' - 'A';
/* guards the cmd index against the size of the buffer */
576 if (i < (int) sizeof(cmd)-2)
583 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
/* move on to the next white space character */
585 while (*cp && *cp != ' ' && *cp != '\t' &&
586 *cp != '\n' && *cp != '\r')
592 if (!strcmp (cmd, "begin"))
594 else if (!strcmp (cmd, "end"))
596 else if (!strcmp (cmd, "body"))
598 else if (!strcmp (cmd, "context"))
600 else if (!strcmp (cmd, "init"))
604 logf (LOG_WARN, "bad command %s", cmd);
/*
 * actionListMk: parse the action part of a rule from s and build the
 * corresponding list of lexRuleAction entries at *ap.
 *
 * Tokens are read with readParseToken until it returns 0.  A code token
 * becomes a code action (regxCodeMk); a pattern token gets its own DFA,
 * which is parsed with dfa_parse and finished with dfa_mkstate.  BEGIN
 * and INIT are rejected with a warning.
 */
610 static int actionListMk (struct lexSpec *spec, const char *s,
611 struct lexRuleAction **ap)
617 while ((tok = readParseToken (&s, &len)))
625 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
627 regxCodeMk (&(*ap)->u.code, s, len);
631 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
633 (*ap)->u.pattern.body = bodyMark;
635 (*ap)->u.pattern.dfa = lexSpecDFA ();
637 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
/* the text between s0 and s is echoed in the warning */
642 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
645 dfa_mkstate ((*ap)->u.pattern.dfa);
649 logf (LOG_WARN, "cannot use BEGIN here");
652 logf (LOG_WARN, "cannot use INIT here");
655 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/*
 * readOneSpec: process one logical specification line s.
 *
 * The first token decides what the line is:
 *  - CONTEXT name: a new context is created and linked in front of
 *    spec->context;
 *  - begin / end / init actions: the corresponding action list of the
 *    current context is deleted and rebuilt from s;
 *  - otherwise a pattern rule: the pattern is added to the DFA of the
 *    current context, and a lexRule with the next rule number is pushed
 *    on the front of the rule list and given the actions that follow.
 * A context called "main" is created when none exists yet.
 */
665 int readOneSpec (struct lexSpec *spec, const char *s)
669 struct lexContext *lc;
671 tok = readParseToken (&s, &len);
672 if (tok == REGX_CONTEXT)
674 char context_name[32];
675 tok = readParseToken (&s, &len);
676 if (tok != REGX_CODE)
678 logf (LOG_WARN, "missing name after CONTEXT keyword");
/* NOTE(review): context_name holds 32 bytes; no clamp of len is visible
 * before this copy - confirm that len < 32 is enforced. */
683 memcpy (context_name, s, len);
684 context_name[len] = '\0';
685 lc = lexContextCreate (context_name);
686 lc->next = spec->context;
691 spec->context = lexContextCreate ("main");
/* each of begin/end/init replaces any list defined earlier */
696 actionListDel (&spec->context->beginActionList);
697 actionListMk (spec, s, &spec->context->beginActionList);
700 actionListDel (&spec->context->endActionList);
701 actionListMk (spec, s, &spec->context->endActionList);
704 actionListDel (&spec->context->initActionList);
705 actionListMk (spec, s, &spec->context->initActionList);
709 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
711 r = dfa_parse (spec->context->dfa, &s);
714 logf (LOG_WARN, "regular expression error. r=%d", r);
719 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* new rules go to the front of the list; info.no records their order */
723 rp = (struct lexRule *) xmalloc (sizeof(*rp));
724 rp->info.no = spec->context->ruleNo++;
725 rp->next = spec->context->rules;
726 spec->context->rules = rp;
727 actionListMk (spec, s, &rp->info.actionList);
/*
 * readFileSpec: load the filter specification file for spec.
 *
 * The file is searched for in the data1 tab path.  When a Tcl
 * interpreter is present "<name>.tflt" is tried; "<name>.flt" is the
 * other candidate.  The file is read character by character, lines are
 * gathered in a WRBUF and handed to readOneSpec.  Afterwards every
 * context gets a fastRule table that maps a rule number to its
 * lexRuleInfo, and its DFA is finished with dfa_mkstate.
 */
732 int readFileSpec (struct lexSpec *spec)
734 struct lexContext *lc;
735 int c, i, errors = 0;
741 if (spec->tcl_interp)
/* NOTE(review): unbounded sprintf - confirm fname can hold spec->name
 * plus the suffix. */
743 sprintf (fname, "%s.tflt", spec->name);
744 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
749 sprintf (fname, "%s.flt", spec->name);
750 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
754 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
757 logf (LOG_LOG, "reading regx filter %s", fname);
759 if (spec->tcl_interp)
760 logf (LOG_LOG, "Tcl enabled");
762 lineBuf = wrbuf_alloc();
767 wrbuf_rewind (lineBuf);
/* a line starting with '#' or white space is consumed up to its end */
768 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
770 while (c != '\n' && c != EOF)
783 wrbuf_putc(lineBuf, c);
791 if (c != ' ' && c != '\t')
/* terminate the collected text so it can be parsed as a C string */
796 wrbuf_putc(lineBuf, '\0');
797 readOneSpec (spec, wrbuf_buf(lineBuf));
798 spec->lineNo += addLine;
802 wrbuf_free(lineBuf, 1);
807 debug_dfa_followpos = 1;
/* build the rule-number lookup table of every context */
810 for (lc = spec->context; lc; lc = lc->next)
813 lc->fastRule = (struct lexRuleInfo **)
814 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
815 for (i = 0; i < lc->ruleNo; i++)
816 lc->fastRule[i] = NULL;
817 for (rp = lc->rules; rp; rp = rp->next)
818 lc->fastRule[rp->info.no] = &rp->info;
819 dfa_mkstate (lc->dfa);
828 static struct lexSpec *curLexSpec = NULL;
/*
 * execData: add elen bytes of text at ebuf to the record being built.
 *
 * The text belongs to the node at the current stack level.  If that
 * node is already a data node its current length is taken as the append
 * position; otherwise a new text data node is created below the parent
 * at level d1_level - 1 and linked in as last child / next sibling.
 * The bytes themselves are accumulated in the concatenation buffer of
 * the current level, which grows by the required amount plus 256 bytes
 * whenever it is too small; only the length is updated in the node.
 */
831 static void execData (struct lexSpec *spec,
832 const char *ebuf, int elen, int formatted_text)
834 struct data1_node *res, *parent;
837 if (elen == 0) /* shouldn't happen, but it does! */
841 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
842 ebuf, 15, ebuf + elen-15);
844 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
846 logf (LOG_DEBUG, "data (%d bytes)", elen);
849 if (spec->d1_level <= 1)
852 parent = spec->d1_stack[spec->d1_level -1];
/* continue an existing data node: remember how much it holds already */
855 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
856 org_len = res->u.data.len;
861 res = data1_mk_node (spec->dh, spec->m);
862 res->parent = parent;
863 res->which = DATA1N_data;
864 res->u.data.what = DATA1I_text;
866 res->u.data.formatted_text = formatted_text;
868 if (elen > DATA1_LOCALDATA)
869 res->u.data.data = nmem_malloc (spec->m, elen);
871 res->u.data.data = res->lbuf;
872 memcpy (res->u.data.data, ebuf, elen);
874 res->u.data.data = 0;
876 res->root = parent->root;
878 parent->last_child = res;
879 if (spec->d1_stack[spec->d1_level])
880 spec->d1_stack[spec->d1_level]->next = res;
883 spec->d1_stack[spec->d1_level] = res;
/* grow the per-level buffer when the appended text would not fit */
885 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
887 char *old_buf, *new_buf;
889 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
890 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
891 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
893 memcpy (new_buf, old_buf, org_len);
896 spec->concatBuf[spec->d1_level].buf = new_buf;
898 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
899 res->u.data.len += elen;
/*
 * execDataP: pass a run of text on to execData with the same arguments.
 */
902 static void execDataP (struct lexSpec *spec,
903 const char *ebuf, int elen, int formatted_text)
905 execData (spec, ebuf, elen, formatted_text);
/*
 * tagDataRelease: finalize the text data node at the current level.
 *
 * If the node on top of the stack is a text data node, its bytes are
 * copied out of the concatenation buffer of that level into storage
 * owned by the node: the node's local buffer when the length is at most
 * DATA1_LOCALDATA, nmem memory otherwise.  The assertions require that
 * the data pointer has not been set before and that the node is not
 * empty.
 */
908 static void tagDataRelease (struct lexSpec *spec)
912 if ((res = spec->d1_stack[spec->d1_level]) &&
913 res->which == DATA1N_data &&
914 res->u.data.what == DATA1I_text)
916 assert (!res->u.data.data);
917 assert (res->u.data.len > 0);
918 if (res->u.data.len > DATA1_LOCALDATA)
919 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
921 res->u.data.data = res->lbuf;
922 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
/*
 * variantBegin: open a variant node for the given class, type and value.
 *
 * class_str and type_str are copied into local buffers (truncated to
 * DATA1_MAX_SYMBOL-1 characters) and used to look up the variant type in
 * the variant set of the record's abstract syntax.  If the parent is not
 * a variant node yet, an empty variant node (type and value 0) is
 * inserted first.  A variant node carrying the type and a copy of the
 * value (truncated to DATA1_LOCALDATA-1 characters) is then created,
 * linked below its parent and pushed on the node stack.
 *
 * NOTE(review): parent is initialized from d1_stack[d1_level - 1] before
 * the d1_level == 0 test below, so the stack is read at index -1 when no
 * record has been started - the lookup should follow the test.
 */
927 static void variantBegin (struct lexSpec *spec,
928 const char *class_str, int class_len,
929 const char *type_str, int type_len,
930 const char *value_str, int value_len)
932 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
933 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
938 if (spec->d1_level == 0)
940 logf (LOG_WARN, "in variant begin. No record type defined");
/* bounded, terminated copies of the class and type names */
943 if (class_len >= DATA1_MAX_SYMBOL)
944 class_len = DATA1_MAX_SYMBOL-1;
945 memcpy (tclass, class_str, class_len);
946 tclass[class_len] = '\0';
948 if (type_len >= DATA1_MAX_SYMBOL)
949 type_len = DATA1_MAX_SYMBOL-1;
950 memcpy (ttype, type_str, type_len);
951 ttype[type_len] = '\0';
954 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype,
959 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* first variant below a non-variant parent: insert an empty variant node */
963 if (parent->which != DATA1N_variant)
965 res = data1_mk_node (spec->dh, spec->m);
966 res->parent = parent;
967 res->which = DATA1N_variant;
968 res->u.variant.type = 0;
969 res->u.variant.value = 0;
970 res->root = parent->root;
972 parent->last_child = res;
973 if (spec->d1_stack[spec->d1_level])
975 tagDataRelease (spec);
976 spec->d1_stack[spec->d1_level]->next = res;
980 spec->d1_stack[spec->d1_level] = res;
981 spec->d1_stack[++(spec->d1_level)] = NULL;
/* scan the enclosing variant nodes for one of the same type */
983 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
984 if (spec->d1_stack[i]->u.variant.type == tp)
991 logf (LOG_DEBUG, "variant node (%d)", spec->d1_level);
993 parent = spec->d1_stack[spec->d1_level-1];
994 res = data1_mk_node (spec->dh, spec->m);
995 res->parent = parent;
996 res->which = DATA1N_variant;
997 res->root = parent->root;
998 res->u.variant.type = tp;
/* the value lives in the node's local buffer */
1000 if (value_len >= DATA1_LOCALDATA)
1001 value_len =DATA1_LOCALDATA-1;
1002 memcpy (res->lbuf, value_str, value_len);
1003 res->lbuf[value_len] = '\0';
1005 res->u.variant.value = res->lbuf;
1007 parent->last_child = res;
1008 if (spec->d1_stack[spec->d1_level])
1010 tagDataRelease (spec);
1011 spec->d1_stack[spec->d1_level]->next = res;
1014 parent->child = res;
1015 spec->d1_stack[spec->d1_level] = res;
1016 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagStrip: trim white space around a tag given as pointer and length.
 * The first loop scans backwards over trailing white space, the second
 * forwards over leading white space.
 * NOTE(review): the updates of *tag and *len are not visible here -
 * confirm.
 * NOTE(review): isspace receives a plain char; for negative values that
 * is undefined - a cast to unsigned char would be safer.
 */
1019 static void tagStrip (const char **tag, int *len)
1023 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
1026 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/*
 * tagBegin: open an element with the given tag name (len bytes at tag).
 *
 * Requires that a record has been started (d1_level > 0); otherwise a
 * warning is logged.  The tag is trimmed, a tag node is created below
 * the node at level d1_level - 1, the name is stored in the node's local
 * buffer or, when it is too long for that, in nmem memory, and the
 * element definition is looked up by tag name in the abstract syntax of
 * the root node.  Pending text at the current level is released before
 * the new node is linked in; the node is then pushed on the stack.
 */
1032 static void tagBegin (struct lexSpec *spec,
1033 const char *tag, int len)
1035 struct data1_node *parent;
1036 data1_element *elem = NULL;
1039 data1_element *e = NULL;
1042 if (spec->d1_level == 0)
1044 logf (LOG_WARN, "in element begin. No record type defined");
1047 tagStrip (&tag, &len);
1049 parent = spec->d1_stack[spec->d1_level -1];
1050 partag = get_parent_tag(spec->dh, parent);
1052 res = data1_mk_node (spec->dh, spec->m);
1053 res->parent = parent;
1054 res->which = DATA1N_tag;
1055 res->u.tag.get_bytes = -1;
/* len+1 bytes are needed because the name is stored terminated */
1057 if (len >= DATA1_LOCALDATA)
1058 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
1060 res->u.tag.tag = res->lbuf;
1062 memcpy (res->u.tag.tag, tag, len);
1063 res->u.tag.tag[len] = '\0';
1066 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1068 if (parent->which == DATA1N_variant)
1071 if (!(e = partag->u.tag.element))
1074 elem = data1_getelementbytagname (spec->dh,
1075 spec->d1_stack[0]->u.root.absyn,
1077 res->u.tag.element = elem;
1078 res->u.tag.node_selected = 0;
1079 res->u.tag.make_variantlist = 0;
1080 res->u.tag.no_data_requested = 0;
1081 res->root = parent->root;
1083 parent->last_child = res;
1084 if (spec->d1_stack[spec->d1_level])
1086 tagDataRelease (spec);
1087 spec->d1_stack[spec->d1_level]->next = res;
1090 parent->child = res;
/* the new node becomes the current one; the level below starts empty */
1091 spec->d1_stack[spec->d1_level] = res;
1092 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagEnd: close elements down to, but not below, min_level.
 *
 * The tag is trimmed first.  While d1_level is above min_level, pending
 * text at the current level is released and the node there is compared
 * with the requested tag: it must be a tag node whose name has exactly
 * len characters equal to tag.
 * NOTE(review): how d1_level is decremented and when the loop stops on
 * a match is not visible here - confirm.
 */
1095 static void tagEnd (struct lexSpec *spec, int min_level,
1096 const char *tag, int len)
1098 tagStrip (&tag, &len);
1099 while (spec->d1_level > min_level)
1101 tagDataRelease (spec);
1103 if (spec->d1_level == 0)
1105 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1107 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1109 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1113 logf (LOG_DEBUG, "end tag (%d)", spec->d1_level);
/*
 * tryMatch: run the given DFA over the input starting at *pptr.
 *
 * Characters are fetched with f_win_advance and the automaton follows
 * the transition whose character range contains the character.  The
 * rule number and end position of the most recent accepting state are
 * remembered.  When a match is reported, *mptr receives the start of the
 * match and *pptr the position just after it.
 */
1118 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1121 struct DFA_state *state = dfa->states[0];
1124 unsigned char c_prev = 0;
1125 int ptr = *pptr; /* current pointer */
1126 int start_ptr = *pptr; /* first char of match */
1127 int last_ptr = 0; /* last char of match */
1128 int last_rule = 0; /* rule number of current match */
1133 c = f_win_advance (spec, &ptr);
1134 if (ptr == F_WIN_EOF)
1151 *mptr = start_ptr; /* match starts here */
1152 *pptr = last_ptr; /* match end here (+1) */
/* restart the automaton from its initial state */
1155 state = dfa->states[0];
1160 else if (c >= t->ch[0] && c <= t->ch[1])
1162 state = dfa->states[t->to];
1167 last_rule = state->rule_no;
1172 last_rule = state->rule_nno;
/*
 * execTok: read the next token of a code string starting at *src and
 * report it through *tokBuf / *tokLen.
 *
 * Leading blanks and tabs are skipped.  Token forms visible here:
 *  - $N (N decimal): refers to matched argument N; the text is fetched
 *    from the file window with f_win_get;
 *  - a double-quoted string: the text up to the closing quote;
 *  - newline or ';';
 *  - any other run of characters up to white space.
 * NOTE(review): the return codes are not visible here; callers compare
 * the result with 3 - confirm against the full source.
 */
1184 static int execTok (struct lexSpec *spec, const char **src,
1185 const char **tokBuf, int *tokLen)
1187 const char *s = *src;
1189 while (*s == ' ' || *s == '\t')
1193 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
/* parse the decimal argument number */
1197 while (*s >= '0' && *s <= '9')
1198 n = n*10 + (*s++ -'0');
1199 if (spec->arg_no == 0)
1206 if (n >= spec->arg_no)
1208 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1212 else if (*s == '\"')
1215 while (*s && *s != '\"')
1217 *tokLen = s - *tokBuf;
1222 else if (*s == '\n' || *s == ';')
1230 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1233 *tokLen = s - *tokBuf;
1240 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1243 *tokLen = s - *tokBuf;
/*
 * regxStrz: copy len bytes from src into the caller supplied buffer str.
 * NOTE(review): presumably the copy is length-limited, terminated and
 * str is returned - those lines are not visible here; confirm.
 */
1249 static char *regxStrz (const char *src, int len, char *str)
1253 memcpy (str, src, len);
/*
 * cmd_tcl_begin: Tcl command procedure for "begin".  clientData is the
 * lexSpec.  Sub-commands, selected by argv[1]:
 *   record <absyn>                 start a record: look up the abstract
 *                                  syntax and push a root node;
 *   element <tag>                  open an element (tagBegin);
 *   variant <class> <type> <value> open a variant (variantBegin);
 *   context <name>                 push the named context on the
 *                                  context stack, or warn if unknown.
 * NOTE(review): argv[1] is compared before argc is tested in the visible
 * conditions - confirm that argc >= 2 is guaranteed earlier.
 */
1259 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1260 int argc, char **argv)
1262 struct lexSpec *spec = (struct lexSpec *) clientData;
1265 if (!strcmp(argv[1], "record") && argc == 3)
1267 char *absynName = argv[2];
1271 logf (LOG_DEBUG, "begin record %s", absynName);
1273 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1274 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1279 res = data1_mk_node (spec->dh, spec->m);
1280 res->which = DATA1N_root;
1281 res->u.root.type = absynName;
1282 res->u.root.absyn = absyn;
1285 spec->d1_stack[spec->d1_level] = res;
1286 spec->d1_stack[++(spec->d1_level)] = NULL;
1289 else if (!strcmp(argv[1], "element") && argc == 3)
1291 tagBegin (spec, argv[2], strlen(argv[2]));
1293 else if (!strcmp (argv[1], "variant") && argc == 5)
1295 variantBegin (spec, argv[2], strlen(argv[2]),
1296 argv[3], strlen(argv[3]),
1297 argv[4], strlen(argv[4]));
1299 else if (!strcmp (argv[1], "context") && argc == 3)
1301 struct lexContext *lc = spec->context;
1303 logf (LOG_DEBUG, "begin context %s",argv[2]);
/* linear search of the context list by name */
1305 while (lc && strcmp (argv[2], lc->name))
1309 spec->context_stack[++(spec->context_stack_top)] = lc;
1312 logf (LOG_WARN, "unknown context %s", argv[2]);
/*
 * cmd_tcl_end: Tcl command procedure for "end".  clientData is the
 * lexSpec.  Sub-commands, selected by argv[1]:
 *   record             release pending text on every open level and set
 *                      stop_flag;
 *   element [-record]  close elements with tagEnd; stop_flag is set in
 *                      connection with reaching level 0;
 *   context            pop the context stack unless it is at the bottom.
 */
1319 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1320 int argc, char **argv)
1322 struct lexSpec *spec = (struct lexSpec *) clientData;
1326 if (!strcmp (argv[1], "record"))
1328 while (spec->d1_level)
1330 tagDataRelease (spec);
1334 logf (LOG_DEBUG, "end record");
1336 spec->stop_flag = 1;
1338 else if (!strcmp (argv[1], "element"))
1342 if (argc >= 3 && !strcmp(argv[2], "-record"))
/* element may be NULL, in which case a zero length is passed */
1351 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1352 if (spec->d1_level == 0)
1355 logf (LOG_DEBUG, "end element end records");
1357 spec->stop_flag = 1;
1360 else if (!strcmp (argv[1], "context"))
1363 logf (LOG_DEBUG, "end context");
1365 if (spec->context_stack_top)
1366 (spec->context_stack_top)--;
/*
 * cmd_tcl_data: Tcl command procedure for "data".  clientData is the
 * lexSpec.  Options "-text" and "-element <tag>" are recognized.  The
 * data argument is added with execData; the visible calls show a
 * tagBegin for the element name before it and a tagEnd down to level 1
 * after it.
 */
1373 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1374 int argc, char **argv)
1378 const char *element = 0;
1379 struct lexSpec *spec = (struct lexSpec *) clientData;
1383 if (!strcmp("-text", argv[argi]))
1388 else if (!strcmp("-element", argv[argi]))
1392 element = argv[argi++];
1398 tagBegin (spec, element, strlen(element));
1402 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1406 tagEnd (spec, 1, NULL, 0);
/*
 * cmd_tcl_unread: Tcl command procedure for "unread".  clientData is
 * the lexSpec.  Moves the read position back to the start of matched
 * argument <no>, plus the value given with "-offset".  An index beyond
 * the last argument is clamped to the last argument.
 * NOTE(review): atoi gives no error indication and a negative index is
 * not rejected in the visible lines - confirm.
 */
1410 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1411 int argc, char **argv)
1413 struct lexSpec *spec = (struct lexSpec *) clientData;
1420 if (!strcmp("-offset", argv[argi]))
1425 offset = atoi(argv[argi]);
1434 no = atoi(argv[argi]);
1435 if (no >= spec->arg_no)
1436 no = spec->arg_no - 1;
1437 spec->ptr = spec->arg_start[no] + offset;
/*
 * execTcl: run the Tcl code of an action.
 *
 * Each matched argument i is first published as the Tcl variable named
 * by its decimal index ("0", "1", ...).  The text is taken straight from
 * the file window: the byte following it is saved, overwritten with a
 * terminator for the duration of Tcl_SetVar and then restored.  The code
 * is evaluated at global level, either as a Tcl object or as a string.
 * A Tcl error is logged together with the interpreter's error line,
 * result and errorInfo.
 */
1441 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1445 for (i = 0; i < spec->arg_no; i++)
1447 char var_name[10], *var_buf;
1450 sprintf (var_name, "%d", i);
1451 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* temporarily terminate the argument in place */
1455 ch = var_buf[var_len];
1456 var_buf[var_len] = '\0';
1457 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1458 var_buf[var_len] = ch;
/* NOTE(review): HAVE_TCL_OBJECTS is defined without a value earlier in
 * this file; "#if" on a macro that expands to nothing is not a valid
 * expression - "#ifdef" is probably what is meant.  Confirm. */
1461 #if HAVE_TCL_OBJECTS
1462 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1464 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1468 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1469 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1470 spec->tcl_interp->errorLine,
1471 spec->tcl_interp->result,
1472 err ? err : "[NO ERRORINFO]");
/*
 * execCode: interpret the built-in command language of an action.
 *
 * code->str is split into tokens with execTok and each command word is
 * dispatched by name:
 *   begin record <absyn> | element <tag> |
 *         variant <class> <type> <value> | context <name>
 *   end   record | element [-record] [<tag>] | context
 *   data  [-text] [-element <tag>] <item> ...
 *   unread [-offset <n>] <index>
 *   context <name>
 * Unknown commands and stray tokens are reported with LOG_WARN.
 * After each command the next token is fetched so that the loop can go
 * on with the following command.
 */
1478 static void execCode (struct lexSpec *spec, struct regxCode *code)
1480 const char *s = code->str;
1482 const char *cmd_str;
1484 r = execTok (spec, &s, &cmd_str, &cmd_len);
1491 r = execTok (spec, &s, &cmd_str, &cmd_len);
1494 p = regxStrz (cmd_str, cmd_len, ptmp);
/* ---- begin ... ---- */
1495 if (!strcmp (p, "begin"))
1497 r = execTok (spec, &s, &cmd_str, &cmd_len);
1500 logf (LOG_WARN, "missing keyword after 'begin'");
1503 p = regxStrz (cmd_str, cmd_len, ptmp);
1504 if (!strcmp (p, "record"))
1506 r = execTok (spec, &s, &cmd_str, &cmd_len);
1509 if (spec->d1_level == 0)
/* static: the root node keeps a pointer to this buffer (u.root.type) */
1511 static char absynName[64];
/* NOTE(review): no clamp of cmd_len to the 64 byte buffer is visible
 * before this copy - confirm. */
1516 memcpy (absynName, cmd_str, cmd_len);
1517 absynName[cmd_len] = '\0';
1520 logf (LOG_DEBUG, "begin record %s", absynName);
1522 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1523 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1528 res = data1_mk_node (spec->dh, spec->m);
1529 res->which = DATA1N_root;
1530 res->u.root.type = absynName;
1531 res->u.root.absyn = absyn;
1534 spec->d1_stack[spec->d1_level] = res;
1535 spec->d1_stack[++(spec->d1_level)] = NULL;
1538 r = execTok (spec, &s, &cmd_str, &cmd_len);
1540 else if (!strcmp (p, "element"))
1542 r = execTok (spec, &s, &cmd_str, &cmd_len);
1545 tagBegin (spec, cmd_str, cmd_len);
1546 r = execTok (spec, &s, &cmd_str, &cmd_len);
1548 else if (!strcmp (p, "variant"))
1551 const char *class_str = NULL;
1553 const char *type_str = NULL;
1555 const char *value_str = NULL;
/* the three operands are read in the order class, type, value */
1556 r = execTok (spec, &s, &cmd_str, &cmd_len);
1559 class_str = cmd_str;
1560 class_len = cmd_len;
1561 r = execTok (spec, &s, &cmd_str, &cmd_len);
1567 r = execTok (spec, &s, &cmd_str, &cmd_len);
1570 value_str = cmd_str;
1571 value_len = cmd_len;
1573 variantBegin (spec, class_str, class_len,
1574 type_str, type_len, value_str, value_len);
1577 r = execTok (spec, &s, &cmd_str, &cmd_len);
1579 else if (!strcmp (p, "context"))
1583 struct lexContext *lc = spec->context;
1584 r = execTok (spec, &s, &cmd_str, &cmd_len);
1585 p = regxStrz (cmd_str, cmd_len, ptmp);
1587 logf (LOG_DEBUG, "begin context %s", p);
1589 while (lc && strcmp (p, lc->name))
/* "begin context" pushes: the stack top is advanced before the store */
1592 spec->context_stack[++(spec->context_stack_top)] = lc;
1594 logf (LOG_WARN, "unknown context %s", p);
1597 r = execTok (spec, &s, &cmd_str, &cmd_len);
1601 logf (LOG_WARN, "bad keyword '%s' after begin", p);
/* ---- end ... ---- */
1604 else if (!strcmp (p, "end"))
1606 r = execTok (spec, &s, &cmd_str, &cmd_len);
1609 logf (LOG_WARN, "missing keyword after 'end'");
1612 p = regxStrz (cmd_str, cmd_len, ptmp);
1613 if (!strcmp (p, "record"))
1615 while (spec->d1_level)
1617 tagDataRelease (spec);
1620 r = execTok (spec, &s, &cmd_str, &cmd_len);
1622 logf (LOG_DEBUG, "end record");
1624 spec->stop_flag = 1;
1626 else if (!strcmp (p, "element"))
1629 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1631 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1636 tagEnd (spec, min_level, cmd_str, cmd_len);
1637 r = execTok (spec, &s, &cmd_str, &cmd_len);
1640 tagEnd (spec, min_level, NULL, 0);
1641 if (spec->d1_level == 0)
1644 logf (LOG_DEBUG, "end element end records");
1646 spec->stop_flag = 1;
1650 else if (!strcmp (p, "context"))
1653 logf (LOG_DEBUG, "end context");
/* pop, but never below the bottom entry */
1655 if (spec->context_stack_top)
1656 (spec->context_stack_top)--;
1657 r = execTok (spec, &s, &cmd_str, &cmd_len);
1660 logf (LOG_WARN, "bad keyword '%s' after end", p);
/* ---- data ... ---- */
1662 else if (!strcmp (p, "data"))
1666 const char *element_str = NULL;
1668 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1670 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1672 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1674 r = execTok (spec, &s, &element_str, &element_len);
1679 logf (LOG_WARN, "bad data option: %.*s",
1684 logf (LOG_WARN, "missing data item after data");
1688 tagBegin (spec, element_str, element_len);
1691 execData (spec, cmd_str, cmd_len,textFlag);
1692 r = execTok (spec, &s, &cmd_str, &cmd_len);
1695 tagEnd (spec, 1, NULL, 0);
/* ---- unread ... ---- */
1697 else if (!strcmp (p, "unread"))
1700 r = execTok (spec, &s, &cmd_str, &cmd_len);
1701 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1703 r = execTok (spec, &s, &cmd_str, &cmd_len);
1706 logf (LOG_WARN, "missing number after -offset");
1709 p = regxStrz (cmd_str, cmd_len, ptmp);
1711 r = execTok (spec, &s, &cmd_str, &cmd_len);
1717 logf (LOG_WARN, "missing index after unread command");
/* the index must be a single decimal digit */
1720 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1722 logf (LOG_WARN, "bad index after unread command");
1727 no = *cmd_str - '0';
1728 if (no >= spec->arg_no)
1729 no = spec->arg_no - 1;
1730 spec->ptr = spec->arg_start[no] + offset;
1732 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* ---- context <name> ---- */
1734 else if (!strcmp (p, "context"))
1738 struct lexContext *lc = spec->context;
1739 r = execTok (spec, &s, &cmd_str, &cmd_len);
1740 p = regxStrz (cmd_str, cmd_len, ptmp);
1742 while (lc && strcmp (p, lc->name))
/* unlike "begin context" this overwrites the current top entry */
1745 spec->context_stack[spec->context_stack_top] = lc;
1747 logf (LOG_WARN, "unknown context %s", p);
1750 r = execTok (spec, &s, &cmd_str, &cmd_len);
1754 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1755 r = execTok (spec, &s, &cmd_str, &cmd_len);
1760 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1762 r = execTok (spec, &s, &cmd_str, &cmd_len);
/*
 * execAction: carry out the action list ap for a rule whose match
 * starts at start_ptr; *pptr is the current input position and is
 * advanced by pattern actions.
 *
 * The local arg_start / arg_end arrays are published through spec so
 * that code actions can refer to the matched pieces; entry 0 starts at
 * start_ptr.  A pattern action is matched with tryMatch and records the
 * boundaries of what it matched (and, for a body pattern, of the text
 * skipped before it); F_WIN_EOF is used as boundary when nothing
 * matched.  A code action is run by execTcl when a Tcl interpreter is
 * available and by execCode otherwise.  stop_flag is tested after code
 * has been executed.
 */
1769 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1770 int start_ptr, int *pptr)
1779 arg_start[0] = start_ptr;
1781 spec->arg_start = arg_start;
1782 spec->arg_end = arg_end;
/* body pattern: the skipped text and the match become separate arguments */
1789 if (ap->u.pattern.body)
1791 arg_start[arg_no] = *pptr;
1792 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1794 arg_end[arg_no] = F_WIN_EOF;
1796 arg_start[arg_no] = F_WIN_EOF;
1797 arg_end[arg_no] = F_WIN_EOF;
1802 arg_end[arg_no] = sptr;
1804 arg_start[arg_no] = sptr;
1805 arg_end[arg_no] = *pptr;
1810 arg_start[arg_no] = *pptr;
1811 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1813 if (sptr != arg_start[arg_no])
1815 arg_end[arg_no] = *pptr;
/* make the argument count visible to the code about to run */
1820 spec->arg_no = arg_no;
1823 if (spec->tcl_interp)
1824 execTcl(spec, ap->u.code);
1826 execCode (spec, ap->u.code);
1828 execCode (spec, ap->u.code);
1831 if (spec->stop_flag)
1835 arg_start[arg_no] = *pptr;
1836 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: run the actions of rule number `ruleNo` in `context`.
 *
 * Logs the rule number at debug level and returns whatever execAction
 * returns for the action list found at context->fastRule[ruleNo].
 * start_ptr and pptr are part of this function's signature; the remaining
 * arguments of the execAction call are on a continuation line that is not
 * part of this excerpt.
 */
1845 static int execRule (struct lexSpec *spec, struct lexContext *context,
1846 int ruleNo, int start_ptr, int *pptr)
1849 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1851 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: scan the input starting at *ptr with the DFA of the context on
 * top of spec->context_stack.
 *
 * Characters are fetched with f_win_advance. When a rule has matched, any
 * unmatched text in front of the match (skip_ptr..start_ptr) is passed to
 * execDataP and the rule is executed through execRule. Unmatched text at
 * end of input is passed to execDataP as well.
 *
 * spec - lexer state (context stack, input window, callbacks).
 * ptr  - in/out input position.
 *
 * NOTE(review): the loop header, several branches and every return
 * statement are on lines not shown in this excerpt; the returned data1_node
 * is therefore not described here.
 */
1855 data1_node *lexNode (struct lexSpec *spec, int *ptr)
/* begin in state 0 of the DFA belonging to the current context */
1857 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1858 struct DFA_state *state = context->dfa->states[0];
/* previous character starts out as newline -- presumably so that the very
 * first character is treated as being at the start of a line (confirm) */
1861 unsigned char c_prev = '\n';
1863 int last_rule = 0; /* rule number of current match */
1864 int last_ptr = *ptr; /* last char of match */
1865 int start_ptr = *ptr; /* first char of match */
1866 int skip_ptr = *ptr; /* first char of run */
1870 c = f_win_advance (spec, ptr);
1871 if (*ptr == F_WIN_EOF)
1873 /* end of file met */
1876 /* there was a match */
1877 if (skip_ptr < start_ptr)
1879 /* deal with chars that didn't match */
1882 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1883 execDataP (spec, buf, size, 0);
1885 /* restore pointer */
1888 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1890 /* restore skip pointer */
/* no rule matched, but unmatched text remains up to *ptr */
1894 else if (skip_ptr < *ptr)
1896 /* deal with chars that didn't match */
1899 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1900 execDataP (spec, buf, size, 0);
1902 if (*ptr == F_WIN_EOF)
1909 { /* no transition for character c ... */
1912 if (skip_ptr < start_ptr)
1914 /* deal with chars that didn't match */
1917 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1918 execDataP (spec, buf, size, 0);
1920 /* restore pointer */
1922 if (!execRule (spec, context, last_rule, start_ptr, ptr))
/* execRule returned 0: report the position reached through the optional
 * end callback, but only when not at end of input */
1924 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1927 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1929 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* re-read the current context from the stack (the executed rule may have
 * changed it -- assumption, confirm against the "context" command) */
1933 context = spec->context_stack[spec->context_stack_top];
/* the next match attempt starts at the current position */
1936 last_ptr = start_ptr = *ptr;
/* step start_ptr forward by one character, remembering that character.
 * NOTE(review): the conditions selecting between these two identical calls
 * are not visible here. */
1940 c_prev = f_win_advance (spec, &start_ptr);
1945 c_prev = f_win_advance (spec, &start_ptr);
/* restart the DFA from state 0 of the (possibly new) context */
1948 state = context->dfa->states[0];
/* c lies inside the inclusive character range of transition t */
1951 else if (c >= t->ch[0] && c <= t->ch[1])
1952 { /* transition ... */
1953 state = context->dfa->states[t->to];
/* remember the rule attached to the new state; rule_no and rule_nno are
 * two separate rule fields of the state whose distinction is not visible
 * in this excerpt */
1958 last_rule = state->rule_no;
1961 else if (state->rule_nno)
1963 last_rule = state->rule_nno;
/*
 * lexRoot: run the lexer over one record, starting in the context named
 * `context_name`.
 *
 * Clears stop_flag and resets the context stack, looks the context up by
 * name in the list reachable from spec->context (a warning is logged when
 * it cannot be found), runs the context's init and begin action lists,
 * calls lexNode, releases data while spec->d1_level is non-zero, and
 * finally runs the end action list.
 *
 * spec         - lexer state.
 * offset       - NOTE(review): its use is on lines not shown; presumably the
 *                initial value of the local position `ptr` (confirm).
 * context_name - name of the context to start in.
 *
 * Returns spec->d1_stack[0].
 */
1975 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1976 const char *context_name)
1978 struct lexContext *lt = spec->context;
1981 spec->stop_flag = 0;
1983 spec->context_stack_top = 0;
/* search for the context whose name equals context_name */
1986 if (!strcmp (lt->name, context_name))
1992 logf (LOG_WARN, "cannot find context %s", context_name);
/* the context found becomes the current one; clear the current d1 slot */
1995 spec->context_stack[spec->context_stack_top] = lt;
1996 spec->d1_stack[spec->d1_level] = NULL;
/* NOTE(review): any conditions guarding the init/begin actions are on
 * lines not shown */
2001 execAction (spec, lt->initActionList, ptr, &ptr);
2004 execAction (spec, lt->beginActionList, ptr, &ptr);
2005 lexNode (spec, &ptr);
/* loop while d1_level is non-zero; the statement that changes d1_level is
 * not visible in this excerpt */
2006 while (spec->d1_level)
2008 tagDataRelease (spec);
2011 execAction (spec, lt->endActionList, ptr, &ptr);
2012 return spec->d1_stack[0];
/*
 * grs_destroy: tear down the handler state passed as `clientData`.
 *
 * clientData is treated as a struct lexSpecs *; the lexSpec it holds is
 * destroyed with lexSpecDestroy. NOTE(review): whether the lexSpecs
 * container itself is freed happens on lines not shown here.
 */
2015 void grs_destroy(void *clientData)
2017 struct lexSpecs *specs = (struct lexSpecs *) clientData;
2020 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocate the handler state (a struct lexSpecs) with xmalloc.
 *
 * NOTE(review): initialisation of the fields and the return statement are
 * on lines not shown; presumably the new object is returned as the
 * clientData that the read and destroy functions receive (confirm).
 */
2025 void *grs_init(void)
2027 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/*
 * grs_read_regx: read one record with the regx filter named by p->type.
 *
 * The lexSpec is kept in the handler state (p->clientData) between calls.
 * It is destroyed and rebuilt with lexSpecCreate + readFileSpec when none
 * exists yet or when its name differs from p->type. The input callbacks,
 * file handle and memory handle from `p` are then copied into the spec and
 * the record is lexed by lexRoot from p->offset in the context "main".
 *
 * Returns the value returned by lexRoot. NOTE(review): the return taken
 * when readFileSpec fails is on a line not shown.
 */
2032 data1_node *grs_read_regx (struct grs_read_info *p)
2035 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2036 struct lexSpec **curLexSpec = &specs->spec;
2039 logf (LOG_DEBUG, "grs_read_regx");
/* no spec yet, or the stored spec belongs to a different filter name */
2041 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2044 lexSpecDestroy (curLexSpec);
2045 *curLexSpec = lexSpecCreate (p->type, p->dh);
2046 res = readFileSpec (*curLexSpec);
/* presumably the failure path of readFileSpec (the test on res is not
 * visible here) */
2049 lexSpecDestroy (curLexSpec);
2053 (*curLexSpec)->dh = p->dh;
/* reset the input window bounds and bind the window to the caller's read,
 * seek and end callbacks and file handle */
2056 (*curLexSpec)->f_win_start = 0;
2057 (*curLexSpec)->f_win_end = 0;
2058 (*curLexSpec)->f_win_rf = p->readf;
2059 (*curLexSpec)->f_win_sf = p->seekf;
2060 (*curLexSpec)->f_win_fh = p->fh;
2061 (*curLexSpec)->f_win_ef = p->endf;
/* fixed window size; the unit is not visible in this excerpt */
2062 (*curLexSpec)->f_win_size = 500000;
2064 (*curLexSpec)->m = p->mem;
2065 return lexRoot (*curLexSpec, p->offset, "main");
/* File-local recTypeGrs descriptor for the regx filter. NOTE(review): the
 * initializer members are on lines not shown in this excerpt. */
2068 static struct recTypeGrs regx_type = {
/* Exported handle for the regx record type: the address of the file-local
 * regx_type descriptor (same form as recTypeGrs_tcl = &tcl_type). */
2075 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: read one record with the Tcl-enabled filter named by
 * p->type.
 *
 * Works like grs_read_regx, except that a freshly created lexSpec also
 * gets a new Tcl interpreter in which the commands "begin", "end", "data"
 * and "unread" are registered with the lexSpec as their client data.
 * The spec is rebuilt when none exists yet or when its name differs from
 * p->type; afterwards the input callbacks, file handle and memory handle
 * from `p` are copied in and lexRoot lexes from p->offset in the context
 * "main".
 *
 * Returns the value returned by lexRoot. NOTE(review): the return taken
 * when readFileSpec fails is on a line not shown.
 */
2078 data1_node *grs_read_tcl (struct grs_read_info *p)
2081 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2082 struct lexSpec **curLexSpec = &specs->spec;
2085 logf (LOG_DEBUG, "grs_read_tcl");
/* no spec yet, or the stored spec belongs to a different filter name */
2087 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2089 Tcl_Interp *tcl_interp;
2091 lexSpecDestroy (curLexSpec);
2092 *curLexSpec = lexSpecCreate (p->type, p->dh);
/* attach a new interpreter to the spec and register the filter commands */
2093 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
2094 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2095 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2096 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
2097 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2099 res = readFileSpec (*curLexSpec);
/* presumably the failure path of readFileSpec (the test on res is not
 * visible here) */
2102 lexSpecDestroy (curLexSpec);
2106 (*curLexSpec)->dh = p->dh;
/* reset the input window bounds and bind the window to the caller's read,
 * seek and end callbacks and file handle */
2109 (*curLexSpec)->f_win_start = 0;
2110 (*curLexSpec)->f_win_end = 0;
2111 (*curLexSpec)->f_win_rf = p->readf;
2112 (*curLexSpec)->f_win_sf = p->seekf;
2113 (*curLexSpec)->f_win_fh = p->fh;
2114 (*curLexSpec)->f_win_ef = p->endf;
/* fixed window size; the unit is not visible in this excerpt */
2115 (*curLexSpec)->f_win_size = 500000;
2117 (*curLexSpec)->m = p->mem;
2118 return lexRoot (*curLexSpec, p->offset, "main");
/* File-local recTypeGrs descriptor for the Tcl filter. NOTE(review): the
 * initializer members are on lines not shown in this excerpt. */
2121 static struct recTypeGrs tcl_type = {
/* Exported handle for the Tcl record type: the address of tcl_type. */
2128 RecTypeGrs recTypeGrs_tcl = &tcl_type;