2 * Copyright (C) 1994-1997, Index Data I/S
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.17 1997-09-18 08:59:17 adam
8 * Extra generic handle for the character mapping routines.
10 * Revision 1.16 1997/09/05 15:29:57 adam
11 * Changed prototype for chr_map_input - added const.
12 * Added support for C++, headers uses extern "C" for public definitions.
14 * Revision 1.15 1997/02/10 10:19:20 adam
15 * Added facility for open character sets, eg [a-].
17 * Revision 1.14 1996/10/29 13:57:22 adam
18 * Include of zebrautl.h instead of alexutil.h.
20 * Revision 1.13 1996/06/17 14:24:08 adam
21 * Bug fix: read_charset didn't handle character mapping.
23 * Revision 1.12 1996/06/04 10:20:02 adam
24 * Added support for character mapping.
26 * Revision 1.11 1996/01/08 19:15:24 adam
27 * Allow single $ in expressions.
29 * Revision 1.10 1996/01/08 09:09:17 adam
30 * Function dfa_parse got 'const' string argument.
31 * New functions to define char mappings made public.
33 * Revision 1.9 1995/12/06 12:24:58 adam
34 * Removed verbatim mode code.
36 * Revision 1.8 1995/12/06 09:09:58 adam
37 * Work on left and right anchors.
39 * Revision 1.7 1995/11/27 09:23:02 adam
40 * New berbatim hook in regular expressions. "[]n ..".
42 * Revision 1.6 1995/10/16 09:31:25 adam
45 * Revision 1.5 1995/10/02 15:17:58 adam
46 * Bug fix in dfa_delete.
48 * Revision 1.4 1995/09/28 09:18:52 adam
49 * Removed various preprocessor defines.
51 * Revision 1.3 1995/09/04 12:33:26 adam
52 * Various cleanup. YAZ util used instead.
54 * Revision 1.2 1995/01/25 11:30:50 adam
55 * Simple error reporting when parsing regular expressions.
56 * Memory usage reduced.
58 * Revision 1.1 1995/01/24 16:02:52 adam
59 * New private header file in dfa module (dfap.h).
60 * Module no longer uses yacc to parse regular expressions.
74 #define DFA_OPEN_RANGE 1
84 struct Tnode *p[2]; /* CAT,OR,STAR,PLUS (left,right) */
85 short ch[2]; /* if ch[0] >= 0 then this Tnode represents */
86 /* a character in range [ch[0]..ch[1]] */
87 /* otherwise ch[0] represents */
88 /* accepting no (-acceptno) */
90 unsigned pos : 15; /* CAT/OR/STAR/EPSILON or non-neg. position */
91 unsigned nullable : 1;
92 Set firstpos; /* first positions */
93 Set lastpos; /* last positions */
96 struct Tblock { /* Tnode bucket (block) */
97 struct Tblock *next; /* pointer to next bucket */
98 struct Tnode *tarray; /* array of nodes in bucket */
102 #define STATE_HASH 199
103 #define POSET_CHUNK 100
105 int debug_dfa_trav = 0;
106 int debug_dfa_tran = 0;
107 int debug_dfa_followpos = 0;
110 static struct DFA_parse *parse_info = NULL;
113 static int inside_string;
114 static const unsigned char *expr_ptr;
115 static struct Tnode **posar;
117 static SetType poset;
118 static Set *followpos;
120 static struct Tnode *mk_Tnode (void);
121 static struct Tnode *mk_Tnode_cset (BSet charset);
122 static void term_Tnode (void);
125 del_followpos (void),
128 mk_dfa_tran (struct DFA_states *dfas),
129 add_follow (Set lastpos, Set firstpos),
130 dfa_trav (struct Tnode *n),
131 init_followpos (void),
132 pr_tran (struct DFA_states *dfas),
133 pr_verbose (struct DFA_states *dfas),
143 *str_char (unsigned c);
159 static int lookahead;
160 static unsigned look_ch;
161 static BSet look_chars;
163 static struct Tnode *expr_1 (void),
168 static struct Tnode *expr_1 (void)
170 struct Tnode *t1, *t2, *tn;
172 if (!(t1 = expr_2 ()))
174 while (lookahead == L_ALT)
177 if (!(t2 = expr_2 ()))
189 static struct Tnode *expr_2 (void)
191 struct Tnode *t1, *t2, *tn;
193 if (!(t1 = expr_3 ()))
195 while (lookahead == L_WILD ||
196 lookahead == L_ANYZ ||
197 lookahead == L_ANY ||
199 lookahead == L_CHAR ||
200 lookahead == L_CHARS)
202 if (!(t2 = expr_3 ()))
214 static struct Tnode *expr_3 (void)
216 struct Tnode *t1, *tn;
218 if (!(t1 = expr_4 ()))
220 if (lookahead == L_CLOS0)
228 else if (lookahead == L_CLOS1)
236 else if (lookahead == L_QUEST)
242 tn->u.p[1] = mk_Tnode();
243 tn->u.p[1]->pos = EPSILON;
249 static struct Tnode *expr_4 (void)
257 if (!(t1 = expr_1 ()))
259 if (lookahead == L_RP)
266 t1->pos = ++parse_info->position;
267 t1->u.ch[1] = t1->u.ch[0] = look_ch;
271 t1 = mk_Tnode_cset (look_chars);
275 t1 = mk_Tnode_cset (parse_info->anyset);
281 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
282 t1->u.p[1] = mk_Tnode();
283 t1->u.p[1]->pos = EPSILON;
289 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
297 static void do_parse (struct DFA_parse *dfap, const char **s,
300 int start_anchor_flag = 0;
301 struct Tnode *t1, *t2, *tn;
305 expr_ptr = (const unsigned char *) *s;
309 if (lookahead == L_START)
311 start_anchor_flag = 1;
314 if (lookahead == L_END)
317 t1->pos = ++parse_info->position;
318 t1->u.ch[1] = t1->u.ch[0] = '\n';
324 if (t1 && lookahead == L_END)
327 t2->pos = ++parse_info->position;
328 t2->u.ch[1] = t2->u.ch[0] = '\n';
339 if (t1 && lookahead == 0)
342 t2->pos = ++parse_info->position;
343 t2->u.ch[0] = -(++parse_info->rule);
344 t2->u.ch[1] = start_anchor_flag ? 0 : -(parse_info->rule);
355 if (lookahead == L_RP)
356 err_code = DFA_ERR_RP;
357 else if (lookahead == L_LP)
358 err_code = DFA_ERR_LP;
360 err_code = DFA_ERR_SYNTAX;
363 *s = (const char *) expr_ptr;
366 static int nextchar (int *esc)
369 if (*expr_ptr == '\0')
371 else if (*expr_ptr != '\\')
400 static int nextchar_set (int *esc)
402 if (*expr_ptr == ' ')
407 return nextchar (esc);
410 static int read_charset (void)
412 int i, ch0, ch1, esc0, esc1, cc = 0;
413 look_chars = mk_BSet (&parse_info->charset);
414 res_BSet (parse_info->charset, look_chars);
416 ch0 = nextchar_set (&esc0);
417 if (!esc0 && ch0 == '^')
420 ch0 = nextchar_set (&esc0);
424 if (!esc0 && ch0 == ']')
426 if (parse_info->cmap)
430 const char *mcp = mapfrom;
432 mapto = (*parse_info->cmap)(parse_info->cmap_data, &mcp, 1);
436 add_BSet (parse_info->charset, look_chars, ch0);
437 ch1 = nextchar_set (&esc1);
438 if (!esc1 && ch1 == '-')
441 if ((ch1 = nextchar_set (&esc1)) == 0)
444 if (!esc1 && ch1 == ']')
450 if (!esc1 && ch1 == ']')
452 add_BSet (parse_info->charset, look_chars, '-');
456 if (!open_range && parse_info->cmap)
460 const char *mcp = mapfrom;
462 mapto = (*parse_info->cmap) (parse_info->cmap_data, &mcp, 1);
466 for (i=ch0; ++i<=ch1;)
467 add_BSet (parse_info->charset, look_chars, i);
469 ch0 = nextchar_set (&esc0);
480 com_BSet (parse_info->charset, look_chars);
484 static int map_l_char (void)
487 const char *cp0 = (const char *) (expr_ptr-1);
488 int i = 0, len = strlen(cp0);
490 if (cp0[0] == 1 && cp0[1])
496 if (!parse_info->cmap)
499 mapto = (*parse_info->cmap) (parse_info->cmap_data, &cp0, len);
502 expr_ptr = (const unsigned char *) cp0;
503 look_ch = mapto[i][0];
504 logf (LOG_DEBUG, "map from %c to %d", expr_ptr[-1], look_ch);
508 static int lex_sub(void)
511 while ((look_ch = nextchar (&esc)) != 0)
515 return map_l_char ();
516 inside_string = !inside_string;
518 else if (esc || inside_string)
519 return map_l_char ();
520 else if (look_ch == '[')
521 return read_charset();
525 for (cc = parse_info->charMap; *cc; cc += 2)
532 return map_l_char ();
537 static void lex (void)
539 lookahead = lex_sub ();
542 static const char *str_char (unsigned c)
559 sprintf (s+1, "x%02x", c);
581 static void out_char (int c)
583 printf ("%s", str_char (c));
586 static void term_Tnode (void)
588 struct Tblock *t, *t1;
589 for (t = parse_info->start; (t1 = t);)
597 static struct Tnode *mk_Tnode (void)
601 if (parse_info->use_Tnode == parse_info->max_Tnode)
603 tnew = (struct Tblock *) imalloc (sizeof(struct Tblock));
604 tnew->tarray = (struct Tnode *) imalloc (TADD*sizeof(struct Tnode));
607 if (parse_info->use_Tnode == 0)
608 parse_info->start = tnew;
610 parse_info->end->next = tnew;
611 parse_info->end = tnew;
613 parse_info->max_Tnode += TADD;
615 return parse_info->end->tarray+(parse_info->use_Tnode++ % TADD);
618 struct Tnode *mk_Tnode_cset (BSet charset)
620 struct Tnode *tn1, *tn0 = mk_Tnode();
621 int ch1, ch0 = trav_BSet (parse_info->charset, charset, 0);
627 tn0->pos = ++parse_info->position;
628 ch1 = travi_BSet (parse_info->charset, charset, ch0+1) ;
630 tn0->u.ch[1] = parse_info->charset->size;
633 tn0->u.ch[1] = ch1-1;
634 while ((ch0=trav_BSet (parse_info->charset, charset, ch1)) != -1)
640 tn1 = tn0->u.p[1] = mk_Tnode();
642 tn1->pos = ++(parse_info->position);
643 if ((ch1 = travi_BSet (parse_info->charset, charset,
646 tn1->u.ch[1] = parse_info->charset->size;
649 tn1->u.ch[1] = ch1-1;
656 static void del_followpos (void)
661 static void init_pos (void)
663 posar = (struct Tnode **) imalloc (sizeof(struct Tnode*)
664 * (1+parse_info->position));
667 static void del_pos (void)
672 static void add_follow (Set lastpos, Set firstpos)
676 followpos[lastpos->value] =
677 union_Set (poset, followpos[lastpos->value], firstpos);
678 lastpos = lastpos->next;
682 static void dfa_trav (struct Tnode *n)
687 dfa_trav (n->u.p[0]);
688 dfa_trav (n->u.p[1]);
689 n->nullable = n->u.p[0]->nullable & n->u.p[1]->nullable;
690 n->firstpos = mk_Set (poset);
691 n->firstpos = union_Set (poset, n->firstpos, n->u.p[0]->firstpos);
692 if (n->u.p[0]->nullable)
693 n->firstpos = union_Set (poset, n->firstpos, n->u.p[1]->firstpos);
694 n->lastpos = mk_Set (poset);
695 n->lastpos = union_Set (poset, n->lastpos, n->u.p[1]->lastpos);
696 if (n->u.p[1]->nullable)
697 n->lastpos = union_Set (poset, n->lastpos, n->u.p[0]->lastpos);
699 add_follow (n->u.p[0]->lastpos, n->u.p[1]->firstpos);
701 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
702 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
703 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
704 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
709 dfa_trav (n->u.p[0]);
710 dfa_trav (n->u.p[1]);
711 n->nullable = n->u.p[0]->nullable | n->u.p[1]->nullable;
713 n->firstpos = merge_Set (poset, n->u.p[0]->firstpos,
714 n->u.p[1]->firstpos);
715 n->lastpos = merge_Set (poset, n->u.p[0]->lastpos,
717 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
718 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
719 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
720 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
725 dfa_trav (n->u.p[0]);
726 n->nullable = n->u.p[0]->nullable;
727 n->firstpos = n->u.p[0]->firstpos;
728 n->lastpos = n->u.p[0]->lastpos;
729 add_follow (n->lastpos, n->firstpos);
734 dfa_trav (n->u.p[0]);
736 n->firstpos = n->u.p[0]->firstpos;
737 n->lastpos = n->u.p[0]->lastpos;
738 add_follow (n->lastpos, n->firstpos);
744 n->lastpos = mk_Set (poset);
745 n->firstpos = mk_Set (poset);
752 n->firstpos = mk_Set (poset);
753 n->firstpos = add_Set (poset, n->firstpos, n->pos);
754 n->lastpos = mk_Set (poset);
755 n->lastpos = add_Set (poset, n->lastpos, n->pos);
758 printf ("#%d (n#%d)", -n->u.ch[0], -n->u.ch[1]);
759 else if (n->u.ch[1] > n->u.ch[0])
762 out_char (n->u.ch[0]);
763 if (n->u.ch[1] > n->u.ch[0]+1)
765 out_char (n->u.ch[1]);
769 out_char (n->u.ch[0]);
773 printf ("\n nullable : %c\n", n->nullable ? '1' : '0');
774 printf (" firstpos :");
775 pr_Set (poset, n->firstpos);
776 printf (" lastpos :");
777 pr_Set (poset, n->lastpos);
781 static void init_followpos (void)
785 followpos = fa = (Set *) imalloc (sizeof(Set) * (1+parse_info->position));
786 for (i = parse_info->position+1; --i >= 0; fa++)
787 *fa = mk_Set (poset);
790 static void mk_dfa_tran (struct DFA_states *dfas)
797 struct DFA_state *dfa_from, *dfa_to;
800 max_char = parse_info->charset->size;
801 pos = (short *) imalloc (sizeof(*pos) * (parse_info->position+1));
803 tran_set = cp_Set (poset, parse_info->root->firstpos);
804 i = add_DFA_state (dfas, &tran_set, &dfa_from);
806 dfa_from->rule_no = 0;
807 while ((dfa_from = get_DFA_state (dfas)))
811 for (tran_set = dfa_from->set; tran_set; tran_set = tran_set->next)
812 if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char)
813 *pos_i++ = tran_set->value;
818 c = posar[tran_set->value]->u.ch[1];
823 dfa_from->rule_no = -i;
824 dfa_from->rule_nno = -j;
826 for (char_1 = 0; char_1 <= max_char; char_1++)
829 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
830 if (posar[i]->u.ch[1] >= char_1
831 && (c=posar[i]->u.ch[0]) < char_0)
837 if (char_0 > max_char)
842 tran_set = mk_Set (poset);
843 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
845 if ((c=posar[i]->u.ch[0]) > char_0 && c <= char_1)
846 char_1 = c - 1; /* forward chunk */
847 else if ((c=posar[i]->u.ch[1]) >= char_0 && c < char_1)
848 char_1 = c; /* backward chunk */
849 if (posar[i]->u.ch[1] >= char_0 && posar[i]->u.ch[0] <= char_0)
850 tran_set = union_Set (poset, tran_set, followpos[i]);
854 add_DFA_state (dfas, &tran_set, &dfa_to);
855 add_DFA_tran (dfas, dfa_from, char_0, char_1, dfa_to->no);
860 sort_DFA_states (dfas);
863 static void pr_tran (struct DFA_states *dfas)
866 struct DFA_tran *tran;
867 int prev_no, i, c, no;
869 for (no=0; no < dfas->no; ++no)
871 s = dfas->sortarray[no];
872 assert (s->no == no);
873 printf ("DFA state %d", no);
875 printf ("#%d", s->rule_no);
877 printf (" #%d", s->rule_nno);
880 pr_Set (poset, s->set);
882 for (i=s->tran_no, tran=s->trans; --i >= 0; tran++)
884 if (prev_no != tran->to)
888 printf (" goto %d on [", tran->to);
891 for (c = tran->ch[0]; c <= tran->ch[1]; c++)
892 printf ("%s", str_char(c));
900 static void pr_verbose (struct DFA_states *dfas)
904 printf ("%d/%d tree nodes used, %d bytes each\n",
905 parse_info->use_Tnode, parse_info->max_Tnode, sizeof (struct Tnode));
906 k = inf_BSetHandle (parse_info->charset, &i, &j);
907 printf ("%ld/%ld character sets, %d bytes each\n",
908 i/k, j/k, k*sizeof(BSetWord));
909 k = inf_SetType (poset, &i, &j);
910 printf ("%ld/%ld poset items, %d bytes each\n", i, j, k);
911 printf ("%d DFA states\n", dfas->no);
914 static void pr_followpos (void)
917 printf ("\nfollowsets:\n");
918 for (i=1; i <= parse_info->position; i++)
921 pr_Set (poset, followpos[i]);
924 if (posar[i]->u.ch[0] < 0)
925 printf ("#%d", -posar[i]->u.ch[0]);
926 else if (posar[i]->u.ch[1] > posar[i]->u.ch[0])
929 out_char (posar[i]->u.ch[0]);
930 if (posar[i]->u.ch[1] > posar[i]->u.ch[0]+1)
932 out_char (posar[i]->u.ch[1]);
936 out_char (posar[i]->u.ch[0]);
942 void dfa_parse_cmap_clean (struct DFA *d)
944 struct DFA_parse *dfa = d->parse_info;
949 dfa->charMapSize = 7;
950 dfa->charMap = imalloc (dfa->charMapSize * sizeof(*dfa->charMap));
955 void dfa_parse_cmap_new (struct DFA *d, const int *cmap)
957 struct DFA_parse *dfa = d->parse_info;
962 for (cp = cmap; *cp; cp += 2)
964 size = cp - cmap + 1;
965 if (size > dfa->charMapSize)
968 ifree (dfa->charMap);
969 dfa->charMapSize = size;
970 dfa->charMap = imalloc (size * sizeof(*dfa->charMap));
972 memcpy (dfa->charMap, cmap, size * sizeof(*dfa->charMap));
975 void dfa_parse_cmap_del (struct DFA *d, int from)
977 struct DFA_parse *dfa = d->parse_info;
981 for (cc = dfa->charMap; *cc; cc += 2)
984 while ((cc[0] = cc[2]))
993 void dfa_parse_cmap_add (struct DFA *d, int from, int to)
995 struct DFA_parse *dfa = d->parse_info;
1000 for (cc = dfa->charMap; *cc; cc += 2)
1006 indx = cc - dfa->charMap;
1007 size = dfa->charMapSize;
1010 int *cn = imalloc ((size+16) * sizeof(*dfa->charMap));
1011 memcpy (cn, dfa->charMap, indx*sizeof(*dfa->charMap));
1012 ifree (dfa->charMap);
1014 dfa->charMapSize = size+16;
1016 dfa->charMap[indx] = from;
1017 dfa->charMap[indx+1] = to;
1018 dfa->charMap[indx+2] = 0;
1021 void dfa_parse_cmap_thompson (struct DFA *d)
1023 static int thompson_chars[] =
1039 dfa_parse_cmap_new (d, thompson_chars);
1042 static struct DFA_parse *dfa_parse_init (void)
1044 parse_info = (struct DFA_parse *) imalloc (sizeof (struct DFA_parse));
1046 parse_info->charset = mk_BSetHandle (255, 20);
1047 parse_info->position = 0;
1048 parse_info->rule = 0;
1049 parse_info->root = NULL;
1051 parse_info->anyset = mk_BSet (&parse_info->charset);
1052 res_BSet (parse_info->charset, parse_info->anyset);
1053 add_BSet (parse_info->charset, parse_info->anyset, '\n');
1054 com_BSet (parse_info->charset, parse_info->anyset);
1055 parse_info->use_Tnode = parse_info->max_Tnode = 0;
1056 parse_info->charMap = NULL;
1057 parse_info->charMapSize = 0;
1058 parse_info->cmap = NULL;
1062 static void rm_dfa_parse (struct DFA_parse **dfap)
1065 assert (parse_info);
1067 rm_BSetHandle (&parse_info->charset);
1068 ifree (parse_info->charMap);
1073 static struct DFA_states *mk_dfas (struct DFA_parse *dfap, int poset_chunk)
1075 struct DFA_states *dfas;
1077 assert (poset_chunk > 10);
1081 poset = mk_SetType (poset_chunk);
1084 assert (parse_info->root);
1085 dfa_trav (parse_info->root);
1087 if (debug_dfa_followpos)
1089 init_DFA_states (&dfas, poset, STATE_HASH);
1101 struct DFA *dfa_init (void)
1105 dfa = imalloc (sizeof(*dfa));
1106 dfa->parse_info = dfa_parse_init ();
1107 dfa->state_info = NULL;
1109 dfa_parse_cmap_thompson (dfa);
1113 void dfa_set_cmap (struct DFA *dfa, void *vp,
1114 const char **(*cmap)(void *vp, const char **from, int len))
1116 dfa->parse_info->cmap = cmap;
1117 dfa->parse_info->cmap_data = vp;
1120 int dfa_parse (struct DFA *dfa, const char **pattern)
1125 assert (dfa->parse_info);
1126 do_parse (dfa->parse_info, pattern, &top);
1129 if ( !dfa->parse_info->root )
1130 dfa->parse_info->root = top;
1137 n->u.p[0] = dfa->parse_info->root;
1139 dfa->parse_info->root = n;
1144 void dfa_mkstate (struct DFA *dfa)
1148 dfa->state_info = mk_dfas (dfa->parse_info, POSET_CHUNK);
1149 dfa->no_states = dfa->state_info->no;
1150 dfa->states = dfa->state_info->sortarray;
1151 rm_dfa_parse (&dfa->parse_info);
1154 void dfa_delete (struct DFA **dfap)
1158 if ((*dfap)->parse_info)
1159 rm_dfa_parse (&(*dfap)->parse_info);
1160 if ((*dfap)->state_info)
1161 rm_DFA_states (&(*dfap)->state_info);