2 * Copyright (C) 1994-1997, Index Data I/S
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.16 1997-09-05 15:29:57 adam
8 * Changed prototype for chr_map_input - added const.
9 * Added support for C++, headers uses extern "C" for public definitions.
11 * Revision 1.15 1997/02/10 10:19:20 adam
12 * Added facility for open character sets, eg [a-].
14 * Revision 1.14 1996/10/29 13:57:22 adam
15 * Include of zebrautl.h instead of alexutil.h.
17 * Revision 1.13 1996/06/17 14:24:08 adam
18 * Bug fix: read_charset didn't handle character mapping.
20 * Revision 1.12 1996/06/04 10:20:02 adam
21 * Added support for character mapping.
23 * Revision 1.11 1996/01/08 19:15:24 adam
24 * Allow single $ in expressions.
26 * Revision 1.10 1996/01/08 09:09:17 adam
27 * Function dfa_parse got 'const' string argument.
28 * New functions to define char mappings made public.
30 * Revision 1.9 1995/12/06 12:24:58 adam
31 * Removed verbatim mode code.
33 * Revision 1.8 1995/12/06 09:09:58 adam
34 * Work on left and right anchors.
36 * Revision 1.7 1995/11/27 09:23:02 adam
37 * New berbatim hook in regular expressions. "[]n ..".
39 * Revision 1.6 1995/10/16 09:31:25 adam
42 * Revision 1.5 1995/10/02 15:17:58 adam
43 * Bug fix in dfa_delete.
45 * Revision 1.4 1995/09/28 09:18:52 adam
46 * Removed various preprocessor defines.
48 * Revision 1.3 1995/09/04 12:33:26 adam
49 * Various cleanup. YAZ util used instead.
51 * Revision 1.2 1995/01/25 11:30:50 adam
52 * Simple error reporting when parsing regular expressions.
53 * Memory usage reduced.
55 * Revision 1.1 1995/01/24 16:02:52 adam
56 * New private header file in dfa module (dfap.h).
57 * Module no longer uses yacc to parse regular expressions.
71 #define DFA_OPEN_RANGE 1
81 struct Tnode *p[2]; /* CAT,OR,STAR,PLUS (left,right) */
82 short ch[2]; /* if ch[0] >= 0 then this Tnode represents */
83 /* a character in range [ch[0]..ch[1]] */
84 /* otherwise ch[0] represents */
85 /* accepting no (-acceptno) */
87 unsigned pos : 15; /* CAT/OR/STAR/EPSILON or non-neg. position */
88 unsigned nullable : 1;
89 Set firstpos; /* first positions */
90 Set lastpos; /* last positions */
93 struct Tblock { /* Tnode bucket (block) */
94 struct Tblock *next; /* pointer to next bucket */
95 struct Tnode *tarray; /* array of nodes in bucket */
99 #define STATE_HASH 199
100 #define POSET_CHUNK 100
102 int debug_dfa_trav = 0;
103 int debug_dfa_tran = 0;
104 int debug_dfa_followpos = 0;
107 static struct DFA_parse *parse_info = NULL;
110 static int inside_string;
111 static const unsigned char *expr_ptr;
112 static struct Tnode **posar;
114 static SetType poset;
115 static Set *followpos;
117 static struct Tnode *mk_Tnode (void);
118 static struct Tnode *mk_Tnode_cset (BSet charset);
119 static void term_Tnode (void);
122 del_followpos (void),
125 mk_dfa_tran (struct DFA_states *dfas),
126 add_follow (Set lastpos, Set firstpos),
127 dfa_trav (struct Tnode *n),
128 init_followpos (void),
129 pr_tran (struct DFA_states *dfas),
130 pr_verbose (struct DFA_states *dfas),
140 *str_char (unsigned c);
156 static int lookahead;
157 static unsigned look_ch;
158 static BSet look_chars;
160 static struct Tnode *expr_1 (void),
165 static struct Tnode *expr_1 (void)
167 struct Tnode *t1, *t2, *tn;
169 if (!(t1 = expr_2 ()))
171 while (lookahead == L_ALT)
174 if (!(t2 = expr_2 ()))
186 static struct Tnode *expr_2 (void)
188 struct Tnode *t1, *t2, *tn;
190 if (!(t1 = expr_3 ()))
192 while (lookahead == L_WILD ||
193 lookahead == L_ANYZ ||
194 lookahead == L_ANY ||
196 lookahead == L_CHAR ||
197 lookahead == L_CHARS)
199 if (!(t2 = expr_3 ()))
211 static struct Tnode *expr_3 (void)
213 struct Tnode *t1, *tn;
215 if (!(t1 = expr_4 ()))
217 if (lookahead == L_CLOS0)
225 else if (lookahead == L_CLOS1)
233 else if (lookahead == L_QUEST)
239 tn->u.p[1] = mk_Tnode();
240 tn->u.p[1]->pos = EPSILON;
246 static struct Tnode *expr_4 (void)
254 if (!(t1 = expr_1 ()))
256 if (lookahead == L_RP)
263 t1->pos = ++parse_info->position;
264 t1->u.ch[1] = t1->u.ch[0] = look_ch;
268 t1 = mk_Tnode_cset (look_chars);
272 t1 = mk_Tnode_cset (parse_info->anyset);
278 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
279 t1->u.p[1] = mk_Tnode();
280 t1->u.p[1]->pos = EPSILON;
286 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
294 static void do_parse (struct DFA_parse *dfap, const char **s,
297 int start_anchor_flag = 0;
298 struct Tnode *t1, *t2, *tn;
302 expr_ptr = (const unsigned char *) *s;
306 if (lookahead == L_START)
308 start_anchor_flag = 1;
311 if (lookahead == L_END)
314 t1->pos = ++parse_info->position;
315 t1->u.ch[1] = t1->u.ch[0] = '\n';
321 if (t1 && lookahead == L_END)
324 t2->pos = ++parse_info->position;
325 t2->u.ch[1] = t2->u.ch[0] = '\n';
336 if (t1 && lookahead == 0)
339 t2->pos = ++parse_info->position;
340 t2->u.ch[0] = -(++parse_info->rule);
341 t2->u.ch[1] = start_anchor_flag ? 0 : -(parse_info->rule);
352 if (lookahead == L_RP)
353 err_code = DFA_ERR_RP;
354 else if (lookahead == L_LP)
355 err_code = DFA_ERR_LP;
357 err_code = DFA_ERR_SYNTAX;
360 *s = (const char *) expr_ptr;
363 static int nextchar (int *esc)
366 if (*expr_ptr == '\0')
368 else if (*expr_ptr != '\\')
397 static int nextchar_set (int *esc)
399 if (*expr_ptr == ' ')
404 return nextchar (esc);
407 static int read_charset (void)
409 int i, ch0, ch1, esc0, esc1, cc = 0;
410 look_chars = mk_BSet (&parse_info->charset);
411 res_BSet (parse_info->charset, look_chars);
413 ch0 = nextchar_set (&esc0);
414 if (!esc0 && ch0 == '^')
417 ch0 = nextchar_set (&esc0);
421 if (!esc0 && ch0 == ']')
423 if (parse_info->cmap)
427 const char *mcp = mapfrom;
429 mapto = (*parse_info->cmap)(&mcp, 1);
433 add_BSet (parse_info->charset, look_chars, ch0);
434 ch1 = nextchar_set (&esc1);
435 if (!esc1 && ch1 == '-')
438 if ((ch1 = nextchar_set (&esc1)) == 0)
441 if (!esc1 && ch1 == ']')
447 if (!esc1 && ch1 == ']')
449 add_BSet (parse_info->charset, look_chars, '-');
453 if (!open_range && parse_info->cmap)
457 const char *mcp = mapfrom;
459 mapto = (*parse_info->cmap) (&mcp, 1);
463 for (i=ch0; ++i<=ch1;)
464 add_BSet (parse_info->charset, look_chars, i);
466 ch0 = nextchar_set (&esc0);
477 com_BSet (parse_info->charset, look_chars);
481 static int map_l_char (void)
484 const char *cp0 = (const char *) (expr_ptr-1);
485 int i = 0, len = strlen(cp0);
487 if (cp0[0] == 1 && cp0[1])
493 if (!parse_info->cmap)
496 mapto = (*parse_info->cmap) (&cp0, len);
499 expr_ptr = (const unsigned char *) cp0;
500 look_ch = mapto[i][0];
501 logf (LOG_DEBUG, "map from %c to %d", expr_ptr[-1], look_ch);
505 static int lex_sub(void)
508 while ((look_ch = nextchar (&esc)) != 0)
512 return map_l_char ();
513 inside_string = !inside_string;
515 else if (esc || inside_string)
516 return map_l_char ();
517 else if (look_ch == '[')
518 return read_charset();
522 for (cc = parse_info->charMap; *cc; cc += 2)
529 return map_l_char ();
534 static void lex (void)
536 lookahead = lex_sub ();
539 static const char *str_char (unsigned c)
556 sprintf (s+1, "x%02x", c);
578 static void out_char (int c)
580 printf ("%s", str_char (c));
583 static void term_Tnode (void)
585 struct Tblock *t, *t1;
586 for (t = parse_info->start; (t1 = t);)
594 static struct Tnode *mk_Tnode (void)
598 if (parse_info->use_Tnode == parse_info->max_Tnode)
600 tnew = (struct Tblock *) imalloc (sizeof(struct Tblock));
601 tnew->tarray = (struct Tnode *) imalloc (TADD*sizeof(struct Tnode));
604 if (parse_info->use_Tnode == 0)
605 parse_info->start = tnew;
607 parse_info->end->next = tnew;
608 parse_info->end = tnew;
610 parse_info->max_Tnode += TADD;
612 return parse_info->end->tarray+(parse_info->use_Tnode++ % TADD);
615 struct Tnode *mk_Tnode_cset (BSet charset)
617 struct Tnode *tn1, *tn0 = mk_Tnode();
618 int ch1, ch0 = trav_BSet (parse_info->charset, charset, 0);
624 tn0->pos = ++parse_info->position;
625 ch1 = travi_BSet (parse_info->charset, charset, ch0+1) ;
627 tn0->u.ch[1] = parse_info->charset->size;
630 tn0->u.ch[1] = ch1-1;
631 while ((ch0=trav_BSet (parse_info->charset, charset, ch1)) != -1)
637 tn1 = tn0->u.p[1] = mk_Tnode();
639 tn1->pos = ++(parse_info->position);
640 if ((ch1 = travi_BSet (parse_info->charset, charset,
643 tn1->u.ch[1] = parse_info->charset->size;
646 tn1->u.ch[1] = ch1-1;
653 static void del_followpos (void)
658 static void init_pos (void)
660 posar = (struct Tnode **) imalloc (sizeof(struct Tnode*)
661 * (1+parse_info->position));
664 static void del_pos (void)
669 static void add_follow (Set lastpos, Set firstpos)
673 followpos[lastpos->value] =
674 union_Set (poset, followpos[lastpos->value], firstpos);
675 lastpos = lastpos->next;
679 static void dfa_trav (struct Tnode *n)
684 dfa_trav (n->u.p[0]);
685 dfa_trav (n->u.p[1]);
686 n->nullable = n->u.p[0]->nullable & n->u.p[1]->nullable;
687 n->firstpos = mk_Set (poset);
688 n->firstpos = union_Set (poset, n->firstpos, n->u.p[0]->firstpos);
689 if (n->u.p[0]->nullable)
690 n->firstpos = union_Set (poset, n->firstpos, n->u.p[1]->firstpos);
691 n->lastpos = mk_Set (poset);
692 n->lastpos = union_Set (poset, n->lastpos, n->u.p[1]->lastpos);
693 if (n->u.p[1]->nullable)
694 n->lastpos = union_Set (poset, n->lastpos, n->u.p[0]->lastpos);
696 add_follow (n->u.p[0]->lastpos, n->u.p[1]->firstpos);
698 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
699 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
700 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
701 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
706 dfa_trav (n->u.p[0]);
707 dfa_trav (n->u.p[1]);
708 n->nullable = n->u.p[0]->nullable | n->u.p[1]->nullable;
710 n->firstpos = merge_Set (poset, n->u.p[0]->firstpos,
711 n->u.p[1]->firstpos);
712 n->lastpos = merge_Set (poset, n->u.p[0]->lastpos,
714 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
715 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
716 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
717 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
722 dfa_trav (n->u.p[0]);
723 n->nullable = n->u.p[0]->nullable;
724 n->firstpos = n->u.p[0]->firstpos;
725 n->lastpos = n->u.p[0]->lastpos;
726 add_follow (n->lastpos, n->firstpos);
731 dfa_trav (n->u.p[0]);
733 n->firstpos = n->u.p[0]->firstpos;
734 n->lastpos = n->u.p[0]->lastpos;
735 add_follow (n->lastpos, n->firstpos);
741 n->lastpos = mk_Set (poset);
742 n->firstpos = mk_Set (poset);
749 n->firstpos = mk_Set (poset);
750 n->firstpos = add_Set (poset, n->firstpos, n->pos);
751 n->lastpos = mk_Set (poset);
752 n->lastpos = add_Set (poset, n->lastpos, n->pos);
755 printf ("#%d (n#%d)", -n->u.ch[0], -n->u.ch[1]);
756 else if (n->u.ch[1] > n->u.ch[0])
759 out_char (n->u.ch[0]);
760 if (n->u.ch[1] > n->u.ch[0]+1)
762 out_char (n->u.ch[1]);
766 out_char (n->u.ch[0]);
770 printf ("\n nullable : %c\n", n->nullable ? '1' : '0');
771 printf (" firstpos :");
772 pr_Set (poset, n->firstpos);
773 printf (" lastpos :");
774 pr_Set (poset, n->lastpos);
778 static void init_followpos (void)
782 followpos = fa = (Set *) imalloc (sizeof(Set) * (1+parse_info->position));
783 for (i = parse_info->position+1; --i >= 0; fa++)
784 *fa = mk_Set (poset);
787 static void mk_dfa_tran (struct DFA_states *dfas)
794 struct DFA_state *dfa_from, *dfa_to;
797 max_char = parse_info->charset->size;
798 pos = (short *) imalloc (sizeof(*pos) * (parse_info->position+1));
800 tran_set = cp_Set (poset, parse_info->root->firstpos);
801 i = add_DFA_state (dfas, &tran_set, &dfa_from);
803 dfa_from->rule_no = 0;
804 while ((dfa_from = get_DFA_state (dfas)))
808 for (tran_set = dfa_from->set; tran_set; tran_set = tran_set->next)
809 if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char)
810 *pos_i++ = tran_set->value;
815 c = posar[tran_set->value]->u.ch[1];
820 dfa_from->rule_no = -i;
821 dfa_from->rule_nno = -j;
823 for (char_1 = 0; char_1 <= max_char; char_1++)
826 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
827 if (posar[i]->u.ch[1] >= char_1
828 && (c=posar[i]->u.ch[0]) < char_0)
834 if (char_0 > max_char)
839 tran_set = mk_Set (poset);
840 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
842 if ((c=posar[i]->u.ch[0]) > char_0 && c <= char_1)
843 char_1 = c - 1; /* forward chunk */
844 else if ((c=posar[i]->u.ch[1]) >= char_0 && c < char_1)
845 char_1 = c; /* backward chunk */
846 if (posar[i]->u.ch[1] >= char_0 && posar[i]->u.ch[0] <= char_0)
847 tran_set = union_Set (poset, tran_set, followpos[i]);
851 add_DFA_state (dfas, &tran_set, &dfa_to);
852 add_DFA_tran (dfas, dfa_from, char_0, char_1, dfa_to->no);
857 sort_DFA_states (dfas);
860 static void pr_tran (struct DFA_states *dfas)
863 struct DFA_tran *tran;
864 int prev_no, i, c, no;
866 for (no=0; no < dfas->no; ++no)
868 s = dfas->sortarray[no];
869 assert (s->no == no);
870 printf ("DFA state %d", no);
872 printf ("#%d", s->rule_no);
874 printf (" #%d", s->rule_nno);
877 pr_Set (poset, s->set);
879 for (i=s->tran_no, tran=s->trans; --i >= 0; tran++)
881 if (prev_no != tran->to)
885 printf (" goto %d on [", tran->to);
888 for (c = tran->ch[0]; c <= tran->ch[1]; c++)
889 printf ("%s", str_char(c));
897 static void pr_verbose (struct DFA_states *dfas)
901 printf ("%d/%d tree nodes used, %d bytes each\n",
902 parse_info->use_Tnode, parse_info->max_Tnode, sizeof (struct Tnode));
903 k = inf_BSetHandle (parse_info->charset, &i, &j);
904 printf ("%ld/%ld character sets, %d bytes each\n",
905 i/k, j/k, k*sizeof(BSetWord));
906 k = inf_SetType (poset, &i, &j);
907 printf ("%ld/%ld poset items, %d bytes each\n", i, j, k);
908 printf ("%d DFA states\n", dfas->no);
911 static void pr_followpos (void)
914 printf ("\nfollowsets:\n");
915 for (i=1; i <= parse_info->position; i++)
918 pr_Set (poset, followpos[i]);
921 if (posar[i]->u.ch[0] < 0)
922 printf ("#%d", -posar[i]->u.ch[0]);
923 else if (posar[i]->u.ch[1] > posar[i]->u.ch[0])
926 out_char (posar[i]->u.ch[0]);
927 if (posar[i]->u.ch[1] > posar[i]->u.ch[0]+1)
929 out_char (posar[i]->u.ch[1]);
933 out_char (posar[i]->u.ch[0]);
939 void dfa_parse_cmap_clean (struct DFA *d)
941 struct DFA_parse *dfa = d->parse_info;
946 dfa->charMapSize = 7;
947 dfa->charMap = imalloc (dfa->charMapSize * sizeof(*dfa->charMap));
952 void dfa_parse_cmap_new (struct DFA *d, const int *cmap)
954 struct DFA_parse *dfa = d->parse_info;
959 for (cp = cmap; *cp; cp += 2)
961 size = cp - cmap + 1;
962 if (size > dfa->charMapSize)
965 ifree (dfa->charMap);
966 dfa->charMapSize = size;
967 dfa->charMap = imalloc (size * sizeof(*dfa->charMap));
969 memcpy (dfa->charMap, cmap, size * sizeof(*dfa->charMap));
972 void dfa_parse_cmap_del (struct DFA *d, int from)
974 struct DFA_parse *dfa = d->parse_info;
978 for (cc = dfa->charMap; *cc; cc += 2)
981 while ((cc[0] = cc[2]))
990 void dfa_parse_cmap_add (struct DFA *d, int from, int to)
992 struct DFA_parse *dfa = d->parse_info;
997 for (cc = dfa->charMap; *cc; cc += 2)
1003 indx = cc - dfa->charMap;
1004 size = dfa->charMapSize;
1007 int *cn = imalloc ((size+16) * sizeof(*dfa->charMap));
1008 memcpy (cn, dfa->charMap, indx*sizeof(*dfa->charMap));
1009 ifree (dfa->charMap);
1011 dfa->charMapSize = size+16;
1013 dfa->charMap[indx] = from;
1014 dfa->charMap[indx+1] = to;
1015 dfa->charMap[indx+2] = 0;
1018 void dfa_parse_cmap_thompson (struct DFA *d)
1020 static int thompson_chars[] =
1036 dfa_parse_cmap_new (d, thompson_chars);
1039 static struct DFA_parse *dfa_parse_init (void)
1041 parse_info = (struct DFA_parse *) imalloc (sizeof (struct DFA_parse));
1043 parse_info->charset = mk_BSetHandle (255, 20);
1044 parse_info->position = 0;
1045 parse_info->rule = 0;
1046 parse_info->root = NULL;
1048 parse_info->anyset = mk_BSet (&parse_info->charset);
1049 res_BSet (parse_info->charset, parse_info->anyset);
1050 add_BSet (parse_info->charset, parse_info->anyset, '\n');
1051 com_BSet (parse_info->charset, parse_info->anyset);
1052 parse_info->use_Tnode = parse_info->max_Tnode = 0;
1053 parse_info->charMap = NULL;
1054 parse_info->charMapSize = 0;
1055 parse_info->cmap = NULL;
1059 static void rm_dfa_parse (struct DFA_parse **dfap)
1062 assert (parse_info);
1064 rm_BSetHandle (&parse_info->charset);
1065 ifree (parse_info->charMap);
1070 static struct DFA_states *mk_dfas (struct DFA_parse *dfap, int poset_chunk)
1072 struct DFA_states *dfas;
1074 assert (poset_chunk > 10);
1078 poset = mk_SetType (poset_chunk);
1081 assert (parse_info->root);
1082 dfa_trav (parse_info->root);
1084 if (debug_dfa_followpos)
1086 init_DFA_states (&dfas, poset, STATE_HASH);
1098 struct DFA *dfa_init (void)
1102 dfa = imalloc (sizeof(*dfa));
1103 dfa->parse_info = dfa_parse_init ();
1104 dfa->state_info = NULL;
1106 dfa_parse_cmap_thompson (dfa);
1110 void dfa_set_cmap (struct DFA *dfa,
1111 const char **(*cmap)(const char **from, int len))
1113 dfa->parse_info->cmap = cmap;
1116 int dfa_parse (struct DFA *dfa, const char **pattern)
1121 assert (dfa->parse_info);
1122 do_parse (dfa->parse_info, pattern, &top);
1125 if ( !dfa->parse_info->root )
1126 dfa->parse_info->root = top;
1133 n->u.p[0] = dfa->parse_info->root;
1135 dfa->parse_info->root = n;
1140 void dfa_mkstate (struct DFA *dfa)
1144 dfa->state_info = mk_dfas (dfa->parse_info, POSET_CHUNK);
1145 dfa->no_states = dfa->state_info->no;
1146 dfa->states = dfa->state_info->sortarray;
1147 rm_dfa_parse (&dfa->parse_info);
1150 void dfa_delete (struct DFA **dfap)
1154 if ((*dfap)->parse_info)
1155 rm_dfa_parse (&(*dfap)->parse_info);
1156 if ((*dfap)->state_info)
1157 rm_DFA_states (&(*dfap)->state_info);