2 * Copyright (C) 1994, Index Data I/S
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.6 1995-10-16 09:31:25 adam
10 * Revision 1.5 1995/10/02 15:17:58 adam
11 * Bug fix in dfa_delete.
13 * Revision 1.4 1995/09/28 09:18:52 adam
14 * Removed various preprocessor defines.
16 * Revision 1.3 1995/09/04 12:33:26 adam
17 * Various cleanup. YAZ util used instead.
19 * Revision 1.2 1995/01/25 11:30:50 adam
20 * Simple error reporting when parsing regular expressions.
21 * Memory usage reduced.
23 * Revision 1.1 1995/01/24 16:02:52 adam
24 * New private header file in dfa module (dfap.h).
25 * Module no longer uses yacc to parse regular expressions.
47 struct Tnode *p[2]; /* CAT,OR,STAR,PLUS (left,right) */
48 short ch[2]; /* if ch[0] >= 0 then this Tnode represents */
49 /* a character in range [ch[0]..ch[1]] */
50 /* otherwise ch[0] represents */
51 /* accepting no (-acceptno) */
53 unsigned pos : 15; /* CAT/OR/STAR/EPSILON or non-neg. position */
54 unsigned nullable : 1;
55 Set firstpos; /* first positions */
56 Set lastpos; /* last positions */
59 struct Tblock { /* Tnode bucket (block) */
60 struct Tblock *next; /* pointer to next bucket */
61 struct Tnode *tarray; /* array of nodes in bucket */
65 #define STATE_HASH 199
66 #define POSET_CHUNK 100
68 int debug_dfa_trav = 0;
69 int debug_dfa_tran = 0;
70 int debug_dfa_followpos = 0;
73 static struct DFA_parse *parse_info = NULL;
76 static int inside_string;
77 static const unsigned char *expr_ptr;
78 static unsigned short *ctrl_chars;
79 static struct Tnode **posar;
82 static Set *followpos;
84 static struct Tnode *mk_Tnode (void);
85 static struct Tnode *mk_Tnode_cset (BSet charset);
86 static void term_Tnode (void);
92 mk_dfa_tran (struct DFA_states *dfas),
93 add_follow (Set lastpos, Set firstpos),
94 dfa_trav (struct Tnode *n),
95 init_followpos (void),
96 mk_dfa_tran (struct DFA_states *dfas),
97 pr_tran (struct DFA_states *dfas),
98 pr_verbose (struct DFA_states *dfas),
108 *str_char (unsigned c);
124 static int lookahead;
125 static unsigned look_ch;
126 static BSet look_chars;
128 static struct Tnode *expr_1 (void),
133 static struct Tnode *expr_1 (void)
135 struct Tnode *t1, *t2, *tn;
137 if (!(t1 = expr_2 ()))
139 while (lookahead == L_ALT)
142 if (!(t2 = expr_2 ()))
154 static struct Tnode *expr_2 (void)
156 struct Tnode *t1, *t2, *tn;
158 if (!(t1 = expr_3 ()))
160 while (lookahead == L_WILD ||
161 lookahead == L_ANYZ ||
162 lookahead == L_ANY ||
164 lookahead == L_CHAR ||
165 lookahead == L_CHARS)
167 if (!(t2 = expr_3 ()))
179 static struct Tnode *expr_3 (void)
181 struct Tnode *t1, *tn;
183 if (!(t1 = expr_4 ()))
185 if (lookahead == L_CLOS0)
193 else if (lookahead == L_CLOS1)
201 else if (lookahead == L_QUEST)
207 tn->u.p[1] = mk_Tnode();
208 tn->u.p[1]->pos = EPSILON;
214 static struct Tnode *expr_4 (void)
222 if (!(t1 = expr_1 ()))
224 if (lookahead == L_RP)
231 t1->pos = ++(parse_info->position);
232 t1->u.ch[1] = t1->u.ch[0] = look_ch;
236 t1 = mk_Tnode_cset (look_chars);
240 t1 = mk_Tnode_cset (parse_info->anyset);
246 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
247 t1->u.p[1] = mk_Tnode();
248 t1->u.p[1]->pos = EPSILON;
254 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
262 static void do_parse (dfap, s, cc, tnp)
263 struct DFA_parse *dfap;
265 const unsigned short *cc;
269 struct Tnode *t1, *t2;
271 for (i=0; cc[i]; i +=2)
273 ctrl_chars = imalloc ((i+2) * sizeof(*ctrl_chars));
274 for (i=0; (ctrl_chars[i] = cc[i]); i ++)
278 ctrl_chars[i] = L_LP;
281 ctrl_chars[i] = L_RP;
284 ctrl_chars[i] = L_ANY;
287 ctrl_chars[i] = L_ALT;
290 ctrl_chars[i] = L_QUEST;
293 ctrl_chars[i] = L_CLOS1;
296 ctrl_chars[i] = L_CLOS0;
299 ctrl_chars[i] = L_END;
302 ctrl_chars[i] = L_START;
305 ctrl_chars[i] = L_ANY;
308 ctrl_chars[i] = L_ANYZ;
311 ctrl_chars[i] = L_WILD;
318 expr_ptr = (unsigned char *) *s;
323 if (t1 && lookahead == 0)
326 t2->pos = ++parse_info->position;
327 t2->u.ch[0] = -(++parse_info->rule);
338 if (lookahead == L_RP)
339 err_code = DFA_ERR_RP;
340 else if (lookahead == L_LP)
341 err_code = DFA_ERR_LP;
343 err_code = DFA_ERR_SYNTAX;
346 *s = (char *) expr_ptr;
350 static int nextchar (int *esc)
353 if (*expr_ptr == '\0' || isspace(*expr_ptr))
355 else if (*expr_ptr != '\\')
382 static int nextchar_set (int *esc)
384 if (*expr_ptr == ' ')
389 return nextchar (esc);
392 static int read_charset (void)
394 int i, ch0, ch1, esc0, esc1, cc = 0;
395 look_chars = mk_BSet (&parse_info->charset);
396 res_BSet (parse_info->charset, look_chars);
398 ch0 = nextchar_set (&esc0);
399 if (!esc0 && ch0 == '^')
402 ch0 = nextchar_set (&esc0);
406 if (!esc0 && ch0 == ']')
408 add_BSet (parse_info->charset, look_chars, ch0);
409 ch1 = nextchar_set (&esc1);
410 if (!esc1 && ch1 == '-')
412 if ((ch1 = nextchar_set (&esc1)) == 0)
414 if (!esc1 && ch1 == ']')
416 add_BSet (parse_info->charset, look_chars, '-');
419 for (i=ch0; ++i<=ch1;)
420 add_BSet (parse_info->charset, look_chars, i);
421 ch0 = nextchar_set (&esc0);
430 com_BSet (parse_info->charset, look_chars);
434 unsigned short dfa_thompson_chars[] =
448 unsigned short dfa_ccl_chars[] =
456 static int lex_sub(void)
459 const unsigned short *cc;
460 while ((look_ch = nextchar (&esc)) != 0)
465 inside_string = !inside_string;
467 else if (esc || inside_string)
469 else if (look_ch == '[')
470 return read_charset();
471 else if (look_ch == ' ')
475 for (cc = ctrl_chars; *cc; cc += 2)
483 static void lex (void)
485 lookahead = lex_sub ();
488 static const char *str_char (unsigned c)
505 sprintf (s+1, "x%02x", c);
527 static void out_char (int c)
529 printf ("%s", str_char (c));
532 static void term_Tnode (void)
534 struct Tblock *t, *t1;
535 for (t = parse_info->start; (t1 = t);)
543 static struct Tnode *mk_Tnode (void)
547 if (parse_info->use_Tnode == parse_info->max_Tnode)
549 tnew = (struct Tblock *) imalloc (sizeof(struct Tblock));
550 tnew->tarray = (struct Tnode *) imalloc (TADD*sizeof(struct Tnode));
553 if (parse_info->use_Tnode == 0)
554 parse_info->start = tnew;
556 parse_info->end->next = tnew;
557 parse_info->end = tnew;
559 parse_info->max_Tnode += TADD;
561 return parse_info->end->tarray+(parse_info->use_Tnode++ % TADD);
564 struct Tnode *mk_Tnode_cset (BSet charset)
566 struct Tnode *tn1, *tn0 = mk_Tnode();
567 int ch1, ch0 = trav_BSet (parse_info->charset, charset, 0);
573 tn0->pos = ++parse_info->position;
574 ch1 = travi_BSet (parse_info->charset, charset, ch0+1) ;
576 tn0->u.ch[1] = parse_info->charset->size;
579 tn0->u.ch[1] = ch1-1;
580 while ((ch0=trav_BSet (parse_info->charset, charset, ch1)) != -1)
586 tn1 = tn0->u.p[1] = mk_Tnode();
588 tn1->pos = ++(parse_info->position);
589 if ((ch1 = travi_BSet (parse_info->charset, charset,
592 tn1->u.ch[1] = parse_info->charset->size;
595 tn1->u.ch[1] = ch1-1;
602 static void del_followpos (void)
607 static void init_pos (void)
609 posar = (struct Tnode **) imalloc (sizeof(struct Tnode*)
610 * (1+parse_info->position));
613 static void del_pos (void)
618 static void add_follow (Set lastpos, Set firstpos)
622 followpos[lastpos->value] =
623 union_Set (poset, followpos[lastpos->value], firstpos);
624 lastpos = lastpos->next;
628 static void dfa_trav (struct Tnode *n)
633 dfa_trav (n->u.p[0]);
634 dfa_trav (n->u.p[1]);
635 n->nullable = n->u.p[0]->nullable & n->u.p[1]->nullable;
636 n->firstpos = mk_Set (poset);
637 n->firstpos = union_Set (poset, n->firstpos, n->u.p[0]->firstpos);
638 if (n->u.p[0]->nullable)
639 n->firstpos = union_Set (poset, n->firstpos, n->u.p[1]->firstpos);
640 n->lastpos = mk_Set (poset);
641 n->lastpos = union_Set (poset, n->lastpos, n->u.p[1]->lastpos);
642 if (n->u.p[1]->nullable)
643 n->lastpos = union_Set (poset, n->lastpos, n->u.p[0]->lastpos);
645 add_follow (n->u.p[0]->lastpos, n->u.p[1]->firstpos);
647 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
648 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
649 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
650 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
655 dfa_trav (n->u.p[0]);
656 dfa_trav (n->u.p[1]);
657 n->nullable = n->u.p[0]->nullable | n->u.p[1]->nullable;
659 n->firstpos = merge_Set (poset, n->u.p[0]->firstpos,
660 n->u.p[1]->firstpos);
661 n->lastpos = merge_Set (poset, n->u.p[0]->lastpos,
663 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
664 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
665 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
666 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
671 dfa_trav (n->u.p[0]);
672 n->nullable = n->u.p[0]->nullable;
673 n->firstpos = n->u.p[0]->firstpos;
674 n->lastpos = n->u.p[0]->lastpos;
675 add_follow (n->lastpos, n->firstpos);
680 dfa_trav (n->u.p[0]);
682 n->firstpos = n->u.p[0]->firstpos;
683 n->lastpos = n->u.p[0]->lastpos;
684 add_follow (n->lastpos, n->firstpos);
690 n->lastpos = mk_Set (poset);
691 n->firstpos = mk_Set (poset);
698 n->firstpos = mk_Set (poset);
699 n->firstpos = add_Set (poset, n->firstpos, n->pos);
700 n->lastpos = mk_Set (poset);
701 n->lastpos = add_Set (poset, n->lastpos, n->pos);
704 printf ("#%d", -n->u.ch[0]);
705 else if (n->u.ch[1] > n->u.ch[0])
708 out_char (n->u.ch[0]);
709 if (n->u.ch[1] > n->u.ch[0]+1)
711 out_char (n->u.ch[1]);
715 out_char (n->u.ch[0]);
719 printf ("\n nullable : %c\n", n->nullable ? '1' : '0');
720 printf (" firstpos :");
721 pr_Set (poset, n->firstpos);
722 printf (" lastpos :");
723 pr_Set (poset, n->lastpos);
727 static void init_followpos (void)
731 followpos = fa = (Set *) imalloc (sizeof(Set) * (1+parse_info->position));
732 for (i = parse_info->position+1; --i >= 0; fa++)
733 *fa = mk_Set (poset);
736 static void mk_dfa_tran (struct DFA_states *dfas)
743 struct DFA_state *dfa_from, *dfa_to;
746 max_char = parse_info->charset->size;
747 pos = (short *) imalloc (sizeof(*pos) * (parse_info->position+1));
749 tran_set = cp_Set (poset, parse_info->root->firstpos);
750 i = add_DFA_state (dfas, &tran_set, &dfa_from);
752 dfa_from->rule_no = 0;
753 while ((dfa_from = get_DFA_state (dfas)))
757 for (tran_set = dfa_from->set; tran_set; tran_set = tran_set->next)
758 if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char)
759 *pos_i++ = tran_set->value;
760 else if (c < 0 && (i == 0 || c > i))
763 dfa_from->rule_no = -i;
765 for (char_1 = 0; char_1 <= max_char; char_1++)
768 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
769 if (posar[i]->u.ch[1] >= char_1
770 && (c=posar[i]->u.ch[0]) < char_0)
776 if (char_0 > max_char)
781 tran_set = mk_Set (poset);
782 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
784 if ((c=posar[i]->u.ch[0]) > char_0 && c <= char_1)
785 char_1 = c - 1; /* forward chunk */
786 else if ((c=posar[i]->u.ch[1]) >= char_0 && c < char_1)
787 char_1 = c; /* backward chunk */
788 if (posar[i]->u.ch[1] >= char_0 && posar[i]->u.ch[0] <= char_0)
789 tran_set = union_Set (poset, tran_set, followpos[i]);
793 add_DFA_state (dfas, &tran_set, &dfa_to);
794 add_DFA_tran (dfas, dfa_from, char_0, char_1, dfa_to->no);
799 sort_DFA_states (dfas);
802 static void pr_tran (struct DFA_states *dfas)
805 struct DFA_tran *tran;
806 int prev_no, i, c, no;
808 for (no=0; no < dfas->no; ++no)
810 s = dfas->sortarray[no];
811 assert (s->no == no);
812 printf ("DFA state %d", no);
814 printf ("#%d", s->rule_no);
816 pr_Set (poset, s->set);
818 for (i=s->tran_no, tran=s->trans; --i >= 0; tran++)
820 if (prev_no != tran->to)
824 printf (" goto %d on [", tran->to);
827 for (c = tran->ch[0]; c <= tran->ch[1]; c++)
828 printf ("%s", str_char(c));
836 static void pr_verbose (struct DFA_states *dfas)
840 printf ("%d/%d tree nodes used, %d bytes each\n",
841 parse_info->use_Tnode, parse_info->max_Tnode, sizeof (struct Tnode));
842 k = inf_BSetHandle (parse_info->charset, &i, &j);
843 printf ("%ld/%ld character sets, %d bytes each\n",
844 i/k, j/k, k*sizeof(BSetWord));
845 k = inf_SetType (poset, &i, &j);
846 printf ("%ld/%ld poset items, %d bytes each\n", i, j, k);
847 printf ("%d DFA states\n", dfas->no);
850 static void pr_followpos (void)
853 printf ("\nfollowsets:\n");
854 for (i=1; i <= parse_info->position; i++)
857 pr_Set (poset, followpos[i]);
860 if (posar[i]->u.ch[0] < 0)
861 printf ("#%d", -posar[i]->u.ch[0]);
862 else if (posar[i]->u.ch[1] > posar[i]->u.ch[0])
865 out_char (posar[i]->u.ch[0]);
866 if (posar[i]->u.ch[1] > posar[i]->u.ch[0]+1)
868 out_char (posar[i]->u.ch[1]);
872 out_char (posar[i]->u.ch[0]);
878 static struct DFA_parse *dfa_parse_init (void)
880 parse_info = (struct DFA_parse *) imalloc (sizeof (struct DFA_parse));
882 parse_info->charset = mk_BSetHandle (255, 20);
883 parse_info->position = 0;
884 parse_info->rule = 0;
885 parse_info->root = NULL;
887 parse_info->anyset = mk_BSet (&parse_info->charset);
888 res_BSet (parse_info->charset, parse_info->anyset);
889 add_BSet (parse_info->charset, parse_info->anyset, '\n');
890 com_BSet (parse_info->charset, parse_info->anyset);
891 parse_info->use_Tnode = parse_info->max_Tnode = 0;
895 static void rm_dfa_parse (struct DFA_parse **dfap)
900 rm_BSetHandle (&parse_info->charset);
905 static struct DFA_states *mk_dfas (struct DFA_parse *dfap, int poset_chunk)
907 struct DFA_states *dfas;
909 assert (poset_chunk > 10);
913 poset = mk_SetType (poset_chunk);
916 dfa_trav (parse_info->root);
918 if (debug_dfa_followpos)
920 init_DFA_states (&dfas, poset, STATE_HASH);
932 struct DFA *dfa_init (void)
936 dfa = imalloc (sizeof(*dfa));
937 dfa->parse_info = dfa_parse_init ();
938 dfa->state_info = NULL;
943 int dfa_parse (struct DFA *dfa, char **pattern)
948 do_parse (dfa->parse_info, pattern, dfa_thompson_chars, &top);
951 if ( !dfa->parse_info->root )
952 dfa->parse_info->root = top;
959 n->u.p[0] = dfa->parse_info->root;
961 dfa->parse_info->root = n;
966 void dfa_mkstate (struct DFA *dfa)
970 dfa->state_info = mk_dfas (dfa->parse_info, POSET_CHUNK);
971 dfa->no_states = dfa->state_info->no;
972 dfa->states = dfa->state_info->sortarray;
973 rm_dfa_parse (&dfa->parse_info);
976 void dfa_delete (struct DFA **dfap)
980 if ((*dfap)->parse_info)
981 rm_dfa_parse (&(*dfap)->parse_info);
982 if ((*dfap)->state_info)
983 rm_DFA_states (&(*dfap)->state_info);