2 * Copyright (C) 1994, Index Data I/S
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.5 1995-10-02 15:17:58 adam
8 * Bug fix in dfa_delete.
10 * Revision 1.4 1995/09/28 09:18:52 adam
11 * Removed various preprocessor defines.
13 * Revision 1.3 1995/09/04 12:33:26 adam
14 * Various cleanup. YAZ util used instead.
16 * Revision 1.2 1995/01/25 11:30:50 adam
17 * Simple error reporting when parsing regular expressions.
18 * Memory usage reduced.
20 * Revision 1.1 1995/01/24 16:02:52 adam
21 * New private header file in dfa module (dfap.h).
22 * Module no longer uses yacc to parse regular expressions.
44 struct Tnode *p[2]; /* CAT,OR,STAR,PLUS (left,right) */
45 short ch[2]; /* if ch[0] >= 0 then this Tnode represents */
46 /* a character in range [ch[0]..ch[1]] */
47 /* otherwise ch[0] represents */
48 /* accepting no (-acceptno) */
50 unsigned pos : 15; /* CAT/OR/STAR/EPSILON or non-neg. position */
51 unsigned nullable : 1;
52 Set firstpos; /* first positions */
53 Set lastpos; /* last positions */
56 struct Tblock { /* Tnode bucket (block) */
57 struct Tblock *next; /* pointer to next bucket */
58 struct Tnode *tarray; /* array of nodes in bucket */
62 #define STATE_HASH 199
63 #define POSET_CHUNK 100
65 int debug_dfa_trav = 0;
66 int debug_dfa_tran = 0;
67 int debug_dfa_followpos = 0;
70 static struct DFA_parse *parse_info = NULL;
73 static int inside_string;
74 static const unsigned char *expr_ptr;
75 static unsigned short *ctrl_chars;
76 static struct Tnode **posar;
79 static Set *followpos;
81 static struct Tnode *mk_Tnode (void);
82 static struct Tnode *mk_Tnode_cset (BSet charset);
83 static void term_Tnode (void);
89 mk_dfa_tran (struct DFA_states *dfas),
90 add_follow (Set lastpos, Set firstpos),
91 dfa_trav (struct Tnode *n),
92 init_followpos (void),
93 mk_dfa_tran (struct DFA_states *dfas),
94 pr_tran (struct DFA_states *dfas),
95 pr_verbose (struct DFA_states *dfas),
105 *str_char (unsigned c);
121 static int lookahead;
122 static unsigned look_ch;
123 static BSet look_chars;
125 static struct Tnode *expr_1 (void),
130 static struct Tnode *expr_1 (void)
132 struct Tnode *t1, *t2, *tn;
134 if (!(t1 = expr_2 ()))
136 while (lookahead == L_ALT)
139 if (!(t2 = expr_2 ()))
151 static struct Tnode *expr_2 (void)
153 struct Tnode *t1, *t2, *tn;
155 if (!(t1 = expr_3 ()))
157 while (lookahead == L_WILD ||
158 lookahead == L_ANYZ ||
159 lookahead == L_ANY ||
161 lookahead == L_CHAR ||
162 lookahead == L_CHARS)
164 if (!(t2 = expr_3 ()))
176 static struct Tnode *expr_3 (void)
178 struct Tnode *t1, *tn;
180 if (!(t1 = expr_4 ()))
182 if (lookahead == L_CLOS0)
190 else if (lookahead == L_CLOS1)
198 else if (lookahead == L_QUEST)
204 tn->u.p[1] = mk_Tnode();
205 tn->u.p[1]->pos = EPSILON;
211 static struct Tnode *expr_4 (void)
219 if (!(t1 = expr_1 ()))
221 if (lookahead == L_RP)
228 t1->pos = ++(parse_info->position);
229 t1->u.ch[1] = t1->u.ch[0] = look_ch;
233 t1 = mk_Tnode_cset (look_chars);
237 t1 = mk_Tnode_cset (parse_info->anyset);
243 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
244 t1->u.p[1] = mk_Tnode();
245 t1->u.p[1]->pos = EPSILON;
251 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
259 static void do_parse (dfap, s, cc, tnp)
260 struct DFA_parse *dfap;
262 const unsigned short *cc;
266 struct Tnode *t1, *t2;
268 for (i=0; cc[i]; i +=2)
270 ctrl_chars = imalloc ((i+2) * sizeof(*ctrl_chars));
271 for (i=0; (ctrl_chars[i] = cc[i]); i ++)
275 ctrl_chars[i] = L_LP;
278 ctrl_chars[i] = L_RP;
281 ctrl_chars[i] = L_ANY;
284 ctrl_chars[i] = L_ALT;
287 ctrl_chars[i] = L_QUEST;
290 ctrl_chars[i] = L_CLOS1;
293 ctrl_chars[i] = L_CLOS0;
296 ctrl_chars[i] = L_END;
299 ctrl_chars[i] = L_START;
302 ctrl_chars[i] = L_ANY;
305 ctrl_chars[i] = L_ANYZ;
308 ctrl_chars[i] = L_WILD;
315 expr_ptr = (unsigned char *) *s;
320 if (t1 && lookahead == 0)
323 t2->pos = ++parse_info->position;
324 t2->u.ch[0] = -(++parse_info->rule);
335 if (lookahead == L_RP)
336 err_code = DFA_ERR_RP;
337 else if (lookahead == L_LP)
338 err_code = DFA_ERR_LP;
340 err_code = DFA_ERR_SYNTAX;
343 *s = (char *) expr_ptr;
347 static int nextchar (int *esc)
350 if (*expr_ptr == '\0' || isspace(*expr_ptr))
352 else if (*expr_ptr != '\\')
379 static int nextchar_set (int *esc)
381 if (*expr_ptr == ' ')
386 return nextchar (esc);
389 static int read_charset (void)
391 int i, ch0, ch1, esc0, esc1, cc = 0;
392 look_chars = mk_BSet (&parse_info->charset);
393 res_BSet (parse_info->charset, look_chars);
395 ch0 = nextchar_set (&esc0);
396 if (!esc0 && ch0 == '^')
399 ch0 = nextchar_set (&esc0);
403 if (!esc0 && ch0 == ']')
405 add_BSet (parse_info->charset, look_chars, ch0);
406 ch1 = nextchar_set (&esc1);
407 if (!esc1 && ch1 == '-')
409 if ((ch1 = nextchar_set (&esc1)) == 0)
411 if (!esc1 && ch1 == ']')
413 add_BSet (parse_info->charset, look_chars, '-');
416 for (i=ch0; ++i<=ch1;)
417 add_BSet (parse_info->charset, look_chars, i);
418 ch0 = nextchar_set (&esc0);
427 com_BSet (parse_info->charset, look_chars);
431 unsigned short dfa_thompson_chars[] =
445 unsigned short dfa_ccl_chars[] =
453 static int lex_sub(void)
456 const unsigned short *cc;
457 while ((look_ch = nextchar (&esc)) != 0)
462 inside_string = !inside_string;
464 else if (esc || inside_string)
466 else if (look_ch == '[')
467 return read_charset();
468 else if (look_ch == ' ')
472 for (cc = ctrl_chars; *cc; cc += 2)
480 static void lex (void)
482 lookahead = lex_sub ();
485 static const char *str_char (unsigned c)
502 sprintf (s+1, "x%02x", c);
524 static void out_char (int c)
526 printf ("%s", str_char (c));
529 static void term_Tnode (void)
531 struct Tblock *t, *t1;
532 for (t = parse_info->start; (t1 = t);)
540 static struct Tnode *mk_Tnode (void)
544 if (parse_info->use_Tnode == parse_info->max_Tnode)
546 tnew = (struct Tblock *) imalloc (sizeof(struct Tblock));
547 tnew->tarray = (struct Tnode *) imalloc (TADD*sizeof(struct Tnode));
550 if (parse_info->use_Tnode == 0)
551 parse_info->start = tnew;
553 parse_info->end->next = tnew;
554 parse_info->end = tnew;
556 parse_info->max_Tnode += TADD;
558 return parse_info->end->tarray+(parse_info->use_Tnode++ % TADD);
561 struct Tnode *mk_Tnode_cset (BSet charset)
563 struct Tnode *tn1, *tn0 = mk_Tnode();
564 int ch1, ch0 = trav_BSet (parse_info->charset, charset, 0);
570 tn0->pos = ++parse_info->position;
571 ch1 = travi_BSet (parse_info->charset, charset, ch0+1) ;
573 tn0->u.ch[1] = parse_info->charset->size;
576 tn0->u.ch[1] = ch1-1;
577 while ((ch0=trav_BSet (parse_info->charset, charset, ch1)) != -1)
583 tn1 = tn0->u.p[1] = mk_Tnode();
585 tn1->pos = ++(parse_info->position);
586 if ((ch1 = travi_BSet (parse_info->charset, charset,
589 tn1->u.ch[1] = parse_info->charset->size;
592 tn1->u.ch[1] = ch1-1;
599 static void del_followpos (void)
604 static void init_pos (void)
606 posar = (struct Tnode **) imalloc (sizeof(struct Tnode*)
607 * (1+parse_info->position));
610 static void del_pos (void)
615 static void add_follow (Set lastpos, Set firstpos)
619 followpos[lastpos->value] =
620 union_Set (poset, followpos[lastpos->value], firstpos);
621 lastpos = lastpos->next;
625 static void dfa_trav (struct Tnode *n)
630 dfa_trav (n->u.p[0]);
631 dfa_trav (n->u.p[1]);
632 n->nullable = n->u.p[0]->nullable & n->u.p[1]->nullable;
633 n->firstpos = mk_Set (poset);
634 n->firstpos = union_Set (poset, n->firstpos, n->u.p[0]->firstpos);
635 if (n->u.p[0]->nullable)
636 n->firstpos = union_Set (poset, n->firstpos, n->u.p[1]->firstpos);
637 n->lastpos = mk_Set (poset);
638 n->lastpos = union_Set (poset, n->lastpos, n->u.p[1]->lastpos);
639 if (n->u.p[1]->nullable)
640 n->lastpos = union_Set (poset, n->lastpos, n->u.p[0]->lastpos);
642 add_follow (n->u.p[0]->lastpos, n->u.p[1]->firstpos);
644 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
645 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
646 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
647 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
652 dfa_trav (n->u.p[0]);
653 dfa_trav (n->u.p[1]);
654 n->nullable = n->u.p[0]->nullable | n->u.p[1]->nullable;
656 n->firstpos = merge_Set (poset, n->u.p[0]->firstpos,
657 n->u.p[1]->firstpos);
658 n->lastpos = merge_Set (poset, n->u.p[0]->lastpos,
660 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
661 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
662 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
663 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
668 dfa_trav (n->u.p[0]);
669 n->nullable = n->u.p[0]->nullable;
670 n->firstpos = n->u.p[0]->firstpos;
671 n->lastpos = n->u.p[0]->lastpos;
672 add_follow (n->lastpos, n->firstpos);
677 dfa_trav (n->u.p[0]);
679 n->firstpos = n->u.p[0]->firstpos;
680 n->lastpos = n->u.p[0]->lastpos;
681 add_follow (n->lastpos, n->firstpos);
687 n->lastpos = mk_Set (poset);
688 n->firstpos = mk_Set (poset);
695 n->firstpos = mk_Set (poset);
696 n->firstpos = add_Set (poset, n->firstpos, n->pos);
697 n->lastpos = mk_Set (poset);
698 n->lastpos = add_Set (poset, n->lastpos, n->pos);
701 printf ("#%d", -n->u.ch[0]);
702 else if (n->u.ch[1] > n->u.ch[0])
705 out_char (n->u.ch[0]);
706 if (n->u.ch[1] > n->u.ch[0]+1)
708 out_char (n->u.ch[1]);
712 out_char (n->u.ch[0]);
716 printf ("\n nullable : %c\n", n->nullable ? '1' : '0');
717 printf (" firstpos :");
718 pr_Set (poset, n->firstpos);
719 printf (" lastpos :");
720 pr_Set (poset, n->lastpos);
724 static void init_followpos (void)
728 followpos = fa = (Set *) imalloc (sizeof(Set) * (1+parse_info->position));
729 for (i = parse_info->position+1; --i >= 0; fa++)
730 *fa = mk_Set (poset);
733 static void mk_dfa_tran (struct DFA_states *dfas)
740 struct DFA_state *dfa_from, *dfa_to;
743 max_char = parse_info->charset->size;
744 pos = (short *) imalloc (sizeof(*pos) * (parse_info->position+1));
746 tran_set = cp_Set (poset, parse_info->root->firstpos);
747 i = add_DFA_state (dfas, &tran_set, &dfa_from);
749 dfa_from->rule_no = 0;
750 while ((dfa_from = get_DFA_state (dfas)))
754 for (tran_set = dfa_from->set; tran_set; tran_set = tran_set->next)
755 if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char)
756 *pos_i++ = tran_set->value;
757 else if (c < 0 && (i == 0 || c > i))
760 dfa_from->rule_no = -i;
762 for (char_1 = 0; char_1 <= max_char; char_1++)
765 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
766 if (posar[i]->u.ch[1] >= char_1)
767 if ((c=posar[i]->u.ch[0]) < char_0)
774 if (char_0 > max_char)
776 tran_set = mk_Set (poset);
777 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
778 if ((c=posar[i]->u.ch[1]) >= char_0)
779 if (posar[i]->u.ch[0] <= char_0)
783 tran_set = union_Set (poset, tran_set, followpos[i]);
785 else if (c <= char_1)
789 add_DFA_state (dfas, &tran_set, &dfa_to);
790 add_DFA_tran (dfas, dfa_from, char_0, char_1, dfa_to->no);
795 sort_DFA_states (dfas);
798 static void pr_tran (struct DFA_states *dfas)
801 struct DFA_tran *tran;
802 int prev_no, i, c, no;
804 for (no=0; no < dfas->no; ++no)
806 s = dfas->sortarray[no];
807 assert (s->no == no);
808 printf ("DFA state %d", no);
810 printf ("#%d", s->rule_no);
812 pr_Set (poset, s->set);
814 for (i=s->tran_no, tran=s->trans; --i >= 0; tran++)
816 if (prev_no != tran->to)
820 printf (" goto %d on [", tran->to);
823 for (c = tran->ch[0]; c <= tran->ch[1]; c++)
824 printf ("%s", str_char(c));
832 static void pr_verbose (struct DFA_states *dfas)
836 printf ("%d/%d tree nodes used, %d bytes each\n",
837 parse_info->use_Tnode, parse_info->max_Tnode, sizeof (struct Tnode));
838 k = inf_BSetHandle (parse_info->charset, &i, &j);
839 printf ("%ld/%ld character sets, %d bytes each\n",
840 i/k, j/k, k*sizeof(BSetWord));
841 k = inf_SetType (poset, &i, &j);
842 printf ("%ld/%ld poset items, %d bytes each\n", i, j, k);
843 printf ("%d DFA states\n", dfas->no);
846 static void pr_followpos (void)
849 printf ("\nfollowsets:\n");
850 for (i=1; i <= parse_info->position; i++)
853 pr_Set (poset, followpos[i]);
856 if (posar[i]->u.ch[0] < 0)
857 printf ("#%d", -posar[i]->u.ch[0]);
858 else if (posar[i]->u.ch[1] > posar[i]->u.ch[0])
861 out_char (posar[i]->u.ch[0]);
862 if (posar[i]->u.ch[1] > posar[i]->u.ch[0]+1)
864 out_char (posar[i]->u.ch[1]);
868 out_char (posar[i]->u.ch[0]);
874 static struct DFA_parse *dfa_parse_init (void)
876 parse_info = (struct DFA_parse *) imalloc (sizeof (struct DFA_parse));
878 parse_info->charset = mk_BSetHandle (255, 20);
879 parse_info->position = 0;
880 parse_info->rule = 0;
881 parse_info->root = NULL;
883 parse_info->anyset = mk_BSet (&parse_info->charset);
884 res_BSet (parse_info->charset, parse_info->anyset);
885 add_BSet (parse_info->charset, parse_info->anyset, '\n');
886 com_BSet (parse_info->charset, parse_info->anyset);
887 parse_info->use_Tnode = parse_info->max_Tnode = 0;
891 static void rm_dfa_parse (struct DFA_parse **dfap)
896 rm_BSetHandle (&parse_info->charset);
901 static struct DFA_states *mk_dfas (struct DFA_parse *dfap, int poset_chunk)
903 struct DFA_states *dfas;
905 assert (poset_chunk > 10);
909 poset = mk_SetType (poset_chunk);
912 dfa_trav (parse_info->root);
914 if (debug_dfa_followpos)
916 init_DFA_states (&dfas, poset, STATE_HASH);
928 struct DFA *dfa_init (void)
932 dfa = imalloc (sizeof(*dfa));
933 dfa->parse_info = dfa_parse_init ();
934 dfa->state_info = NULL;
939 int dfa_parse (struct DFA *dfa, char **pattern)
944 do_parse (dfa->parse_info, pattern, dfa_thompson_chars, &top);
947 if ( !dfa->parse_info->root )
948 dfa->parse_info->root = top;
955 n->u.p[0] = dfa->parse_info->root;
957 dfa->parse_info->root = n;
962 void dfa_mkstate (struct DFA *dfa)
966 dfa->state_info = mk_dfas (dfa->parse_info, POSET_CHUNK);
967 dfa->no_states = dfa->state_info->no;
968 dfa->states = dfa->state_info->sortarray;
969 rm_dfa_parse (&dfa->parse_info);
972 void dfa_delete (struct DFA **dfap)
976 if ((*dfap)->parse_info)
977 rm_dfa_parse (&(*dfap)->parse_info);
978 if ((*dfap)->state_info)
979 rm_DFA_states (&(*dfap)->state_info);