2 * Copyright (C) 1994, Index Data I/S
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.9 1995-12-06 12:24:58 adam
8 * Removed verbatim mode code.
10 * Revision 1.8 1995/12/06 09:09:58 adam
11 * Work on left and right anchors.
13 * Revision 1.7 1995/11/27 09:23:02 adam
14 * New berbatim hook in regular expressions. "[]n ..".
16 * Revision 1.6 1995/10/16 09:31:25 adam
19 * Revision 1.5 1995/10/02 15:17:58 adam
20 * Bug fix in dfa_delete.
22 * Revision 1.4 1995/09/28 09:18:52 adam
23 * Removed various preprocessor defines.
25 * Revision 1.3 1995/09/04 12:33:26 adam
26 * Various cleanup. YAZ util used instead.
28 * Revision 1.2 1995/01/25 11:30:50 adam
29 * Simple error reporting when parsing regular expressions.
30 * Memory usage reduced.
32 * Revision 1.1 1995/01/24 16:02:52 adam
33 * New private header file in dfa module (dfap.h).
34 * Module no longer uses yacc to parse regular expressions.
56 struct Tnode *p[2]; /* CAT,OR,STAR,PLUS (left,right) */
57 short ch[2]; /* if ch[0] >= 0 then this Tnode represents */
58 /* a character in range [ch[0]..ch[1]] */
59 /* otherwise ch[0] represents */
60 /* accepting no (-acceptno) */
62 unsigned pos : 15; /* CAT/OR/STAR/EPSILON or non-neg. position */
63 unsigned nullable : 1;
64 Set firstpos; /* first positions */
65 Set lastpos; /* last positions */
68 struct Tblock { /* Tnode bucket (block) */
69 struct Tblock *next; /* pointer to next bucket */
70 struct Tnode *tarray; /* array of nodes in bucket */
74 #define STATE_HASH 199
75 #define POSET_CHUNK 100
77 int debug_dfa_trav = 0;
78 int debug_dfa_tran = 0;
79 int debug_dfa_followpos = 0;
82 static struct DFA_parse *parse_info = NULL;
85 static int inside_string;
86 static const unsigned char *expr_ptr;
87 static unsigned short *ctrl_chars;
88 static struct Tnode **posar;
91 static Set *followpos;
93 static struct Tnode *mk_Tnode (void);
94 static struct Tnode *mk_Tnode_cset (BSet charset);
95 static void term_Tnode (void);
101 mk_dfa_tran (struct DFA_states *dfas),
102 add_follow (Set lastpos, Set firstpos),
103 dfa_trav (struct Tnode *n),
104 init_followpos (void),
105 pr_tran (struct DFA_states *dfas),
106 pr_verbose (struct DFA_states *dfas),
116 *str_char (unsigned c);
132 static int lookahead;
133 static unsigned look_ch;
134 static BSet look_chars;
136 static struct Tnode *expr_1 (void),
141 static struct Tnode *expr_1 (void)
143 struct Tnode *t1, *t2, *tn;
145 if (!(t1 = expr_2 ()))
147 while (lookahead == L_ALT)
150 if (!(t2 = expr_2 ()))
162 static struct Tnode *expr_2 (void)
164 struct Tnode *t1, *t2, *tn;
166 if (!(t1 = expr_3 ()))
168 while (lookahead == L_WILD ||
169 lookahead == L_ANYZ ||
170 lookahead == L_ANY ||
172 lookahead == L_CHAR ||
173 lookahead == L_CHARS)
175 if (!(t2 = expr_3 ()))
187 static struct Tnode *expr_3 (void)
189 struct Tnode *t1, *tn;
191 if (!(t1 = expr_4 ()))
193 if (lookahead == L_CLOS0)
201 else if (lookahead == L_CLOS1)
209 else if (lookahead == L_QUEST)
215 tn->u.p[1] = mk_Tnode();
216 tn->u.p[1]->pos = EPSILON;
222 static struct Tnode *expr_4 (void)
230 if (!(t1 = expr_1 ()))
232 if (lookahead == L_RP)
239 t1->pos = ++parse_info->position;
240 t1->u.ch[1] = t1->u.ch[0] = look_ch;
244 t1 = mk_Tnode_cset (look_chars);
248 t1 = mk_Tnode_cset (parse_info->anyset);
254 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
255 t1->u.p[1] = mk_Tnode();
256 t1->u.p[1]->pos = EPSILON;
262 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
270 static void do_parse (struct DFA_parse *dfap, char **s,
271 const unsigned short *cc, struct Tnode **tnp)
275 struct Tnode *t1, *t2, *tn;
277 for (i=0; cc[i]; i +=2)
279 ctrl_chars = imalloc ((i+2) * sizeof(*ctrl_chars));
280 for (i=0; (ctrl_chars[i] = cc[i]); i ++)
284 ctrl_chars[i] = L_LP;
287 ctrl_chars[i] = L_RP;
290 ctrl_chars[i] = L_ANY;
293 ctrl_chars[i] = L_ALT;
296 ctrl_chars[i] = L_QUEST;
299 ctrl_chars[i] = L_CLOS1;
302 ctrl_chars[i] = L_CLOS0;
305 ctrl_chars[i] = L_END;
308 ctrl_chars[i] = L_START;
311 ctrl_chars[i] = L_ANY;
314 ctrl_chars[i] = L_ANYZ;
317 ctrl_chars[i] = L_WILD;
324 expr_ptr = (unsigned char *) *s;
328 if (lookahead == L_START)
331 t2->pos = ++parse_info->position;
332 t2->u.ch[1] = t2->u.ch[0] = '\n';
345 if (lookahead == L_END && t1)
348 t2->pos = ++parse_info->position;
349 t2->u.ch[1] = t2->u.ch[0] = '\n';
360 if (lookahead == 0 && t1)
363 t2->pos = ++parse_info->position;
364 t2->u.ch[0] = -(++parse_info->rule);
365 t2->u.ch[1] = anchor_flag;
376 if (lookahead == L_RP)
377 err_code = DFA_ERR_RP;
378 else if (lookahead == L_LP)
379 err_code = DFA_ERR_LP;
381 err_code = DFA_ERR_SYNTAX;
384 *s = (char *) expr_ptr;
388 static int nextchar (int *esc)
391 if (*expr_ptr == '\0' || isspace(*expr_ptr))
393 else if (*expr_ptr != '\\')
422 static int nextchar_set (int *esc)
424 if (*expr_ptr == ' ')
429 return nextchar (esc);
432 static int read_charset (void)
434 int i, ch0, ch1, esc0, esc1, cc = 0;
435 look_chars = mk_BSet (&parse_info->charset);
436 res_BSet (parse_info->charset, look_chars);
438 ch0 = nextchar_set (&esc0);
439 if (!esc0 && ch0 == '^')
442 ch0 = nextchar_set (&esc0);
446 if (!esc0 && ch0 == ']')
448 add_BSet (parse_info->charset, look_chars, ch0);
449 ch1 = nextchar_set (&esc1);
450 if (!esc1 && ch1 == '-')
452 if ((ch1 = nextchar_set (&esc1)) == 0)
454 if (!esc1 && ch1 == ']')
456 add_BSet (parse_info->charset, look_chars, '-');
459 for (i=ch0; ++i<=ch1;)
460 add_BSet (parse_info->charset, look_chars, i);
461 ch0 = nextchar_set (&esc0);
470 com_BSet (parse_info->charset, look_chars);
474 unsigned short dfa_thompson_chars[] =
488 unsigned short dfa_ccl_chars[] =
496 static int lex_sub(void)
499 const unsigned short *cc;
500 while ((look_ch = nextchar (&esc)) != 0)
505 inside_string = !inside_string;
507 else if (esc || inside_string)
509 else if (look_ch == '[')
510 return read_charset();
511 else if (look_ch == ' ')
515 for (cc = ctrl_chars; *cc; cc += 2)
523 static void lex (void)
525 lookahead = lex_sub ();
528 static const char *str_char (unsigned c)
545 sprintf (s+1, "x%02x", c);
567 static void out_char (int c)
569 printf ("%s", str_char (c));
572 static void term_Tnode (void)
574 struct Tblock *t, *t1;
575 for (t = parse_info->start; (t1 = t);)
583 static struct Tnode *mk_Tnode (void)
587 if (parse_info->use_Tnode == parse_info->max_Tnode)
589 tnew = (struct Tblock *) imalloc (sizeof(struct Tblock));
590 tnew->tarray = (struct Tnode *) imalloc (TADD*sizeof(struct Tnode));
593 if (parse_info->use_Tnode == 0)
594 parse_info->start = tnew;
596 parse_info->end->next = tnew;
597 parse_info->end = tnew;
599 parse_info->max_Tnode += TADD;
601 return parse_info->end->tarray+(parse_info->use_Tnode++ % TADD);
604 struct Tnode *mk_Tnode_cset (BSet charset)
606 struct Tnode *tn1, *tn0 = mk_Tnode();
607 int ch1, ch0 = trav_BSet (parse_info->charset, charset, 0);
613 tn0->pos = ++parse_info->position;
614 ch1 = travi_BSet (parse_info->charset, charset, ch0+1) ;
616 tn0->u.ch[1] = parse_info->charset->size;
619 tn0->u.ch[1] = ch1-1;
620 while ((ch0=trav_BSet (parse_info->charset, charset, ch1)) != -1)
626 tn1 = tn0->u.p[1] = mk_Tnode();
628 tn1->pos = ++(parse_info->position);
629 if ((ch1 = travi_BSet (parse_info->charset, charset,
632 tn1->u.ch[1] = parse_info->charset->size;
635 tn1->u.ch[1] = ch1-1;
642 static void del_followpos (void)
647 static void init_pos (void)
649 posar = (struct Tnode **) imalloc (sizeof(struct Tnode*)
650 * (1+parse_info->position));
653 static void del_pos (void)
658 static void add_follow (Set lastpos, Set firstpos)
662 followpos[lastpos->value] =
663 union_Set (poset, followpos[lastpos->value], firstpos);
664 lastpos = lastpos->next;
668 static void dfa_trav (struct Tnode *n)
673 dfa_trav (n->u.p[0]);
674 dfa_trav (n->u.p[1]);
675 n->nullable = n->u.p[0]->nullable & n->u.p[1]->nullable;
676 n->firstpos = mk_Set (poset);
677 n->firstpos = union_Set (poset, n->firstpos, n->u.p[0]->firstpos);
678 if (n->u.p[0]->nullable)
679 n->firstpos = union_Set (poset, n->firstpos, n->u.p[1]->firstpos);
680 n->lastpos = mk_Set (poset);
681 n->lastpos = union_Set (poset, n->lastpos, n->u.p[1]->lastpos);
682 if (n->u.p[1]->nullable)
683 n->lastpos = union_Set (poset, n->lastpos, n->u.p[0]->lastpos);
685 add_follow (n->u.p[0]->lastpos, n->u.p[1]->firstpos);
687 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
688 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
689 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
690 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
695 dfa_trav (n->u.p[0]);
696 dfa_trav (n->u.p[1]);
697 n->nullable = n->u.p[0]->nullable | n->u.p[1]->nullable;
699 n->firstpos = merge_Set (poset, n->u.p[0]->firstpos,
700 n->u.p[1]->firstpos);
701 n->lastpos = merge_Set (poset, n->u.p[0]->lastpos,
703 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
704 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
705 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
706 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
711 dfa_trav (n->u.p[0]);
712 n->nullable = n->u.p[0]->nullable;
713 n->firstpos = n->u.p[0]->firstpos;
714 n->lastpos = n->u.p[0]->lastpos;
715 add_follow (n->lastpos, n->firstpos);
720 dfa_trav (n->u.p[0]);
722 n->firstpos = n->u.p[0]->firstpos;
723 n->lastpos = n->u.p[0]->lastpos;
724 add_follow (n->lastpos, n->firstpos);
730 n->lastpos = mk_Set (poset);
731 n->firstpos = mk_Set (poset);
738 n->firstpos = mk_Set (poset);
739 n->firstpos = add_Set (poset, n->firstpos, n->pos);
740 n->lastpos = mk_Set (poset);
741 n->lastpos = add_Set (poset, n->lastpos, n->pos);
744 printf ("#%d", -n->u.ch[0]);
745 else if (n->u.ch[1] > n->u.ch[0])
748 out_char (n->u.ch[0]);
749 if (n->u.ch[1] > n->u.ch[0]+1)
751 out_char (n->u.ch[1]);
755 out_char (n->u.ch[0]);
759 printf ("\n nullable : %c\n", n->nullable ? '1' : '0');
760 printf (" firstpos :");
761 pr_Set (poset, n->firstpos);
762 printf (" lastpos :");
763 pr_Set (poset, n->lastpos);
767 static void init_followpos (void)
771 followpos = fa = (Set *) imalloc (sizeof(Set) * (1+parse_info->position));
772 for (i = parse_info->position+1; --i >= 0; fa++)
773 *fa = mk_Set (poset);
776 static void mk_dfa_tran (struct DFA_states *dfas)
783 struct DFA_state *dfa_from, *dfa_to;
786 max_char = parse_info->charset->size;
787 pos = (short *) imalloc (sizeof(*pos) * (parse_info->position+1));
789 tran_set = cp_Set (poset, parse_info->root->firstpos);
790 i = add_DFA_state (dfas, &tran_set, &dfa_from);
792 dfa_from->rule_no = 0;
793 while ((dfa_from = get_DFA_state (dfas)))
797 for (tran_set = dfa_from->set; tran_set; tran_set = tran_set->next)
798 if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char)
799 *pos_i++ = tran_set->value;
800 else if (c < 0 && (i == 0 || c > i))
803 dfa_from->rule_no = -i;
805 for (char_1 = 0; char_1 <= max_char; char_1++)
808 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
809 if (posar[i]->u.ch[1] >= char_1
810 && (c=posar[i]->u.ch[0]) < char_0)
816 if (char_0 > max_char)
821 tran_set = mk_Set (poset);
822 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
824 if ((c=posar[i]->u.ch[0]) > char_0 && c <= char_1)
825 char_1 = c - 1; /* forward chunk */
826 else if ((c=posar[i]->u.ch[1]) >= char_0 && c < char_1)
827 char_1 = c; /* backward chunk */
828 if (posar[i]->u.ch[1] >= char_0 && posar[i]->u.ch[0] <= char_0)
829 tran_set = union_Set (poset, tran_set, followpos[i]);
833 add_DFA_state (dfas, &tran_set, &dfa_to);
834 add_DFA_tran (dfas, dfa_from, char_0, char_1, dfa_to->no);
839 sort_DFA_states (dfas);
842 static void pr_tran (struct DFA_states *dfas)
845 struct DFA_tran *tran;
846 int prev_no, i, c, no;
848 for (no=0; no < dfas->no; ++no)
850 s = dfas->sortarray[no];
851 assert (s->no == no);
852 printf ("DFA state %d", no);
854 printf ("#%d", s->rule_no);
856 pr_Set (poset, s->set);
858 for (i=s->tran_no, tran=s->trans; --i >= 0; tran++)
860 if (prev_no != tran->to)
864 printf (" goto %d on [", tran->to);
867 for (c = tran->ch[0]; c <= tran->ch[1]; c++)
868 printf ("%s", str_char(c));
876 static void pr_verbose (struct DFA_states *dfas)
880 printf ("%d/%d tree nodes used, %d bytes each\n",
881 parse_info->use_Tnode, parse_info->max_Tnode, sizeof (struct Tnode));
882 k = inf_BSetHandle (parse_info->charset, &i, &j);
883 printf ("%ld/%ld character sets, %d bytes each\n",
884 i/k, j/k, k*sizeof(BSetWord));
885 k = inf_SetType (poset, &i, &j);
886 printf ("%ld/%ld poset items, %d bytes each\n", i, j, k);
887 printf ("%d DFA states\n", dfas->no);
890 static void pr_followpos (void)
893 printf ("\nfollowsets:\n");
894 for (i=1; i <= parse_info->position; i++)
897 pr_Set (poset, followpos[i]);
900 if (posar[i]->u.ch[0] < 0)
901 printf ("#%d", -posar[i]->u.ch[0]);
902 else if (posar[i]->u.ch[1] > posar[i]->u.ch[0])
905 out_char (posar[i]->u.ch[0]);
906 if (posar[i]->u.ch[1] > posar[i]->u.ch[0]+1)
908 out_char (posar[i]->u.ch[1]);
912 out_char (posar[i]->u.ch[0]);
918 static struct DFA_parse *dfa_parse_init (void)
920 parse_info = (struct DFA_parse *) imalloc (sizeof (struct DFA_parse));
922 parse_info->charset = mk_BSetHandle (255, 20);
923 parse_info->position = 0;
924 parse_info->rule = 0;
925 parse_info->root = NULL;
927 parse_info->anyset = mk_BSet (&parse_info->charset);
928 res_BSet (parse_info->charset, parse_info->anyset);
929 add_BSet (parse_info->charset, parse_info->anyset, '\n');
930 com_BSet (parse_info->charset, parse_info->anyset);
931 parse_info->use_Tnode = parse_info->max_Tnode = 0;
935 static void rm_dfa_parse (struct DFA_parse **dfap)
940 rm_BSetHandle (&parse_info->charset);
945 static struct DFA_states *mk_dfas (struct DFA_parse *dfap, int poset_chunk)
947 struct DFA_states *dfas;
949 assert (poset_chunk > 10);
953 poset = mk_SetType (poset_chunk);
956 dfa_trav (parse_info->root);
958 if (debug_dfa_followpos)
960 init_DFA_states (&dfas, poset, STATE_HASH);
972 struct DFA *dfa_init (void)
976 dfa = imalloc (sizeof(*dfa));
977 dfa->parse_info = dfa_parse_init ();
978 dfa->state_info = NULL;
983 int dfa_parse (struct DFA *dfa, char **pattern)
988 do_parse (dfa->parse_info, pattern, dfa_thompson_chars, &top);
991 if ( !dfa->parse_info->root )
992 dfa->parse_info->root = top;
999 n->u.p[0] = dfa->parse_info->root;
1001 dfa->parse_info->root = n;
1006 void dfa_mkstate (struct DFA *dfa)
1010 dfa->state_info = mk_dfas (dfa->parse_info, POSET_CHUNK);
1011 dfa->no_states = dfa->state_info->no;
1012 dfa->states = dfa->state_info->sortarray;
1013 rm_dfa_parse (&dfa->parse_info);
1016 void dfa_delete (struct DFA **dfap)
1020 if ((*dfap)->parse_info)
1021 rm_dfa_parse (&(*dfap)->parse_info);
1022 if ((*dfap)->state_info)
1023 rm_DFA_states (&(*dfap)->state_info);