2 * Copyright (C) 1994, Index Data I/S
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.10 1996-01-08 09:09:17 adam
8 * Function dfa_parse got 'const' string argument.
9 * New functions to define char mappings made public.
11 * Revision 1.9 1995/12/06 12:24:58 adam
12 * Removed verbatim mode code.
14 * Revision 1.8 1995/12/06 09:09:58 adam
15 * Work on left and right anchors.
17 * Revision 1.7 1995/11/27 09:23:02 adam
18 * New berbatim hook in regular expressions. "[]n ..".
20 * Revision 1.6 1995/10/16 09:31:25 adam
23 * Revision 1.5 1995/10/02 15:17:58 adam
24 * Bug fix in dfa_delete.
26 * Revision 1.4 1995/09/28 09:18:52 adam
27 * Removed various preprocessor defines.
29 * Revision 1.3 1995/09/04 12:33:26 adam
30 * Various cleanup. YAZ util used instead.
32 * Revision 1.2 1995/01/25 11:30:50 adam
33 * Simple error reporting when parsing regular expressions.
34 * Memory usage reduced.
36 * Revision 1.1 1995/01/24 16:02:52 adam
37 * New private header file in dfa module (dfap.h).
38 * Module no longer uses yacc to parse regular expressions.
60 struct Tnode *p[2]; /* CAT,OR,STAR,PLUS (left,right) */
61 short ch[2]; /* if ch[0] >= 0 then this Tnode represents */
62 /* a character in range [ch[0]..ch[1]] */
63 /* otherwise ch[0] represents */
64 /* accepting no (-acceptno) */
66 unsigned pos : 15; /* CAT/OR/STAR/EPSILON or non-neg. position */
67 unsigned nullable : 1;
68 Set firstpos; /* first positions */
69 Set lastpos; /* last positions */
72 struct Tblock { /* Tnode bucket (block) */
73 struct Tblock *next; /* pointer to next bucket */
74 struct Tnode *tarray; /* array of nodes in bucket */
78 #define STATE_HASH 199
79 #define POSET_CHUNK 100
81 int debug_dfa_trav = 0;
82 int debug_dfa_tran = 0;
83 int debug_dfa_followpos = 0;
86 static struct DFA_parse *parse_info = NULL;
89 static int inside_string;
90 static const unsigned char *expr_ptr;
91 static struct Tnode **posar;
94 static Set *followpos;
96 static struct Tnode *mk_Tnode (void);
97 static struct Tnode *mk_Tnode_cset (BSet charset);
98 static void term_Tnode (void);
101 del_followpos (void),
104 mk_dfa_tran (struct DFA_states *dfas),
105 add_follow (Set lastpos, Set firstpos),
106 dfa_trav (struct Tnode *n),
107 init_followpos (void),
108 pr_tran (struct DFA_states *dfas),
109 pr_verbose (struct DFA_states *dfas),
119 *str_char (unsigned c);
135 static int lookahead;
136 static unsigned look_ch;
137 static BSet look_chars;
139 static struct Tnode *expr_1 (void),
144 static struct Tnode *expr_1 (void)
146 struct Tnode *t1, *t2, *tn;
148 if (!(t1 = expr_2 ()))
150 while (lookahead == L_ALT)
153 if (!(t2 = expr_2 ()))
165 static struct Tnode *expr_2 (void)
167 struct Tnode *t1, *t2, *tn;
169 if (!(t1 = expr_3 ()))
171 while (lookahead == L_WILD ||
172 lookahead == L_ANYZ ||
173 lookahead == L_ANY ||
175 lookahead == L_CHAR ||
176 lookahead == L_CHARS)
178 if (!(t2 = expr_3 ()))
190 static struct Tnode *expr_3 (void)
192 struct Tnode *t1, *tn;
194 if (!(t1 = expr_4 ()))
196 if (lookahead == L_CLOS0)
204 else if (lookahead == L_CLOS1)
212 else if (lookahead == L_QUEST)
218 tn->u.p[1] = mk_Tnode();
219 tn->u.p[1]->pos = EPSILON;
225 static struct Tnode *expr_4 (void)
233 if (!(t1 = expr_1 ()))
235 if (lookahead == L_RP)
242 t1->pos = ++parse_info->position;
243 t1->u.ch[1] = t1->u.ch[0] = look_ch;
247 t1 = mk_Tnode_cset (look_chars);
251 t1 = mk_Tnode_cset (parse_info->anyset);
257 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
258 t1->u.p[1] = mk_Tnode();
259 t1->u.p[1]->pos = EPSILON;
265 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
273 static void do_parse (struct DFA_parse *dfap, const char **s, struct Tnode **tnp)
276 int start_anchor_flag = 0;
277 struct Tnode *t1, *t2, *tn;
281 expr_ptr = (unsigned char *) *s;
285 if (lookahead == L_START)
287 start_anchor_flag = 1;
299 if (lookahead == L_END && t1)
302 t2->pos = ++parse_info->position;
303 t2->u.ch[1] = t2->u.ch[0] = '\n';
314 if (lookahead == 0 && t1)
317 t2->pos = ++parse_info->position;
318 t2->u.ch[0] = -(++parse_info->rule);
319 t2->u.ch[1] = start_anchor_flag ? 0 : -(parse_info->rule);
330 if (lookahead == L_RP)
331 err_code = DFA_ERR_RP;
332 else if (lookahead == L_LP)
333 err_code = DFA_ERR_LP;
335 err_code = DFA_ERR_SYNTAX;
338 *s = (char *) expr_ptr;
341 static int nextchar (int *esc)
344 if (*expr_ptr == '\0')
346 else if (*expr_ptr != '\\')
375 static int nextchar_set (int *esc)
377 if (*expr_ptr == ' ')
382 return nextchar (esc);
385 static int read_charset (void)
387 int i, ch0, ch1, esc0, esc1, cc = 0;
388 look_chars = mk_BSet (&parse_info->charset);
389 res_BSet (parse_info->charset, look_chars);
391 ch0 = nextchar_set (&esc0);
392 if (!esc0 && ch0 == '^')
395 ch0 = nextchar_set (&esc0);
399 if (!esc0 && ch0 == ']')
401 add_BSet (parse_info->charset, look_chars, ch0);
402 ch1 = nextchar_set (&esc1);
403 if (!esc1 && ch1 == '-')
405 if ((ch1 = nextchar_set (&esc1)) == 0)
407 if (!esc1 && ch1 == ']')
409 add_BSet (parse_info->charset, look_chars, '-');
412 for (i=ch0; ++i<=ch1;)
413 add_BSet (parse_info->charset, look_chars, i);
414 ch0 = nextchar_set (&esc0);
423 com_BSet (parse_info->charset, look_chars);
427 static int lex_sub(void)
430 while ((look_ch = nextchar (&esc)) != 0)
435 inside_string = !inside_string;
437 else if (esc || inside_string)
439 else if (look_ch == '[')
440 return read_charset();
445 logf (LOG_DEBUG, "xxxx / xxx");
446 for (cc = parse_info->charMap; *cc; cc += 2)
458 static void lex (void)
460 lookahead = lex_sub ();
463 static const char *str_char (unsigned c)
480 sprintf (s+1, "x%02x", c);
502 static void out_char (int c)
504 printf ("%s", str_char (c));
507 static void term_Tnode (void)
509 struct Tblock *t, *t1;
510 for (t = parse_info->start; (t1 = t);)
518 static struct Tnode *mk_Tnode (void)
522 if (parse_info->use_Tnode == parse_info->max_Tnode)
524 tnew = (struct Tblock *) imalloc (sizeof(struct Tblock));
525 tnew->tarray = (struct Tnode *) imalloc (TADD*sizeof(struct Tnode));
528 if (parse_info->use_Tnode == 0)
529 parse_info->start = tnew;
531 parse_info->end->next = tnew;
532 parse_info->end = tnew;
534 parse_info->max_Tnode += TADD;
536 return parse_info->end->tarray+(parse_info->use_Tnode++ % TADD);
539 struct Tnode *mk_Tnode_cset (BSet charset)
541 struct Tnode *tn1, *tn0 = mk_Tnode();
542 int ch1, ch0 = trav_BSet (parse_info->charset, charset, 0);
548 tn0->pos = ++parse_info->position;
549 ch1 = travi_BSet (parse_info->charset, charset, ch0+1) ;
551 tn0->u.ch[1] = parse_info->charset->size;
554 tn0->u.ch[1] = ch1-1;
555 while ((ch0=trav_BSet (parse_info->charset, charset, ch1)) != -1)
561 tn1 = tn0->u.p[1] = mk_Tnode();
563 tn1->pos = ++(parse_info->position);
564 if ((ch1 = travi_BSet (parse_info->charset, charset,
567 tn1->u.ch[1] = parse_info->charset->size;
570 tn1->u.ch[1] = ch1-1;
577 static void del_followpos (void)
582 static void init_pos (void)
584 posar = (struct Tnode **) imalloc (sizeof(struct Tnode*)
585 * (1+parse_info->position));
588 static void del_pos (void)
593 static void add_follow (Set lastpos, Set firstpos)
597 followpos[lastpos->value] =
598 union_Set (poset, followpos[lastpos->value], firstpos);
599 lastpos = lastpos->next;
603 static void dfa_trav (struct Tnode *n)
608 dfa_trav (n->u.p[0]);
609 dfa_trav (n->u.p[1]);
610 n->nullable = n->u.p[0]->nullable & n->u.p[1]->nullable;
611 n->firstpos = mk_Set (poset);
612 n->firstpos = union_Set (poset, n->firstpos, n->u.p[0]->firstpos);
613 if (n->u.p[0]->nullable)
614 n->firstpos = union_Set (poset, n->firstpos, n->u.p[1]->firstpos);
615 n->lastpos = mk_Set (poset);
616 n->lastpos = union_Set (poset, n->lastpos, n->u.p[1]->lastpos);
617 if (n->u.p[1]->nullable)
618 n->lastpos = union_Set (poset, n->lastpos, n->u.p[0]->lastpos);
620 add_follow (n->u.p[0]->lastpos, n->u.p[1]->firstpos);
622 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
623 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
624 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
625 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
630 dfa_trav (n->u.p[0]);
631 dfa_trav (n->u.p[1]);
632 n->nullable = n->u.p[0]->nullable | n->u.p[1]->nullable;
634 n->firstpos = merge_Set (poset, n->u.p[0]->firstpos,
635 n->u.p[1]->firstpos);
636 n->lastpos = merge_Set (poset, n->u.p[0]->lastpos,
638 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
639 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
640 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
641 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
646 dfa_trav (n->u.p[0]);
647 n->nullable = n->u.p[0]->nullable;
648 n->firstpos = n->u.p[0]->firstpos;
649 n->lastpos = n->u.p[0]->lastpos;
650 add_follow (n->lastpos, n->firstpos);
655 dfa_trav (n->u.p[0]);
657 n->firstpos = n->u.p[0]->firstpos;
658 n->lastpos = n->u.p[0]->lastpos;
659 add_follow (n->lastpos, n->firstpos);
665 n->lastpos = mk_Set (poset);
666 n->firstpos = mk_Set (poset);
673 n->firstpos = mk_Set (poset);
674 n->firstpos = add_Set (poset, n->firstpos, n->pos);
675 n->lastpos = mk_Set (poset);
676 n->lastpos = add_Set (poset, n->lastpos, n->pos);
679 printf ("#%d (n#%d)", -n->u.ch[0], -n->u.ch[1]);
680 else if (n->u.ch[1] > n->u.ch[0])
683 out_char (n->u.ch[0]);
684 if (n->u.ch[1] > n->u.ch[0]+1)
686 out_char (n->u.ch[1]);
690 out_char (n->u.ch[0]);
694 printf ("\n nullable : %c\n", n->nullable ? '1' : '0');
695 printf (" firstpos :");
696 pr_Set (poset, n->firstpos);
697 printf (" lastpos :");
698 pr_Set (poset, n->lastpos);
702 static void init_followpos (void)
706 followpos = fa = (Set *) imalloc (sizeof(Set) * (1+parse_info->position));
707 for (i = parse_info->position+1; --i >= 0; fa++)
708 *fa = mk_Set (poset);
711 static void mk_dfa_tran (struct DFA_states *dfas)
718 struct DFA_state *dfa_from, *dfa_to;
721 max_char = parse_info->charset->size;
722 pos = (short *) imalloc (sizeof(*pos) * (parse_info->position+1));
724 tran_set = cp_Set (poset, parse_info->root->firstpos);
725 i = add_DFA_state (dfas, &tran_set, &dfa_from);
727 dfa_from->rule_no = 0;
728 while ((dfa_from = get_DFA_state (dfas)))
732 for (tran_set = dfa_from->set; tran_set; tran_set = tran_set->next)
733 if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char)
734 *pos_i++ = tran_set->value;
739 c = posar[tran_set->value]->u.ch[1];
744 dfa_from->rule_no = -i;
745 dfa_from->rule_nno = -j;
747 for (char_1 = 0; char_1 <= max_char; char_1++)
750 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
751 if (posar[i]->u.ch[1] >= char_1
752 && (c=posar[i]->u.ch[0]) < char_0)
758 if (char_0 > max_char)
763 tran_set = mk_Set (poset);
764 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
766 if ((c=posar[i]->u.ch[0]) > char_0 && c <= char_1)
767 char_1 = c - 1; /* forward chunk */
768 else if ((c=posar[i]->u.ch[1]) >= char_0 && c < char_1)
769 char_1 = c; /* backward chunk */
770 if (posar[i]->u.ch[1] >= char_0 && posar[i]->u.ch[0] <= char_0)
771 tran_set = union_Set (poset, tran_set, followpos[i]);
775 add_DFA_state (dfas, &tran_set, &dfa_to);
776 add_DFA_tran (dfas, dfa_from, char_0, char_1, dfa_to->no);
781 sort_DFA_states (dfas);
784 static void pr_tran (struct DFA_states *dfas)
787 struct DFA_tran *tran;
788 int prev_no, i, c, no;
790 for (no=0; no < dfas->no; ++no)
792 s = dfas->sortarray[no];
793 assert (s->no == no);
794 printf ("DFA state %d", no);
796 printf ("#%d", s->rule_no);
798 printf (" #%d", s->rule_nno);
801 pr_Set (poset, s->set);
803 for (i=s->tran_no, tran=s->trans; --i >= 0; tran++)
805 if (prev_no != tran->to)
809 printf (" goto %d on [", tran->to);
812 for (c = tran->ch[0]; c <= tran->ch[1]; c++)
813 printf ("%s", str_char(c));
821 static void pr_verbose (struct DFA_states *dfas)
825 printf ("%d/%d tree nodes used, %d bytes each\n",
826 parse_info->use_Tnode, parse_info->max_Tnode, sizeof (struct Tnode));
827 k = inf_BSetHandle (parse_info->charset, &i, &j);
828 printf ("%ld/%ld character sets, %d bytes each\n",
829 i/k, j/k, k*sizeof(BSetWord));
830 k = inf_SetType (poset, &i, &j);
831 printf ("%ld/%ld poset items, %d bytes each\n", i, j, k);
832 printf ("%d DFA states\n", dfas->no);
835 static void pr_followpos (void)
838 printf ("\nfollowsets:\n");
839 for (i=1; i <= parse_info->position; i++)
842 pr_Set (poset, followpos[i]);
845 if (posar[i]->u.ch[0] < 0)
846 printf ("#%d", -posar[i]->u.ch[0]);
847 else if (posar[i]->u.ch[1] > posar[i]->u.ch[0])
850 out_char (posar[i]->u.ch[0]);
851 if (posar[i]->u.ch[1] > posar[i]->u.ch[0]+1)
853 out_char (posar[i]->u.ch[1]);
857 out_char (posar[i]->u.ch[0]);
863 void dfa_parse_cmap_clean (struct DFA *d)
865 struct DFA_parse *dfa = d->parse_info;
870 dfa->charMapSize = 7;
871 dfa->charMap = imalloc (dfa->charMapSize * sizeof(*dfa->charMap));
876 void dfa_parse_cmap_new (struct DFA *d, const int *cmap)
878 struct DFA_parse *dfa = d->parse_info;
883 for (cp = cmap; *cp; cp += 2)
885 size = cp - cmap + 1;
886 if (size > dfa->charMapSize)
889 ifree (dfa->charMap);
890 dfa->charMapSize = size;
891 dfa->charMap = imalloc (size * sizeof(*dfa->charMap));
893 memcpy (dfa->charMap, cmap, size * sizeof(*dfa->charMap));
896 void dfa_parse_cmap_del (struct DFA *d, int from)
898 struct DFA_parse *dfa = d->parse_info;
902 for (cc = dfa->charMap; *cc; cc += 2)
905 while ((cc[0] = cc[2]))
914 void dfa_parse_cmap_add (struct DFA *d, int from, int to)
916 struct DFA_parse *dfa = d->parse_info;
921 for (cc = dfa->charMap; *cc; cc += 2)
927 indx = cc - dfa->charMap;
928 size = dfa->charMapSize;
931 int *cn = imalloc ((size+16) * sizeof(*dfa->charMap));
932 memcpy (cn, dfa->charMap, indx*sizeof(*dfa->charMap));
933 ifree (dfa->charMap);
935 dfa->charMapSize = size+16;
937 dfa->charMap[indx] = from;
938 dfa->charMap[indx+1] = to;
939 dfa->charMap[indx+2] = 0;
942 void dfa_parse_cmap_thompson (struct DFA *d)
944 static int thompson_chars[] =
960 dfa_parse_cmap_new (d, thompson_chars);
963 static struct DFA_parse *dfa_parse_init (void)
965 parse_info = (struct DFA_parse *) imalloc (sizeof (struct DFA_parse));
967 parse_info->charset = mk_BSetHandle (255, 20);
968 parse_info->position = 0;
969 parse_info->rule = 0;
970 parse_info->root = NULL;
972 parse_info->anyset = mk_BSet (&parse_info->charset);
973 res_BSet (parse_info->charset, parse_info->anyset);
974 add_BSet (parse_info->charset, parse_info->anyset, '\n');
975 com_BSet (parse_info->charset, parse_info->anyset);
976 parse_info->use_Tnode = parse_info->max_Tnode = 0;
977 parse_info->charMap = NULL;
978 parse_info->charMapSize = 0;
982 static void rm_dfa_parse (struct DFA_parse **dfap)
987 rm_BSetHandle (&parse_info->charset);
988 ifree (parse_info->charMap);
993 static struct DFA_states *mk_dfas (struct DFA_parse *dfap, int poset_chunk)
995 struct DFA_states *dfas;
997 assert (poset_chunk > 10);
1001 poset = mk_SetType (poset_chunk);
1004 assert (parse_info->root);
1005 dfa_trav (parse_info->root);
1007 if (debug_dfa_followpos)
1009 init_DFA_states (&dfas, poset, STATE_HASH);
1021 struct DFA *dfa_init (void)
1025 dfa = imalloc (sizeof(*dfa));
1026 dfa->parse_info = dfa_parse_init ();
1027 dfa->state_info = NULL;
1029 dfa_parse_cmap_thompson (dfa);
1033 int dfa_parse (struct DFA *dfa, const char **pattern)
1038 assert (dfa->parse_info);
1039 do_parse (dfa->parse_info, pattern, &top);
1042 if ( !dfa->parse_info->root )
1043 dfa->parse_info->root = top;
1050 n->u.p[0] = dfa->parse_info->root;
1052 dfa->parse_info->root = n;
1057 void dfa_mkstate (struct DFA *dfa)
1061 dfa->state_info = mk_dfas (dfa->parse_info, POSET_CHUNK);
1062 dfa->no_states = dfa->state_info->no;
1063 dfa->states = dfa->state_info->sortarray;
1064 rm_dfa_parse (&dfa->parse_info);
1067 void dfa_delete (struct DFA **dfap)
1071 if ((*dfap)->parse_info)
1072 rm_dfa_parse (&(*dfap)->parse_info);
1073 if ((*dfap)->state_info)
1074 rm_DFA_states (&(*dfap)->state_info);