1 /* Copyright (C) 2006, Index Data ApS
2 * See the file LICENSE for details.
4 * $Id: nfa.c,v 1.4 2006-05-03 13:47:35 heikki Exp $
6 * This is a simple NFA-based system for character set normalizing
7 * in yaz and zebra. Unlike standard NFA's, this operates on ranges of
8 * characters at a time, so as to keep the size small.
10 * All characters are internally handled as unsigned 32-bit ints, so
11 * this works well with unicode. Translating to and from utf-8 is trivial, if
14 * The NFA stores a translation-thing in the terminal nodes. It does not
15 * concern itself with what that thing is, for us it is juts a void*
17 * The module consists of two parts: Building the NFA, and translating
29 typedef struct yaz_nfa_backref_info yaz_backref_info;
31 struct yaz_nfa_backref_info {
38 int nstates; /* how many states do we have */
39 int nbackrefs; /* how many backrefs do we have */
40 yaz_nfa_state *laststate; /* points to the last in the circular list */
41 yaz_nfa_state *firststate; /* the initial state */
42 yaz_backref_info *curr_backrefs; /* the backrefs we are matching */
43 yaz_backref_info *best_backrefs; /* backrefs of the best match so far*/
44 int lastmatch; /* the result of last match */
47 struct yaz_nfa_state {
48 int num; /* state number. used for resoving ambiguities, and for dumping */
49 void *result; /* signals a terminal state, and gives the result */
50 yaz_nfa_transition *lasttrans; /* circular list of transitions */
51 yaz_nfa_state *next; /* Circular linked list */
52 int backref_start; /* which backreference starts here. 0 if none */
53 int backref_end; /* which backreference ends here. 0 if none */
56 struct yaz_nfa_transition {
57 yaz_nfa_state *to_state; /* where to */
58 yaz_nfa_char range_start;
59 yaz_nfa_char range_end;
60 yaz_nfa_transition *next; /* a linked list of them */
63 /* a range from 1 down to 0 is impossible, and is used to denote an */
64 /* empty (epsilon) transition */
68 /* a limit for the number of empty transitions we can have in a row before */
69 /* we declare this a loop */
70 #define LOOP_LIMIT 100
76 * Initializing and destroying whole nfa's
79 yaz_nfa *yaz_nfa_init() {
80 NMEM my_nmem = nmem_create();
81 yaz_nfa *n=nmem_malloc(my_nmem, sizeof(yaz_nfa));
89 n->lastmatch= YAZ_NFA_NOMATCH;
93 void yaz_nfa_destroy(yaz_nfa *n) {
94 nmem_destroy(n->nmem);
99 * Low-level interface to building the NFA
102 yaz_nfa_state *yaz_nfa_add_state(yaz_nfa *n) {
103 yaz_nfa_state *s = nmem_malloc(n->nmem,sizeof(yaz_nfa_state));
104 s->num = (n->nstates)++;
110 s->next=n->laststate->next;
111 n->laststate->next=s;
113 } else { /* first state */
121 int yaz_nfa_set_result(yaz_nfa *n, yaz_nfa_state *s,void *result) {
122 if ((s->result)&&result)
128 void *yaz_nfa_get_result(yaz_nfa *n, yaz_nfa_state *s) {
134 int yaz_nfa_set_backref_point(yaz_nfa *n, yaz_nfa_state *s,
138 if (s->backref_start)
140 s->backref_start=backref_number;
141 if (n->nbackrefs<=backref_number) {
142 n->nbackrefs=backref_number+1;
145 /* clear them just in case we have already matched on */
146 /* with this nfa, and created a too small backref table */
147 /* we will reallocate when matching. */
152 if (n->nbackrefs<backref_number)
153 return 2; /* can't end a backref that has not been started */
154 s->backref_end=backref_number;
159 int yaz_nfa_get_backref_point(yaz_nfa *n, yaz_nfa_state *s,
164 return s->backref_start;
166 return s->backref_end;
169 void yaz_nfa_add_transition(yaz_nfa *n,
170 yaz_nfa_state *from_state,
171 yaz_nfa_state *to_state,
172 yaz_nfa_char range_start,
173 yaz_nfa_char range_end) {
174 yaz_nfa_transition *t=nmem_malloc(n->nmem,sizeof(yaz_nfa_transition));
175 t->range_start=range_start;
176 t->range_end=range_end;
177 t->to_state=to_state;
178 if (from_state->lasttrans) {
179 t->next= from_state->lasttrans->next;
180 from_state->lasttrans->next=t;
181 from_state->lasttrans=t;
182 } else { /* first trans */
183 from_state->lasttrans=t;
188 void yaz_nfa_add_empty_transition( yaz_nfa *n,
189 yaz_nfa_state *from_state,
190 yaz_nfa_state *to_state) {
191 yaz_nfa_add_transition(n,from_state,to_state,
192 EMPTY_START,EMPTY_END);
196 * Medium-level interface
199 /* Finds a transition from s where the range is exactly c..c */
200 /* There should only be one such transition */
201 static yaz_nfa_state *find_single_trans(
203 yaz_nfa_char range_start,
204 yaz_nfa_char range_end) {
205 yaz_nfa_transition *t=s->lasttrans;
210 if ( ( t->range_start == range_start ) && ( t->range_end == range_end) )
212 } while (t != s->lasttrans );
217 yaz_nfa_state *yaz_nfa_add_range(yaz_nfa *n,
219 yaz_nfa_char range_start,
220 yaz_nfa_char range_end) {
221 yaz_nfa_state *nextstate;
222 if (!s) /* default to top-level of the nfa */
224 nextstate=find_single_trans(s,range_start,range_end);
226 nextstate = yaz_nfa_add_state(n);
227 yaz_nfa_add_transition(n, s, nextstate, range_start, range_end);
232 yaz_nfa_state *yaz_nfa_add_sequence(yaz_nfa *n,
235 yaz_nfa_state *nextstate;
236 if (!s) /* default to top-level of the nfa */
238 nextstate=find_single_trans(s,*seq,*seq);
241 if (!*seq) /* whole sequence matched */
244 return yaz_nfa_add_sequence(n, nextstate, seq);
245 } else { /* no next state, build the rest */
247 s=yaz_nfa_add_range(n,s, *seq, *seq);
252 return 0; /* compiler shut up, we return somewhere above */
263 yaz_nfa_char *longest;
267 int empties; /* count how many consecutive empty transitions */
270 /* Finds the longest match. In case of ties, chooses the one with the
271 * lowest numbered state. Keep track of the back references. Recursively
272 * traverses the NFA. Keeps track of the best hit it has found. */
274 static void match_state(
276 yaz_nfa_char *inchar,
278 struct matcher *m ) {
279 yaz_nfa_transition *t=s->lasttrans;
280 if (s->backref_start)
281 m->n->curr_backrefs[s->backref_start].start=inchar;
283 m->n->curr_backrefs[s->backref_end].end=inchar;
288 if ( (( t->range_start <= *inchar ) && ( t->range_end >= *inchar )) ){
290 if (t->range_start!=t->range_end){
291 /* backref 0 is special: the last range operation */
292 m->n->curr_backrefs[0].start=inchar;
293 m->n->curr_backrefs[0].end=inchar;
295 match_state(t->to_state, inchar+1,incharsleft-1,m);
296 /* yes, descent to all matching nodes, even if overrun, */
297 /* since we can find a better one later */
298 } else if (( t->range_start==EMPTY_START) && (t->range_end==EMPTY_END)) {
299 if ( m->empties++ > LOOP_LIMIT )
300 m->errorcode= YAZ_NFA_LOOP;
302 match_state(t->to_state, inchar,incharsleft,m);
304 } while (t != s->lasttrans );
306 m->errorcode=YAZ_NFA_OVERRUN;
309 if (s->result) { /* terminal node */
310 if ( (m->longest < inchar) || /* longer result */
311 ((m->longest == inchar)&&(m->bestnode<s->num)) ){
312 /* or as long, but with lower node number. Still better */
317 if (m->n->curr_backrefs)
318 for (i=0; i<m->n->nbackrefs;i++) {
319 m->n->best_backrefs[i]=m->n->curr_backrefs[i];
323 if (s->backref_start)
324 m->n->curr_backrefs[s->backref_start].start=0;
326 m->n->curr_backrefs[s->backref_end].end=0;
327 m->n->curr_backrefs[0].start=0;
328 m->n->curr_backrefs[0].end=0;
331 int yaz_nfa_match(yaz_nfa *n,
332 yaz_nfa_char **inbuff,
338 if (!n->firststate) {
339 n->lastmatch=YAZ_NFA_NOMATCH;
344 m.bestnode=n->nstates;
347 sz=sizeof( struct yaz_nfa_backref_info) * n->nbackrefs;
348 if (!n->curr_backrefs) {
349 n->curr_backrefs=nmem_malloc( n->nmem, sz);
350 n->best_backrefs=nmem_malloc( n->nmem, sz);
352 for (i=0;i<n->nbackrefs;i++) {
353 n->curr_backrefs[i].start=0;
354 n->curr_backrefs[i].end=0;
355 n->best_backrefs[i].start=0;
356 n->best_backrefs[i].end=0;
359 match_state(n->firststate, *inbuff, incharsleft, &m);
364 n->lastmatch=m.errorcode;
366 n->lastmatch= YAZ_NFA_SUCCESS;
369 n->lastmatch=YAZ_NFA_NOMATCH;
374 int yaz_nfa_get_backref( yaz_nfa *n,
376 yaz_nfa_char **start,
377 yaz_nfa_char **end) {
378 if (backref_no>=n->nbackrefs)
382 if (n->lastmatch== YAZ_NFA_NOMATCH)
383 return 1; /* accept other errors, they return partial matches*/
385 *start=n->best_backrefs[backref_no].start;
386 *end=n->best_backrefs[backref_no].end;
395 yaz_nfa_state *yaz_nfa_get_first(yaz_nfa *n){
398 return n->firststate;
401 yaz_nfa_state *yaz_nfa_get_next(yaz_nfa *n, yaz_nfa_state *s){
411 static void dump_trans(FILE *F, yaz_nfa_transition *t ) {
418 if ( (t->range_start <= ' ') || (t->range_start>'z'))
420 if ( (t->range_end <= ' ') || (t->range_end>'z'))
422 if ((t->range_start==EMPTY_START) && (t->range_end==EMPTY_END)) {
425 fprintf(F," -> [%d] %s '%c' %x - '%c' %x \n",
431 static void dump_state(FILE *F, yaz_nfa_state *s,
432 char *(*strfunc)(void *) ) {
433 yaz_nfa_transition *t;
434 char *resultstring="";
437 resultstring=(*strfunc)(s->result);
440 resultstring=s->result;
442 fprintf(F," state [%d] %s %s",
443 s->num, s->result?"(FINAL)":"",resultstring );
444 if (s->backref_start) {
445 fprintf(F," start-%d",s->backref_start);
447 if (s->backref_end) {
448 fprintf(F," end-%d",s->backref_end);
453 fprintf(F," (no transitions)\n");
458 } while (t != s->lasttrans);
463 void yaz_nfa_dump(FILE *F, yaz_nfa *n,
464 char *(*strfunc)(void *) ) {
466 if (!F) /* lazy programmers can just pass 0 for F */
468 fprintf(F,"The NFA has %d states and %d backrefs\n",
469 n->nstates, n->nbackrefs);
474 dump_state(F,s, strfunc);
475 } while (s != n->laststate);
484 * indent-tabs-mode: nil
486 * vim: shiftwidth=4 tabstop=8 expandtab