1 /* Copyright (C) 2006, Index Data ApS
2 * See the file LICENSE for details.
4 * $Id: nfa.c,v 1.10 2006-07-04 12:59:56 heikki Exp $
9 * \brief NFA for character set normalizing
11 * This is a simple NFA-based system for character set normalizing
12 * in yaz and zebra. Unlike standard NFA's, this operates on ranges of
13 * characters at a time, so as to keep the size small.
15 * All characters are internally handled as unsigned 32-bit ints, so
16 * this works well with unicode. Translating to and from utf-8 is trivial, if
19 * The NFA stores a translation-thing in the terminal nodes. It does not
20 * concern itself with what that thing is, for us it is juts a void*
22 * The module consists of two parts: Building the NFA, and translating
36 typedef struct yaz_nfa_backref_info yaz_backref_info;
38 struct yaz_nfa_backref_info {
45 int nstates; /* how many states do we have */
46 int nbackrefs; /* how many backrefs do we have */
47 yaz_nfa_state *laststate; /* points to the last in the circular list */
48 yaz_nfa_state *firststate; /* the initial state */
49 yaz_backref_info *curr_backrefs; /* the backrefs we are matching */
50 yaz_backref_info *best_backrefs; /* backrefs of the best match so far*/
51 int lastmatch; /* the result of last match */
54 struct yaz_nfa_state {
55 int num; /* state number. used for resoving ambiguities, and for dumping */
56 void *result; /* signals a terminal state, and gives the result */
57 yaz_nfa_transition *lasttrans; /* circular list of transitions */
58 yaz_nfa_state *next; /* Circular linked list */
59 int backref_start; /* which backreference starts here. 0 if none */
60 int backref_end; /* which backreference ends here. 0 if none */
63 struct yaz_nfa_transition {
64 yaz_nfa_state *to_state; /* where to */
65 yaz_nfa_char range_start;
66 yaz_nfa_char range_end;
67 yaz_nfa_transition *next; /* a linked list of them */
70 /* a range from 1 down to 0 is impossible, and is used to denote an */
71 /* empty (epsilon) transition */
75 /* a limit for the number of empty transitions we can have in a row before */
76 /* we declare this a loop */
77 #define LOOP_LIMIT 100
85 } yaz_nfa_converter_type;
87 struct yaz_nfa_converter {
88 yaz_nfa_converter *next;
89 yaz_nfa_converter_type type;
98 * Initializing and destroying whole nfa's
101 yaz_nfa *yaz_nfa_init() {
102 NMEM my_nmem = nmem_create();
103 yaz_nfa *n = nmem_malloc(my_nmem, sizeof(yaz_nfa));
105 n->nbackrefs = 1; /* we always have #0, last range match */
106 n->curr_backrefs = 0;
107 n->best_backrefs = 0;
108 n->lastmatch = YAZ_NFA_NOMATCH;
111 n->firststate = n->laststate ;
115 void yaz_nfa_destroy(yaz_nfa *n) {
117 nmem_destroy(n->nmem);
122 * Low-level interface to building the NFA
125 yaz_nfa_state *yaz_nfa_add_state(yaz_nfa *n) {
126 yaz_nfa_state *s = nmem_malloc(n->nmem, sizeof(yaz_nfa_state));
127 s->num = (n->nstates)++;
130 s->backref_start = 0;
133 s->next = n->laststate->next;
134 n->laststate->next = s;
136 } else { /* first state */
144 int yaz_nfa_set_result(yaz_nfa *n, yaz_nfa_state *s, void *result) {
145 if ((s->result)&&result)
146 return YAZ_NFA_ALREADY;
151 void *yaz_nfa_get_result(yaz_nfa *n, yaz_nfa_state *s) {
157 int yaz_nfa_set_backref_point(yaz_nfa *n, yaz_nfa_state *s,
161 if (s->backref_start)
162 return YAZ_NFA_ALREADY;
163 s->backref_start = backref_number;
164 if (n->nbackrefs<=backref_number) {
165 n->nbackrefs = backref_number+1;
166 n->curr_backrefs = 0;
167 n->best_backrefs = 0;
168 /* clear them just in case we have already matched on */
169 /* with this nfa, and created a too small backref table */
170 /* we will reallocate when matching. */
174 return YAZ_NFA_ALREADY;
175 if (n->nbackrefs<=backref_number)
176 return YAZ_NFA_NOSTART;
177 s->backref_end = backref_number;
182 int yaz_nfa_get_backref_point(yaz_nfa *n, yaz_nfa_state *s,
187 return s->backref_start;
189 return s->backref_end;
192 void yaz_nfa_add_transition(yaz_nfa *n,
193 yaz_nfa_state *from_state,
194 yaz_nfa_state *to_state,
195 yaz_nfa_char range_start,
196 yaz_nfa_char range_end) {
197 yaz_nfa_transition *t = nmem_malloc(n->nmem, sizeof(yaz_nfa_transition));
199 from_state = n->firststate;
200 t->range_start = range_start;
201 t->range_end = range_end;
202 t->to_state = to_state;
203 if (from_state->lasttrans) {
204 t->next= from_state->lasttrans->next;
205 from_state->lasttrans->next = t;
206 from_state->lasttrans = t;
207 } else { /* first trans */
208 from_state->lasttrans = t;
213 void yaz_nfa_add_empty_transition( yaz_nfa *n,
214 yaz_nfa_state *from_state,
215 yaz_nfa_state *to_state) {
216 yaz_nfa_add_transition(n, from_state, to_state,
217 EMPTY_START, EMPTY_END);
221 * Medium-level interface
224 /* Finds a transition from s where the range is exactly c..c */
225 /* There should only be one such transition */
226 static yaz_nfa_state *find_single_trans(
228 yaz_nfa_char range_start,
229 yaz_nfa_char range_end) {
230 yaz_nfa_transition *t = s->lasttrans;
235 if ( ( t->range_start == range_start ) && ( t->range_end == range_end) )
237 } while (t != s->lasttrans );
242 yaz_nfa_state *yaz_nfa_add_range(yaz_nfa *n,
244 yaz_nfa_char range_start,
245 yaz_nfa_char range_end) {
246 yaz_nfa_state *nextstate=0;
247 if (!s) /* default to top-level of the nfa */
250 nextstate = find_single_trans(s, range_start, range_end);
252 s = yaz_nfa_add_state(n); /* create initial state */
254 nextstate = yaz_nfa_add_state(n);
255 yaz_nfa_add_transition(n, s, nextstate, range_start, range_end);
260 yaz_nfa_state *yaz_nfa_add_sequence(yaz_nfa *n,
264 yaz_nfa_state *nextstate=0;
265 if (!s) /* default to top-level of the nfa */
268 nextstate = find_single_trans(s, *seq, *seq);
272 if (!seq_len) /* whole sequence matched */
275 return yaz_nfa_add_sequence(n, nextstate, seq,seq_len);
276 } else { /* no next state, build the rest */
278 s = yaz_nfa_add_range(n, s, *seq, *seq);
284 return 0; /* compiler shut up, we return somewhere above */
295 yaz_nfa_char *longest;
299 int empties; /* count how many consecutive empty transitions */
302 /* Finds the longest match. In case of ties, chooses the one with the
303 * lowest numbered state. Keep track of the back references. Recursively
304 * traverses the NFA. Keeps track of the best hit it has found. */
306 static void match_state(
308 yaz_nfa_char *prevmatch,
309 yaz_nfa_char *inchar,
311 struct matcher *m ) {
312 yaz_nfa_transition *t = s->lasttrans;
313 if (s->backref_start)
314 m->n->curr_backrefs[s->backref_start].start = inchar;
316 m->n->curr_backrefs[s->backref_end].end = prevmatch;
321 if ( (( t->range_start <= *inchar ) &&
322 ( t->range_end >= *inchar )) ){
324 if (t->range_start!=t->range_end){
325 /* backref 0 is special: the last range operation */
326 m->n->curr_backrefs[0].start = inchar;
327 m->n->curr_backrefs[0].end = inchar;
329 match_state(t->to_state, inchar,
330 inchar+1, incharsleft-1, m);
331 } else if (( t->range_start==EMPTY_START) &&
332 (t->range_end==EMPTY_END)) {
333 if ( m->empties++ > LOOP_LIMIT )
334 m->errorcode= YAZ_NFA_LOOP;
336 match_state(t->to_state, prevmatch,
337 inchar, incharsleft, m);
339 } while (t != s->lasttrans );
341 m->errorcode = YAZ_NFA_OVERRUN;
344 if (s->result) { /* terminal node */
345 if ( (m->longest < inchar) || /* longer result */
346 ((m->longest == inchar)&&(m->bestnode<s->num)) ){
347 /* or as long, but with lower node number. Still better */
350 m->bestnode = s->num;
351 m->result = s->result;
352 if (m->n->curr_backrefs)
353 for (i = 0; i<m->n->nbackrefs; i++) {
354 m->n->best_backrefs[i]=m->n->curr_backrefs[i];
358 if (s->backref_start)
359 m->n->curr_backrefs[s->backref_start].start = 0;
361 m->n->curr_backrefs[s->backref_end].end = 0;
362 m->n->curr_backrefs[0].start = 0;
363 m->n->curr_backrefs[0].end = 0;
366 int yaz_nfa_match(yaz_nfa *n,
367 yaz_nfa_char **inbuff,
373 if (!n->firststate) {
374 n->lastmatch = YAZ_NFA_NOMATCH;
379 m.bestnode = n->nstates;
381 m.errorcode = YAZ_NFA_SUCCESS;
383 sz = sizeof( struct yaz_nfa_backref_info) * n->nbackrefs;
384 if (!n->curr_backrefs) {
385 n->curr_backrefs = nmem_malloc( n->nmem, sz);
386 n->best_backrefs = nmem_malloc( n->nmem, sz);
388 for (i = 0; i<n->nbackrefs; i++) {
389 n->curr_backrefs[i].start = 0;
390 n->curr_backrefs[i].end = 0;
391 n->best_backrefs[i].start = 0;
392 n->best_backrefs[i].end = 0;
395 match_state(n->firststate, *inbuff, *inbuff, *incharsleft, &m);
396 if (m.errorcode==YAZ_NFA_SUCCESS) {
398 m.errorcode=YAZ_NFA_NOMATCH;
400 *incharsleft -= (m.longest-*inbuff);
405 n->lastmatch=m.errorcode;
410 int yaz_nfa_get_backref( yaz_nfa *n,
412 yaz_nfa_char **start,
413 yaz_nfa_char **end) {
414 if (backref_no >= n->nbackrefs)
415 return YAZ_NFA_NOSUCHBACKREF;
417 return YAZ_NFA_NOSUCHBACKREF;
418 if (n->lastmatch != YAZ_NFA_SUCCESS)
419 return YAZ_NFA_NOMATCH;
421 *start = n->best_backrefs[backref_no].start;
422 *end = n->best_backrefs[backref_no].end;
426 /* * * * * * * * * * * * * *
428 * * * * * * * * * * * * * */
430 static yaz_nfa_converter *create_null_converter ( yaz_nfa *n) {
431 yaz_nfa_converter *c;
432 c=nmem_malloc(n->nmem, sizeof(yaz_nfa_converter));
442 yaz_nfa_converter *yaz_nfa_create_string_converter (
444 yaz_nfa_char *string,
446 yaz_nfa_converter *c;
448 c=create_null_converter(n);
450 c->string=nmem_malloc(n->nmem, length*sizeof(yaz_nfa_char));
451 for (i=0;i<length;i++)
452 c->string[i]=string[i];
457 yaz_nfa_converter *yaz_nfa_create_backref_converter (
458 yaz_nfa *n, int backref_no ) {
459 yaz_nfa_converter *c;
460 c=create_null_converter(n);
461 c->type=conv_backref;
462 c->backref_no=backref_no;
466 yaz_nfa_converter *yaz_nfa_create_range_converter (
467 yaz_nfa *n, int backref_no,
468 yaz_nfa_char from_char,
469 yaz_nfa_char to_char){
470 yaz_nfa_converter *c;
471 c=create_null_converter(n);
473 c->backref_no=backref_no;
474 c->char_diff=to_char - from_char;
480 void yaz_nfa_append_converter (
482 yaz_nfa_converter *startpoint,
483 yaz_nfa_converter *newconverter) {
484 while (startpoint->next)
485 startpoint=startpoint->next;
486 startpoint->next=newconverter;
489 static int string_convert (
491 yaz_nfa_converter *c,
492 yaz_nfa_char **outbuff,
493 size_t *outcharsleft){
495 yaz_nfa_char *p=c->string;
497 if ((*outcharsleft)-- <= 0)
498 return YAZ_NFA_NOSPACE;
502 return YAZ_NFA_SUCCESS;
504 static int backref_convert (
506 yaz_nfa_converter *c,
507 yaz_nfa_char **outbuff,
508 size_t *outcharsleft){
509 yaz_nfa_char *cp1,*cp2;
511 i = yaz_nfa_get_backref(n,c->backref_no, &cp1, &cp2);
512 if ( i == YAZ_NFA_NOSUCHBACKREF) /* no backref, produce no output */
513 return YAZ_NFA_SUCCESS;
514 if ( i == YAZ_NFA_NOMATCH ) /* no match in dfa */
515 return 1; /* should not happen */
517 if ((*outcharsleft)-- <= 0)
518 return YAZ_NFA_NOSPACE;
522 return YAZ_NFA_SUCCESS;
525 static int range_convert (
527 yaz_nfa_converter *c,
528 yaz_nfa_char **outbuff,
529 size_t *outcharsleft){
530 yaz_nfa_char *cp1=0, *cp2=0;
532 i = yaz_nfa_get_backref(n,c->backref_no, &cp1, &cp2);
533 if (i == YAZ_NFA_NOSUCHBACKREF) /* no backref, produce no output, not ok */
534 return YAZ_NFA_NOSUCHBACKREF; /* should not happen */
535 if (i == YAZ_NFA_NOMATCH) /* no match in dfa */
536 return YAZ_NFA_NOMATCH; /* should not happen */
538 if ((*outcharsleft)-- <= 0)
539 return YAZ_NFA_NOSPACE;
540 **outbuff=(*cp1++) + c->char_diff ;
543 return YAZ_NFA_SUCCESS;
547 int yaz_nfa_run_converters(
549 yaz_nfa_converter *c,
550 yaz_nfa_char **outbuff,
551 size_t *outcharsleft){
556 rc=string_convert(n,c,outbuff,outcharsleft);
559 rc=backref_convert(n,c,outbuff,outcharsleft);
562 rc=range_convert(n,c,outbuff,outcharsleft);
565 rc=YAZ_NFA_INTERNAL; /* should never happen */
573 * High-level interface
574 * These routines build the nfa and add converters, all
578 int yaz_nfa_add_string_rule( yaz_nfa *n,
579 yaz_nfa_char *from_string,
581 yaz_nfa_char *to_string,
584 yaz_nfa_add_sequence(n, 0, from_string,from_length);
585 yaz_nfa_converter *c=
586 yaz_nfa_create_string_converter(n,to_string,to_length);
587 return yaz_nfa_set_result(n,s,c);
590 int yaz_nfa_add_ascii_string_rule( yaz_nfa *n,
593 size_t from_len = strlen(from_string);
594 size_t to_len = strlen(to_string);
595 yaz_nfa_char *from_buf=
596 nmem_malloc(n->nmem, from_len*sizeof(yaz_nfa_char));
597 yaz_nfa_char *to_buf=
598 nmem_malloc(n->nmem, to_len*sizeof(yaz_nfa_char));
600 for (i=0;i<from_len;i++)
601 from_buf[i]=from_string[i];
602 for (i=0;i<to_len;i++)
603 to_buf[i]=to_string[i];
604 return yaz_nfa_add_string_rule(n,from_buf, from_len,
608 int yaz_nfa_add_char_range_rule (yaz_nfa *n,
609 yaz_nfa_char range_start,
610 yaz_nfa_char range_end,
611 yaz_nfa_char output_range_start) {
613 yaz_nfa_add_range(n, 0, range_start, range_end);
614 yaz_nfa_converter *c=
615 yaz_nfa_create_range_converter(n,0,range_start, output_range_start);
616 return yaz_nfa_set_result(n,s,c);
619 int yaz_nfa_add_char_string_rule (yaz_nfa *n,
620 yaz_nfa_char range_start,
621 yaz_nfa_char range_end,
622 yaz_nfa_char* to_string,
625 yaz_nfa_add_range(n, 0, range_start, range_end);
626 yaz_nfa_converter *c=
627 yaz_nfa_create_string_converter(n,to_string,to_length);
628 return yaz_nfa_set_result(n,s,c);
632 int yaz_nfa_convert_slice (yaz_nfa *n,
633 yaz_nfa_char **inbuff,
635 yaz_nfa_char **outbuff,
636 size_t *outcharsleft) {
638 yaz_nfa_converter *conv;
641 if (*outcharsleft==0)
642 rc=YAZ_NFA_NOSPACE; /* no room in outbuff */
643 else if (*incharsleft==0)
644 rc = YAZ_NFA_SUCCESS; /* all done */
646 rc=yaz_nfa_match(n, inbuff, incharsleft, &resptr);
647 if (rc==YAZ_NFA_SUCCESS) {
648 conv= (yaz_nfa_converter *)resptr;
649 rc=yaz_nfa_run_converters(n,conv,outbuff,outcharsleft);
650 } else if (rc==YAZ_NFA_NOMATCH) {
651 **outbuff = **inbuff;
658 /* else just return the error code */
667 yaz_nfa_state *yaz_nfa_get_first(yaz_nfa *n){
670 return n->firststate;
673 yaz_nfa_state *yaz_nfa_get_next(yaz_nfa *n, yaz_nfa_state *s){
683 static void dump_trans(FILE *F, yaz_nfa_transition *t ) {
690 if ( (t->range_start <= ' ') || (t->range_start>'z'))
692 if ( (t->range_end <= ' ') || (t->range_end>'z'))
694 if ((t->range_start==EMPTY_START) && (t->range_end==EMPTY_END)) {
697 fprintf(F, " -> [%d] %s '%c' %x - '%c' %x \n",
703 static void dump_state(FILE *F, yaz_nfa_state *s,
704 char *(*strfunc)(void *) ) {
705 yaz_nfa_transition *t;
706 char *resultstring = "";
709 resultstring = (*strfunc)(s->result);
712 resultstring = s->result;
714 fprintf(F, " state [%d] %s %s",
715 s->num, s->result?"(final)":"", resultstring );
716 if (s->backref_start) {
717 fprintf(F, " start-%d", s->backref_start);
719 if (s->backref_end) {
720 fprintf(F, " end-%d", s->backref_end);
725 fprintf(F, " (no transitions)\n");
730 } while (t != s->lasttrans);
735 void yaz_nfa_dump(FILE *F, yaz_nfa *n,
736 char *(*strfunc)(void *) ) {
738 if (!F) /* lazy programmers can just pass 0 for F */
740 fprintf(F, "The NFA has %d states and %d backrefs\n",
741 n->nstates, n->nbackrefs);
746 dump_state(F, s, strfunc);
747 } while (s != n->laststate);
755 * indent-tabs-mode: nil
757 * vim: shiftwidth=4 tabstop=8 expandtab