1 /* Copyright (C) 2006, Index Data ApS
2 * See the file LICENSE for details.
4 * $Id: nfa.c,v 1.9 2006-05-10 13:58:46 heikki Exp $
9 * \brief NFA for character set normalizing
11 * This is a simple NFA-based system for character set normalizing
12 * in yaz and zebra. Unlike standard NFA's, this operates on ranges of
13 * characters at a time, so as to keep the size small.
15 * All characters are internally handled as unsigned 32-bit ints, so
16 * this works well with unicode. Translating to and from utf-8 is trivial, if
19 * The NFA stores a translation-thing in the terminal nodes. It does not
20 * concern itself with what that thing is, for us it is juts a void*
22 * The module consists of two parts: Building the NFA, and translating
36 typedef struct yaz_nfa_backref_info yaz_backref_info;
38 struct yaz_nfa_backref_info {
45 int nstates; /* how many states do we have */
46 int nbackrefs; /* how many backrefs do we have */
47 yaz_nfa_state *laststate; /* points to the last in the circular list */
48 yaz_nfa_state *firststate; /* the initial state */
49 yaz_backref_info *curr_backrefs; /* the backrefs we are matching */
50 yaz_backref_info *best_backrefs; /* backrefs of the best match so far*/
51 int lastmatch; /* the result of last match */
54 struct yaz_nfa_state {
55 int num; /* state number. used for resoving ambiguities, and for dumping */
56 void *result; /* signals a terminal state, and gives the result */
57 yaz_nfa_transition *lasttrans; /* circular list of transitions */
58 yaz_nfa_state *next; /* Circular linked list */
59 int backref_start; /* which backreference starts here. 0 if none */
60 int backref_end; /* which backreference ends here. 0 if none */
63 struct yaz_nfa_transition {
64 yaz_nfa_state *to_state; /* where to */
65 yaz_nfa_char range_start;
66 yaz_nfa_char range_end;
67 yaz_nfa_transition *next; /* a linked list of them */
70 /* a range from 1 down to 0 is impossible, and is used to denote an */
71 /* empty (epsilon) transition */
75 /* a limit for the number of empty transitions we can have in a row before */
76 /* we declare this a loop */
77 #define LOOP_LIMIT 100
85 } yaz_nfa_converter_type;
87 struct yaz_nfa_converter {
88 yaz_nfa_converter *next;
89 yaz_nfa_converter_type type;
98 * Initializing and destroying whole nfa's
101 yaz_nfa *yaz_nfa_init() {
102 NMEM my_nmem = nmem_create();
103 yaz_nfa *n = nmem_malloc(my_nmem, sizeof(yaz_nfa));
105 n->nbackrefs = 1; /* we always have #0, last range match */
106 n->curr_backrefs = 0;
107 n->best_backrefs = 0;
108 n->lastmatch = YAZ_NFA_NOMATCH;
111 n->firststate = n->laststate ;
115 void yaz_nfa_destroy(yaz_nfa *n) {
116 nmem_destroy(n->nmem);
121 * Low-level interface to building the NFA
124 yaz_nfa_state *yaz_nfa_add_state(yaz_nfa *n) {
125 yaz_nfa_state *s = nmem_malloc(n->nmem, sizeof(yaz_nfa_state));
126 s->num = (n->nstates)++;
129 s->backref_start = 0;
132 s->next = n->laststate->next;
133 n->laststate->next = s;
135 } else { /* first state */
143 int yaz_nfa_set_result(yaz_nfa *n, yaz_nfa_state *s, void *result) {
144 if ((s->result)&&result)
145 return YAZ_NFA_ALREADY;
150 void *yaz_nfa_get_result(yaz_nfa *n, yaz_nfa_state *s) {
156 int yaz_nfa_set_backref_point(yaz_nfa *n, yaz_nfa_state *s,
160 if (s->backref_start)
161 return YAZ_NFA_ALREADY;
162 s->backref_start = backref_number;
163 if (n->nbackrefs<=backref_number) {
164 n->nbackrefs = backref_number+1;
165 n->curr_backrefs = 0;
166 n->best_backrefs = 0;
167 /* clear them just in case we have already matched on */
168 /* with this nfa, and created a too small backref table */
169 /* we will reallocate when matching. */
173 return YAZ_NFA_ALREADY;
174 if (n->nbackrefs<=backref_number)
175 return YAZ_NFA_NOSTART;
176 s->backref_end = backref_number;
181 int yaz_nfa_get_backref_point(yaz_nfa *n, yaz_nfa_state *s,
186 return s->backref_start;
188 return s->backref_end;
191 void yaz_nfa_add_transition(yaz_nfa *n,
192 yaz_nfa_state *from_state,
193 yaz_nfa_state *to_state,
194 yaz_nfa_char range_start,
195 yaz_nfa_char range_end) {
196 yaz_nfa_transition *t = nmem_malloc(n->nmem, sizeof(yaz_nfa_transition));
198 from_state = n->firststate;
199 t->range_start = range_start;
200 t->range_end = range_end;
201 t->to_state = to_state;
202 if (from_state->lasttrans) {
203 t->next= from_state->lasttrans->next;
204 from_state->lasttrans->next = t;
205 from_state->lasttrans = t;
206 } else { /* first trans */
207 from_state->lasttrans = t;
212 void yaz_nfa_add_empty_transition( yaz_nfa *n,
213 yaz_nfa_state *from_state,
214 yaz_nfa_state *to_state) {
215 yaz_nfa_add_transition(n, from_state, to_state,
216 EMPTY_START, EMPTY_END);
220 * Medium-level interface
223 /* Finds a transition from s where the range is exactly c..c */
224 /* There should only be one such transition */
225 static yaz_nfa_state *find_single_trans(
227 yaz_nfa_char range_start,
228 yaz_nfa_char range_end) {
229 yaz_nfa_transition *t = s->lasttrans;
234 if ( ( t->range_start == range_start ) && ( t->range_end == range_end) )
236 } while (t != s->lasttrans );
241 yaz_nfa_state *yaz_nfa_add_range(yaz_nfa *n,
243 yaz_nfa_char range_start,
244 yaz_nfa_char range_end) {
245 yaz_nfa_state *nextstate=0;
246 if (!s) /* default to top-level of the nfa */
249 nextstate = find_single_trans(s, range_start, range_end);
251 s = yaz_nfa_add_state(n); /* create initial state */
253 nextstate = yaz_nfa_add_state(n);
254 yaz_nfa_add_transition(n, s, nextstate, range_start, range_end);
259 yaz_nfa_state *yaz_nfa_add_sequence(yaz_nfa *n,
263 yaz_nfa_state *nextstate=0;
264 if (!s) /* default to top-level of the nfa */
267 nextstate = find_single_trans(s, *seq, *seq);
271 if (!seq_len) /* whole sequence matched */
274 return yaz_nfa_add_sequence(n, nextstate, seq,seq_len);
275 } else { /* no next state, build the rest */
277 s = yaz_nfa_add_range(n, s, *seq, *seq);
283 return 0; /* compiler shut up, we return somewhere above */
294 yaz_nfa_char *longest;
298 int empties; /* count how many consecutive empty transitions */
301 /* Finds the longest match. In case of ties, chooses the one with the
302 * lowest numbered state. Keep track of the back references. Recursively
303 * traverses the NFA. Keeps track of the best hit it has found. */
305 static void match_state(
307 yaz_nfa_char *prevmatch,
308 yaz_nfa_char *inchar,
310 struct matcher *m ) {
311 yaz_nfa_transition *t = s->lasttrans;
312 if (s->backref_start)
313 m->n->curr_backrefs[s->backref_start].start = inchar;
315 m->n->curr_backrefs[s->backref_end].end = prevmatch;
320 if ( (( t->range_start <= *inchar ) &&
321 ( t->range_end >= *inchar )) ){
323 if (t->range_start!=t->range_end){
324 /* backref 0 is special: the last range operation */
325 m->n->curr_backrefs[0].start = inchar;
326 m->n->curr_backrefs[0].end = inchar;
328 match_state(t->to_state, inchar,
329 inchar+1, incharsleft-1, m);
330 } else if (( t->range_start==EMPTY_START) &&
331 (t->range_end==EMPTY_END)) {
332 if ( m->empties++ > LOOP_LIMIT )
333 m->errorcode= YAZ_NFA_LOOP;
335 match_state(t->to_state, prevmatch,
336 inchar, incharsleft, m);
338 } while (t != s->lasttrans );
340 m->errorcode = YAZ_NFA_OVERRUN;
343 if (s->result) { /* terminal node */
344 if ( (m->longest < inchar) || /* longer result */
345 ((m->longest == inchar)&&(m->bestnode<s->num)) ){
346 /* or as long, but with lower node number. Still better */
349 m->bestnode = s->num;
350 m->result = s->result;
351 if (m->n->curr_backrefs)
352 for (i = 0; i<m->n->nbackrefs; i++) {
353 m->n->best_backrefs[i]=m->n->curr_backrefs[i];
357 if (s->backref_start)
358 m->n->curr_backrefs[s->backref_start].start = 0;
360 m->n->curr_backrefs[s->backref_end].end = 0;
361 m->n->curr_backrefs[0].start = 0;
362 m->n->curr_backrefs[0].end = 0;
365 int yaz_nfa_match(yaz_nfa *n,
366 yaz_nfa_char **inbuff,
372 if (!n->firststate) {
373 n->lastmatch = YAZ_NFA_NOMATCH;
378 m.bestnode = n->nstates;
380 m.errorcode = YAZ_NFA_SUCCESS;
382 sz = sizeof( struct yaz_nfa_backref_info) * n->nbackrefs;
383 if (!n->curr_backrefs) {
384 n->curr_backrefs = nmem_malloc( n->nmem, sz);
385 n->best_backrefs = nmem_malloc( n->nmem, sz);
387 for (i = 0; i<n->nbackrefs; i++) {
388 n->curr_backrefs[i].start = 0;
389 n->curr_backrefs[i].end = 0;
390 n->best_backrefs[i].start = 0;
391 n->best_backrefs[i].end = 0;
394 match_state(n->firststate, *inbuff, *inbuff, *incharsleft, &m);
395 if (m.errorcode==YAZ_NFA_SUCCESS) {
397 m.errorcode=YAZ_NFA_NOMATCH;
399 *incharsleft -= (m.longest-*inbuff);
404 n->lastmatch=m.errorcode;
409 int yaz_nfa_get_backref( yaz_nfa *n,
411 yaz_nfa_char **start,
412 yaz_nfa_char **end) {
413 if (backref_no >= n->nbackrefs)
414 return YAZ_NFA_NOSUCHBACKREF;
416 return YAZ_NFA_NOSUCHBACKREF;
417 if (n->lastmatch != YAZ_NFA_SUCCESS)
418 return YAZ_NFA_NOMATCH;
420 *start = n->best_backrefs[backref_no].start;
421 *end = n->best_backrefs[backref_no].end;
425 /* * * * * * * * * * * * * *
427 * * * * * * * * * * * * * */
429 static yaz_nfa_converter *create_null_converter ( yaz_nfa *n) {
430 yaz_nfa_converter *c;
431 c=nmem_malloc(n->nmem, sizeof(yaz_nfa_converter));
441 yaz_nfa_converter *yaz_nfa_create_string_converter (
443 yaz_nfa_char *string,
445 yaz_nfa_converter *c;
447 c=create_null_converter(n);
449 c->string=nmem_malloc(n->nmem, length*sizeof(yaz_nfa_char));
450 for (i=0;i<length;i++)
451 c->string[i]=string[i];
456 yaz_nfa_converter *yaz_nfa_create_backref_converter (
457 yaz_nfa *n, int backref_no ) {
458 yaz_nfa_converter *c;
459 c=create_null_converter(n);
460 c->type=conv_backref;
461 c->backref_no=backref_no;
465 yaz_nfa_converter *yaz_nfa_create_range_converter (
466 yaz_nfa *n, int backref_no,
467 yaz_nfa_char from_char,
468 yaz_nfa_char to_char){
469 yaz_nfa_converter *c;
470 c=create_null_converter(n);
472 c->backref_no=backref_no;
473 c->char_diff=to_char - from_char;
479 void yaz_nfa_append_converter (
481 yaz_nfa_converter *startpoint,
482 yaz_nfa_converter *newconverter) {
483 while (startpoint->next)
484 startpoint=startpoint->next;
485 startpoint->next=newconverter;
488 static int string_convert (
490 yaz_nfa_converter *c,
491 yaz_nfa_char **outbuff,
492 size_t *outcharsleft){
494 yaz_nfa_char *p=c->string;
496 if ((*outcharsleft)-- <= 0)
497 return YAZ_NFA_NOSPACE;
501 return YAZ_NFA_SUCCESS;
503 static int backref_convert (
505 yaz_nfa_converter *c,
506 yaz_nfa_char **outbuff,
507 size_t *outcharsleft){
508 yaz_nfa_char *cp1,*cp2;
510 i = yaz_nfa_get_backref(n,c->backref_no, &cp1, &cp2);
511 if ( i == YAZ_NFA_NOSUCHBACKREF) /* no backref, produce no output */
512 return YAZ_NFA_SUCCESS;
513 if ( i == YAZ_NFA_NOMATCH ) /* no match in dfa */
514 return 1; /* should not happen */
516 if ((*outcharsleft)-- <= 0)
517 return YAZ_NFA_NOSPACE;
521 return YAZ_NFA_SUCCESS;
524 static int range_convert (
526 yaz_nfa_converter *c,
527 yaz_nfa_char **outbuff,
528 size_t *outcharsleft){
529 yaz_nfa_char *cp1=0, *cp2=0;
531 i = yaz_nfa_get_backref(n,c->backref_no, &cp1, &cp2);
532 if (i == YAZ_NFA_NOSUCHBACKREF) /* no backref, produce no output, not ok */
533 return YAZ_NFA_NOSUCHBACKREF; /* should not happen */
534 if (i == YAZ_NFA_NOMATCH) /* no match in dfa */
535 return YAZ_NFA_NOMATCH; /* should not happen */
537 if ((*outcharsleft)-- <= 0)
538 return YAZ_NFA_NOSPACE;
539 **outbuff=(*cp1++) + c->char_diff ;
542 return YAZ_NFA_SUCCESS;
546 int yaz_nfa_run_converters(
548 yaz_nfa_converter *c,
549 yaz_nfa_char **outbuff,
550 size_t *outcharsleft){
555 rc=string_convert(n,c,outbuff,outcharsleft);
558 rc=backref_convert(n,c,outbuff,outcharsleft);
561 rc=range_convert(n,c,outbuff,outcharsleft);
564 rc=YAZ_NFA_INTERNAL; /* should never happen */
572 * High-level interface
573 * These routines build the nfa and add converters, all
577 int yaz_nfa_add_string_rule( yaz_nfa *n,
578 yaz_nfa_char *from_string,
580 yaz_nfa_char *to_string,
583 yaz_nfa_add_sequence(n, 0, from_string,from_length);
584 yaz_nfa_converter *c=
585 yaz_nfa_create_string_converter(n,to_string,to_length);
586 return yaz_nfa_set_result(n,s,c);
589 int yaz_nfa_add_ascii_string_rule( yaz_nfa *n,
592 size_t from_len = strlen(from_string);
593 size_t to_len = strlen(to_string);
594 yaz_nfa_char *from_buf=
595 nmem_malloc(n->nmem, from_len*sizeof(yaz_nfa_char));
596 yaz_nfa_char *to_buf=
597 nmem_malloc(n->nmem, to_len*sizeof(yaz_nfa_char));
599 for (i=0;i<from_len;i++)
600 from_buf[i]=from_string[i];
601 for (i=0;i<to_len;i++)
602 to_buf[i]=to_string[i];
603 return yaz_nfa_add_string_rule(n,from_buf, from_len,
607 int yaz_nfa_add_char_range_rule (yaz_nfa *n,
608 yaz_nfa_char range_start,
609 yaz_nfa_char range_end,
610 yaz_nfa_char output_range_start) {
612 yaz_nfa_add_range(n, 0, range_start, range_end);
613 yaz_nfa_converter *c=
614 yaz_nfa_create_range_converter(n,0,range_start, output_range_start);
615 return yaz_nfa_set_result(n,s,c);
618 int yaz_nfa_add_char_string_rule (yaz_nfa *n,
619 yaz_nfa_char range_start,
620 yaz_nfa_char range_end,
621 yaz_nfa_char* to_string,
624 yaz_nfa_add_range(n, 0, range_start, range_end);
625 yaz_nfa_converter *c=
626 yaz_nfa_create_string_converter(n,to_string,to_length);
627 return yaz_nfa_set_result(n,s,c);
631 int yaz_nfa_convert_slice (yaz_nfa *n,
632 yaz_nfa_char **inbuff,
634 yaz_nfa_char **outbuff,
635 size_t *outcharsleft) {
637 yaz_nfa_converter *conv;
640 if (*outcharsleft==0)
641 rc=YAZ_NFA_NOSPACE; /* no room in outbuff */
642 else if (*incharsleft==0)
643 rc = YAZ_NFA_SUCCESS; /* all done */
645 rc=yaz_nfa_match(n, inbuff, incharsleft, &resptr);
646 if (rc==YAZ_NFA_SUCCESS) {
647 conv= (yaz_nfa_converter *)resptr;
648 rc=yaz_nfa_run_converters(n,conv,outbuff,outcharsleft);
649 } else if (rc==YAZ_NFA_NOMATCH) {
650 **outbuff = **inbuff;
657 /* else just return the error code */
666 yaz_nfa_state *yaz_nfa_get_first(yaz_nfa *n){
669 return n->firststate;
672 yaz_nfa_state *yaz_nfa_get_next(yaz_nfa *n, yaz_nfa_state *s){
682 static void dump_trans(FILE *F, yaz_nfa_transition *t ) {
689 if ( (t->range_start <= ' ') || (t->range_start>'z'))
691 if ( (t->range_end <= ' ') || (t->range_end>'z'))
693 if ((t->range_start==EMPTY_START) && (t->range_end==EMPTY_END)) {
696 fprintf(F, " -> [%d] %s '%c' %x - '%c' %x \n",
702 static void dump_state(FILE *F, yaz_nfa_state *s,
703 char *(*strfunc)(void *) ) {
704 yaz_nfa_transition *t;
705 char *resultstring = "";
708 resultstring = (*strfunc)(s->result);
711 resultstring = s->result;
713 fprintf(F, " state [%d] %s %s",
714 s->num, s->result?"(final)":"", resultstring );
715 if (s->backref_start) {
716 fprintf(F, " start-%d", s->backref_start);
718 if (s->backref_end) {
719 fprintf(F, " end-%d", s->backref_end);
724 fprintf(F, " (no transitions)\n");
729 } while (t != s->lasttrans);
734 void yaz_nfa_dump(FILE *F, yaz_nfa *n,
735 char *(*strfunc)(void *) ) {
737 if (!F) /* lazy programmers can just pass 0 for F */
739 fprintf(F, "The NFA has %d states and %d backrefs\n",
740 n->nstates, n->nbackrefs);
745 dump_state(F, s, strfunc);
746 } while (s != n->laststate);
754 * indent-tabs-mode: nil
756 * vim: shiftwidth=4 tabstop=8 expandtab