1 /* Copyright (C) 2006, Index Data ApS
2 * See the file LICENSE for details.
4 * $Id: nfa.c,v 1.7 2006-05-05 14:02:27 heikki Exp $
9 * \brief NFA for character set normalizing
11 * This is a simple NFA-based system for character set normalizing
12 * in yaz and zebra. Unlike standard NFA's, this operates on ranges of
13 * characters at a time, so as to keep the size small.
15 * All characters are internally handled as unsigned 32-bit ints, so
16 * this works well with unicode. Translating to and from utf-8 is trivial, if
19 * The NFA stores a translation-thing in the terminal nodes. It does not
20 * concern itself with what that thing is, for us it is juts a void*
22 * The module consists of two parts: Building the NFA, and translating
34 typedef struct yaz_nfa_backref_info yaz_backref_info;
36 struct yaz_nfa_backref_info {
43 int nstates; /* how many states do we have */
44 int nbackrefs; /* how many backrefs do we have */
45 yaz_nfa_state *laststate; /* points to the last in the circular list */
46 yaz_nfa_state *firststate; /* the initial state */
47 yaz_backref_info *curr_backrefs; /* the backrefs we are matching */
48 yaz_backref_info *best_backrefs; /* backrefs of the best match so far*/
49 int lastmatch; /* the result of last match */
52 struct yaz_nfa_state {
53 int num; /* state number. used for resoving ambiguities, and for dumping */
54 void *result; /* signals a terminal state, and gives the result */
55 yaz_nfa_transition *lasttrans; /* circular list of transitions */
56 yaz_nfa_state *next; /* Circular linked list */
57 int backref_start; /* which backreference starts here. 0 if none */
58 int backref_end; /* which backreference ends here. 0 if none */
61 struct yaz_nfa_transition {
62 yaz_nfa_state *to_state; /* where to */
63 yaz_nfa_char range_start;
64 yaz_nfa_char range_end;
65 yaz_nfa_transition *next; /* a linked list of them */
68 /* a range from 1 down to 0 is impossible, and is used to denote an */
69 /* empty (epsilon) transition */
73 /* a limit for the number of empty transitions we can have in a row before */
74 /* we declare this a loop */
75 #define LOOP_LIMIT 100
83 } yaz_nfa_converter_type;
85 struct yaz_nfa_converter {
86 yaz_nfa_converter *next;
87 yaz_nfa_converter_type type;
96 * Initializing and destroying whole nfa's
99 yaz_nfa *yaz_nfa_init() {
100 NMEM my_nmem = nmem_create();
101 yaz_nfa *n = nmem_malloc(my_nmem, sizeof(yaz_nfa));
107 n->curr_backrefs = 0;
108 n->best_backrefs = 0;
109 n->lastmatch = YAZ_NFA_NOMATCH;
113 void yaz_nfa_destroy(yaz_nfa *n) {
114 nmem_destroy(n->nmem);
119 * Low-level interface to building the NFA
122 yaz_nfa_state *yaz_nfa_add_state(yaz_nfa *n) {
123 yaz_nfa_state *s = nmem_malloc(n->nmem, sizeof(yaz_nfa_state));
124 s->num = (n->nstates)++;
127 s->backref_start = 0;
130 s->next = n->laststate->next;
131 n->laststate->next = s;
133 } else { /* first state */
141 int yaz_nfa_set_result(yaz_nfa *n, yaz_nfa_state *s, void *result) {
142 if ((s->result)&&result)
148 void *yaz_nfa_get_result(yaz_nfa *n, yaz_nfa_state *s) {
154 int yaz_nfa_set_backref_point(yaz_nfa *n, yaz_nfa_state *s,
158 if (s->backref_start)
160 s->backref_start = backref_number;
161 if (n->nbackrefs<=backref_number) {
162 n->nbackrefs = backref_number+1;
163 n->curr_backrefs = 0;
164 n->best_backrefs = 0;
165 /* clear them just in case we have already matched on */
166 /* with this nfa, and created a too small backref table */
167 /* we will reallocate when matching. */
172 if (n->nbackrefs<backref_number)
173 return 2; /* can't end a backref that has not been started */
174 s->backref_end = backref_number;
179 int yaz_nfa_get_backref_point(yaz_nfa *n, yaz_nfa_state *s,
184 return s->backref_start;
186 return s->backref_end;
189 void yaz_nfa_add_transition(yaz_nfa *n,
190 yaz_nfa_state *from_state,
191 yaz_nfa_state *to_state,
192 yaz_nfa_char range_start,
193 yaz_nfa_char range_end) {
194 yaz_nfa_transition *t = nmem_malloc(n->nmem, sizeof(yaz_nfa_transition));
196 from_state = n->firststate;
197 t->range_start = range_start;
198 t->range_end = range_end;
199 t->to_state = to_state;
200 if (from_state->lasttrans) {
201 t->next= from_state->lasttrans->next;
202 from_state->lasttrans->next = t;
203 from_state->lasttrans = t;
204 } else { /* first trans */
205 from_state->lasttrans = t;
210 void yaz_nfa_add_empty_transition( yaz_nfa *n,
211 yaz_nfa_state *from_state,
212 yaz_nfa_state *to_state) {
213 yaz_nfa_add_transition(n, from_state, to_state,
214 EMPTY_START, EMPTY_END);
218 * Medium-level interface
221 /* Finds a transition from s where the range is exactly c..c */
222 /* There should only be one such transition */
223 static yaz_nfa_state *find_single_trans(
225 yaz_nfa_char range_start,
226 yaz_nfa_char range_end) {
227 yaz_nfa_transition *t = s->lasttrans;
232 if ( ( t->range_start == range_start ) && ( t->range_end == range_end) )
234 } while (t != s->lasttrans );
239 yaz_nfa_state *yaz_nfa_add_range(yaz_nfa *n,
241 yaz_nfa_char range_start,
242 yaz_nfa_char range_end) {
243 yaz_nfa_state *nextstate;
244 if (!s) /* default to top-level of the nfa */
246 nextstate = find_single_trans(s, range_start, range_end);
248 nextstate = yaz_nfa_add_state(n);
249 yaz_nfa_add_transition(n, s, nextstate, range_start, range_end);
254 yaz_nfa_state *yaz_nfa_add_sequence(yaz_nfa *n,
257 yaz_nfa_state *nextstate;
258 if (!s) /* default to top-level of the nfa */
260 nextstate = find_single_trans(s, *seq, *seq);
263 if (!*seq) /* whole sequence matched */
266 return yaz_nfa_add_sequence(n, nextstate, seq);
267 } else { /* no next state, build the rest */
269 s = yaz_nfa_add_range(n, s, *seq, *seq);
274 return 0; /* compiler shut up, we return somewhere above */
285 yaz_nfa_char *longest;
289 int empties; /* count how many consecutive empty transitions */
292 /* Finds the longest match. In case of ties, chooses the one with the
293 * lowest numbered state. Keep track of the back references. Recursively
294 * traverses the NFA. Keeps track of the best hit it has found. */
296 static void match_state(
298 yaz_nfa_char *prevmatch,
299 yaz_nfa_char *inchar,
301 struct matcher *m ) {
302 yaz_nfa_transition *t = s->lasttrans;
303 if (s->backref_start)
304 m->n->curr_backrefs[s->backref_start].start = inchar;
306 m->n->curr_backrefs[s->backref_end].end = prevmatch;
311 if ( (( t->range_start <= *inchar ) && ( t->range_end >= *inchar )) ){
313 if (t->range_start!=t->range_end){
314 /* backref 0 is special: the last range operation */
315 m->n->curr_backrefs[0].start = inchar;
316 m->n->curr_backrefs[0].end = inchar;
318 match_state(t->to_state, inchar,
319 inchar+1, incharsleft-1, m);
320 } else if (( t->range_start==EMPTY_START) &&
321 (t->range_end==EMPTY_END)) {
322 if ( m->empties++ > LOOP_LIMIT )
323 m->errorcode= YAZ_NFA_LOOP;
325 match_state(t->to_state, prevmatch,
326 inchar, incharsleft, m);
328 } while (t != s->lasttrans );
330 m->errorcode = YAZ_NFA_OVERRUN;
333 if (s->result) { /* terminal node */
334 if ( (m->longest < inchar) || /* longer result */
335 ((m->longest == inchar)&&(m->bestnode<s->num)) ){
336 /* or as long, but with lower node number. Still better */
339 m->bestnode = s->num;
340 m->result = s->result;
341 if (m->n->curr_backrefs)
342 for (i = 0; i<m->n->nbackrefs; i++) {
343 m->n->best_backrefs[i]=m->n->curr_backrefs[i];
347 if (s->backref_start)
348 m->n->curr_backrefs[s->backref_start].start = 0;
350 m->n->curr_backrefs[s->backref_end].end = 0;
351 m->n->curr_backrefs[0].start = 0;
352 m->n->curr_backrefs[0].end = 0;
355 int yaz_nfa_match(yaz_nfa *n,
356 yaz_nfa_char **inbuff,
362 if (!n->firststate) {
363 n->lastmatch = YAZ_NFA_NOMATCH;
368 m.bestnode = n->nstates;
372 sz = sizeof( struct yaz_nfa_backref_info) * n->nbackrefs;
373 if (!n->curr_backrefs) {
374 n->curr_backrefs = nmem_malloc( n->nmem, sz);
375 n->best_backrefs = nmem_malloc( n->nmem, sz);
377 for (i = 0; i<n->nbackrefs; i++) {
378 n->curr_backrefs[i].start = 0;
379 n->curr_backrefs[i].end = 0;
380 n->best_backrefs[i].start = 0;
381 n->best_backrefs[i].end = 0;
384 match_state(n->firststate, *inbuff, *inbuff, *incharsleft, &m);
386 *incharsleft -= (m.longest-*inbuff);
390 n->lastmatch = m.errorcode;
392 n->lastmatch= YAZ_NFA_SUCCESS;
395 n->lastmatch = YAZ_NFA_NOMATCH;
400 int yaz_nfa_get_backref( yaz_nfa *n,
402 yaz_nfa_char **start,
403 yaz_nfa_char **end) {
404 if (backref_no>=n->nbackrefs)
408 if (n->lastmatch== YAZ_NFA_NOMATCH)
409 return 1; /* accept other errors, they return partial matches*/
411 *start = n->best_backrefs[backref_no].start;
412 *end = n->best_backrefs[backref_no].end;
416 /* * * * * * * * * * * * * *
418 * * * * * * * * * * * * * */
420 static yaz_nfa_converter *create_null_converter ( yaz_nfa *n) {
421 yaz_nfa_converter *c;
422 c=nmem_malloc(n->nmem, sizeof(yaz_nfa_converter));
432 yaz_nfa_converter *yaz_nfa_create_string_converter (
434 yaz_nfa_char *string,
436 yaz_nfa_converter *c;
438 c=create_null_converter(n);
440 c->string=nmem_malloc(n->nmem, length*sizeof(yaz_nfa_char));
441 for (i=0;i<length;i++)
442 c->string[i]=string[i];
447 yaz_nfa_converter *yaz_nfa_create_backref_converter (
448 yaz_nfa *n, int backref_no ) {
449 yaz_nfa_converter *c;
450 c=create_null_converter(n);
451 c->type=conv_backref;
452 c->backref_no=backref_no;
456 yaz_nfa_converter *yaz_nfa_create_range_converter (
457 yaz_nfa *n, int backref_no,
458 yaz_nfa_char from_char,
459 yaz_nfa_char to_char){
460 yaz_nfa_converter *c;
461 c=create_null_converter(n);
463 c->backref_no=backref_no;
464 c->char_diff=to_char - from_char;
470 void yaz_nfa_append_converter (
472 yaz_nfa_converter *startpoint,
473 yaz_nfa_converter *newconverter) {
474 while (startpoint->next)
475 startpoint=startpoint->next;
476 startpoint->next=newconverter;
479 static int string_convert (
481 yaz_nfa_converter *c,
482 yaz_nfa_char **outbuff,
483 size_t *outcharsleft){
485 yaz_nfa_char *p=c->string;
487 if ((*outcharsleft)-- <= 0)
494 static int backref_convert (
496 yaz_nfa_converter *c,
497 yaz_nfa_char **outbuff,
498 size_t *outcharsleft){
499 yaz_nfa_char *cp1,*cp2;
501 i=yaz_nfa_get_backref(n,c->backref_no, &cp1, &cp2);
502 if (i==2) /* no backref, produce no output, that's ok */
504 if (i==1) /* no match in dfa */
505 return 1; /* should not happen */
507 if ((*outcharsleft)-- <= 0)
515 static int range_convert (
517 yaz_nfa_converter *c,
518 yaz_nfa_char **outbuff,
519 size_t *outcharsleft){
520 yaz_nfa_char *cp1,*cp2;
522 i = yaz_nfa_get_backref(n,c->backref_no, &cp1, &cp2);
523 printf ("range_convert: i=%d d=%d, cp1=%p cp2=%p \n",i,c->char_diff,cp1,cp2);
524 if (i == 2) /* no backref, produce no output, not ok */
525 return 1; /* should not happen */
526 if (i == 1) /* no match in dfa */
527 return 1; /* should not happen */
529 if ((*outcharsleft)-- <= 0)
531 printf(" range_convert: %d '%c' -> ",*cp1,*cp1);
532 **outbuff=(*cp1++) + c->char_diff ;
533 printf("%d '%c'\n",**outbuff, **outbuff);
540 int yaz_nfa_run_converters(
542 yaz_nfa_converter *c,
543 yaz_nfa_char **outbuff,
544 size_t *outcharsleft){
549 rc=string_convert(n,c,outbuff,outcharsleft);
552 rc=backref_convert(n,c,outbuff,outcharsleft);
555 rc=range_convert(n,c,outbuff,outcharsleft);
558 rc=3; /* internal error */
569 yaz_nfa_state *yaz_nfa_get_first(yaz_nfa *n){
572 return n->firststate;
575 yaz_nfa_state *yaz_nfa_get_next(yaz_nfa *n, yaz_nfa_state *s){
585 static void dump_trans(FILE *F, yaz_nfa_transition *t ) {
592 if ( (t->range_start <= ' ') || (t->range_start>'z'))
594 if ( (t->range_end <= ' ') || (t->range_end>'z'))
596 if ((t->range_start==EMPTY_START) && (t->range_end==EMPTY_END)) {
599 fprintf(F, " -> [%d] %s '%c' %x - '%c' %x \n",
605 static void dump_state(FILE *F, yaz_nfa_state *s,
606 char *(*strfunc)(void *) ) {
607 yaz_nfa_transition *t;
608 char *resultstring = "";
611 resultstring = (*strfunc)(s->result);
614 resultstring = s->result;
616 fprintf(F, " state [%d] %s %s",
617 s->num, s->result?"(FINAL)":"", resultstring );
618 if (s->backref_start) {
619 fprintf(F, " start-%d", s->backref_start);
621 if (s->backref_end) {
622 fprintf(F, " end-%d", s->backref_end);
627 fprintf(F, " (no transitions)\n");
632 } while (t != s->lasttrans);
637 void yaz_nfa_dump(FILE *F, yaz_nfa *n,
638 char *(*strfunc)(void *) ) {
640 if (!F) /* lazy programmers can just pass 0 for F */
642 fprintf(F, "The NFA has %d states and %d backrefs\n",
643 n->nstates, n->nbackrefs);
648 dump_state(F, s, strfunc);
649 } while (s != n->laststate);
657 * indent-tabs-mode: nil
659 * vim: shiftwidth=4 tabstop=8 expandtab