From: Adam Dickmeiss Date: Thu, 28 Sep 2006 18:38:41 +0000 (+0000) Subject: Fixed bug #685: Optimize xelm/melm matching. Indexing the Koha collection X-Git-Tag: ZEBRA.1.3.40~5 X-Git-Url: http://lists.indexdata.com/cgi-bin?a=commitdiff_plain;h=20278c99b7f9faf69e97608a3fc2ba06815d18fa;p=idzebra-moved-to-github.git Fixed bug #685: Optimize xelm/melm matching. Indexing the Koha collection is reduced from 6073 sec to 2578 sec. --- diff --git a/data1/d1_absyn.c b/data1/d1_absyn.c index f19edbf..3edbb6a 100644 --- a/data1/d1_absyn.c +++ b/data1/d1_absyn.c @@ -1,4 +1,4 @@ -/* $Id: d1_absyn.c,v 1.9.2.8 2006-08-14 10:38:51 adam Exp $ +/* $Id: d1_absyn.c,v 1.9.2.9 2006-09-28 18:38:41 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -733,10 +733,11 @@ data1_absyn *data1_read_absyn (data1_handle dh, const char *file, int i; char *p, *xpath_expr, *termlists; - const char *regexp; - struct DFA *dfa = dfa = dfa_init(); + const char *regexp = 0; + struct DFA *dfa = 0; data1_termlist **tp; char melm_xpath[128]; + data1_xpelement *xp_old = 0; if (argc < 3) { @@ -752,14 +753,24 @@ data1_absyn *data1_read_absyn (data1_handle dh, const char *file, xpath_expr = argv[1]; } termlists = argv[2]; - regexp = mk_xpath_regexp(dh, xpath_expr); - i = dfa_parse (dfa, ®exp); - if (i || *regexp) { - yaz_log(LOG_WARN, "%s:%d: Bad xpath to xelm", file, lineno); - dfa_delete (&dfa); - continue; - } - + regexp = mk_xpath_regexp(dh, xpath_expr); +#if OPTIMIZE_MELM + for (xp_old = res->xp_elements; xp_old; xp_old = xp_old->next) + if (!strcmp(xp_old->regexp, regexp)) + break; +#endif + if (!xp_old) + { + const char *regexp_ptr = regexp; + dfa = dfa_init(); + + i = dfa_parse (dfa, ®exp_ptr); + if (i || *regexp_ptr) { + yaz_log(YLOG_WARN, "%s:%d: Bad xpath to xelm", file, lineno); + dfa_delete (&dfa); + continue; + } + } if (!cur_xpelement) { cur_xpelement = (data1_xpelement *) @@ -770,13 +781,15 @@ data1_absyn *data1_read_absyn (data1_handle dh, const char *file, nmem_malloc(data1_nmem_get(dh), sizeof(*cur_xpelement)); cur_xpelement = cur_xpelement->next; } +#if OPTIMIZE_MELM + cur_xpelement->regexp = regexp; +#endif cur_xpelement->next = NULL; cur_xpelement->xpath_expr = nmem_strdup(data1_nmem_get (dh), xpath_expr); - - dfa_mkstate (dfa); + if (dfa) + dfa_mkstate (dfa); cur_xpelement->dfa = dfa; - #ifdef ENHANCED_XELM cur_xpelement->xpath_len = zebra_parse_xpath_str(xpath_expr, diff --git a/include/data1.h b/include/data1.h index d1dac78..ff1cc1c 100644 --- a/include/data1.h +++ b/include/data1.h @@ -1,4 +1,4 @@ -/* $Id: data1.h,v 1.9.2.2 2006-08-14 10:38:55 adam Exp $ +/* $Id: data1.h,v 1.9.2.3 2006-09-28 18:38:42 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -24,6 +24,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #define DATA1_H #define ENHANCED_XELM 1 +#define OPTIMIZE_MELM 1 #include @@ -203,6 +204,10 @@ typedef struct data1_xpelement struct DFA *dfa; data1_termlist *termlists; struct data1_xpelement *next; +#if OPTIMIZE_MELM + const char *regexp; +#endif + int match_state; } data1_xpelement; typedef struct data1_xattr { @@ -212,9 +217,6 @@ typedef struct data1_xattr { unsigned short what; /* DATA1I_text, .. see data1_node.u.data */ } data1_xattr; -#if 0 -typedef struct data1_absyn data1_absyn; -#else typedef struct data1_absyn { char *name; @@ -232,7 +234,7 @@ typedef struct data1_absyn char *encoding; int enable_xpath_indexing; } data1_absyn; -#endif + /* * record data node (tag/data/variant) */ diff --git a/recctrl/recgrs.c b/recctrl/recgrs.c index 030432c..b36b206 100644 --- a/recctrl/recgrs.c +++ b/recctrl/recgrs.c @@ -1,4 +1,4 @@ -/* $Id: recgrs.c,v 1.86.2.10 2006-08-14 10:39:16 adam Exp $ +/* $Id: recgrs.c,v 1.86.2.11 2006-09-28 18:38:42 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003 Index Data Aps @@ -438,28 +438,42 @@ pop, 2003-01-17 data1_termlist *xpath_termlist_by_tagpath(char *tagpath, data1_node *n) { data1_absyn *abs = n->root->u.root.absyn; - data1_xpelement *xpe = abs->xp_elements; + data1_xpelement *xpe = 0; data1_node *nn; #ifdef ENHANCED_XELM struct xpath_location_step *xp; - #endif char *pexpr = xmalloc(strlen(tagpath)+5); - int ok = 0; sprintf (pexpr, "/%s\n", tagpath); #if 0 yaz_log(LOG_DEBUG, "Checking tagpath %s", tagpath); #endif - while (xpe) + + for (xpe = abs->xp_elements; xpe; xpe = xpe->next) + xpe->match_state = -1; /* don't know if it matches yet */ + + for (xpe = abs->xp_elements; xpe; xpe = xpe->next) { int i; - ok = dfa_match_first(xpe->dfa->states, pexpr); - if (ok) - yaz_log(LOG_DEBUG, " xpath got match %s",xpe->xpath_expr); - else - yaz_log(LOG_DEBUG, " xpath no match %s",xpe->xpath_expr); + int ok = xpe->match_state; + if (ok == -1) + { /* don't know whether there is a match yet */ + data1_xpelement *xpe1; + + assert(xpe->dfa); + ok = dfa_match_first(xpe->dfa->states, pexpr); +#if OPTIMIZE_MELM + /* mark this and following ones with same regexp */ + for (xpe1 = xpe; xpe1; xpe1 = xpe1->next) + { + if (!strcmp(xpe1->regexp, xpe->regexp)) + xpe1->match_state = ok; + } +#endif + } + assert (ok == 0 || ok == 1); if (ok) { #ifdef ENHANCED_XELM /* we have to check the perdicates up to the root node */ @@ -492,13 +506,12 @@ data1_termlist *xpath_termlist_by_tagpath(char *tagpath, data1_node *n) break; } } - xpe = xpe->next; } xfree(pexpr); - if (ok) { - yaz_log(LOG_DEBUG,"Got it"); + if (xpe) { + yaz_log(LOG_DEBUG,"Got it"); return xpe->termlists; } else { return NULL;