From: Adam Dickmeiss Date: Tue, 24 Nov 2009 11:19:35 +0000 (+0100) Subject: Metadata 'skiparticle' works for ICU normalization X-Git-Tag: v1.2.3~1 X-Git-Url: http://lists.indexdata.com/cgi-bin?a=commitdiff_plain;h=572de728d257c3d2e6b11d0f60713c81b004c406;p=pazpar2-moved-to-github.git Metadata 'skiparticle' works for ICU normalization The skiparticle handling was only working for the 7-bit ASCII normalization code before. --- diff --git a/src/charsets.c b/src/charsets.c index 9332aa0..f71f4c8 100644 --- a/src/charsets.c +++ b/src/charsets.c @@ -30,6 +30,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include #include #include +#include #include "charsets.h" #include "normalize7bit.h" @@ -42,7 +43,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA /* charset handle */ struct pp2_charset_s { const char *(*token_next_handler)(pp2_relevance_token_t prt); - const char *(*get_sort_handler)(pp2_relevance_token_t prt, int skip); + const char *(*get_sort_handler)(pp2_relevance_token_t prt); int ref_count; #if YAZ_HAVE_ICU struct icu_chain * icu_chn; @@ -51,11 +52,11 @@ struct pp2_charset_s { }; static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt); -static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt, int skip_article); +static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt); #if YAZ_HAVE_ICU static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt); -static const char *pp2_get_sort_icu(pp2_relevance_token_t prt, int skip_article); +static const char *pp2_get_sort_icu(pp2_relevance_token_t prt); #endif /* tokenzier handle */ @@ -142,12 +143,30 @@ void pp2_charset_destroy(pp2_charset_t pct) } pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct, - const char *buf) + const char *buf, + int skip_article) { pp2_relevance_token_t prt = xmalloc(sizeof(*prt)); assert(pct); + if (skip_article) + { + const char *p = buf; + char firstword[64]; + char *pout = 
firstword; + char articles[] = "the den der die des an a "; // must end in space + + while (*p && !isalnum(*(unsigned char *)p)) + p++; + for (; *p && *p != ' ' && pout - firstword < (sizeof(firstword)-2); p++) + *pout++ = tolower(*(unsigned char *)p); + *pout++ = ' '; + *pout++ = '\0'; + if (strstr(articles, firstword)) + buf = p; + } + prt->norm_str = wrbuf_alloc(); prt->sort_str = wrbuf_alloc(); prt->cp = buf; @@ -184,9 +203,9 @@ const char *pp2_relevance_token_next(pp2_relevance_token_t prt) return (prt->pct->token_next_handler)(prt); } -const char *pp2_get_sort(pp2_relevance_token_t prt, int skip) +const char *pp2_get_sort(pp2_relevance_token_t prt) { - return prt->pct->get_sort_handler(prt, skip); + return prt->pct->get_sort_handler(prt); } #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) : -1) @@ -220,8 +239,7 @@ static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt) return wrbuf_cstr(prt->norm_str); } -static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt, - int skip_article) +static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt) { if (prt->last_cp == 0) return 0; @@ -229,7 +247,7 @@ static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt, { char *tmp = xstrdup(prt->last_cp); char *result = 0; - result = normalize7bit_mergekey(tmp, skip_article); + result = normalize7bit_mergekey(tmp); wrbuf_rewind(prt->sort_str); wrbuf_puts(prt->sort_str, result); @@ -253,8 +271,7 @@ static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt) return 0; } -static const char *pp2_get_sort_icu(pp2_relevance_token_t prt, - int skip_article) +static const char *pp2_get_sort_icu(pp2_relevance_token_t prt) { return icu_chain_token_sortkey(prt->pct->icu_chn); } diff --git a/src/charsets.h b/src/charsets.h index cb41404..8d1efba 100644 --- a/src/charsets.h +++ b/src/charsets.h @@ -38,10 +38,11 @@ void pp2_charset_destroy(pp2_charset_t pct); void pp2_charset_incref(pp2_charset_t pct); pp2_relevance_token_t 
pp2_relevance_tokenize(pp2_charset_t pct, - const char *buf); + const char *buf, + int skip_article); void pp2_relevance_token_destroy(pp2_relevance_token_t prt); const char *pp2_relevance_token_next(pp2_relevance_token_t prt); -const char *pp2_get_sort(pp2_relevance_token_t prt, int skip_article); +const char *pp2_get_sort(pp2_relevance_token_t prt); #if 0 typedef int pp2_charset_normalize_t(pp2_charset_t pct, diff --git a/src/logic.c b/src/logic.c index 3afa67c..c07d303 100644 --- a/src/logic.c +++ b/src/logic.c @@ -895,7 +895,7 @@ static int get_mergekey_from_doc(xmlDoc *doc, xmlNode *root, const char *name, pp2_relevance_token_t prt = pp2_relevance_tokenize( service->mergekey_pct, - (const char *) value); + (const char *) value, 0); wrbuf_puts(norm_wr, name); wrbuf_puts(norm_wr, "="); @@ -935,7 +935,7 @@ static const char *get_mergekey(xmlDoc *doc, struct client *cl, int record_no, pp2_relevance_token_t prt = pp2_relevance_tokenize( service->mergekey_pct, - (const char *) mergekey); + (const char *) mergekey, 0); while ((norm_str = pp2_relevance_token_next(prt))) { @@ -1197,11 +1197,11 @@ struct record *ingest_record(struct client *cl, const char *rec, prt = pp2_relevance_tokenize( service->sort_pct, - rec_md->data.text.disp); + rec_md->data.text.disp, skip_article); pp2_relevance_token_next(prt); - sort_str = pp2_get_sort(prt, skip_article); + sort_str = pp2_get_sort(prt); cluster->sortkeys[sk_field_id]->text.disp = rec_md->data.text.disp; diff --git a/src/normalize7bit.c b/src/normalize7bit.c index 1f993c9..cfa59bb 100644 --- a/src/normalize7bit.c +++ b/src/normalize7bit.c @@ -44,29 +44,9 @@ char * normalize7bit_generic(char * str, const char * rm_chars) return p; } - - -char * normalize7bit_mergekey(char *buf, int skiparticle) +char *normalize7bit_mergekey(char *buf) { char *p = buf, *pout = buf; - - if (skiparticle) - { - char firstword[64]; - char articles[] = "the den der die des an a "; // must end in space - - while (*p && !isalnum(*(unsigned char *)p)) 
- p++; - pout = firstword; - while (*p && *p != ' ' && pout - firstword < 62) - *(pout++) = tolower(*(unsigned char *)(p++)); - *(pout++) = ' '; - *(pout++) = '\0'; - if (!strstr(articles, firstword)) - p = buf; - pout = buf; - } - while (*p) { while (*p && !isalnum(*(unsigned char *)p)) diff --git a/src/normalize7bit.h b/src/normalize7bit.h index 4cc51af..72aac93 100644 --- a/src/normalize7bit.h +++ b/src/normalize7bit.h @@ -20,7 +20,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #ifndef NORMALIZE7BIT_H #define NORMALIZE7BIT_H -char *normalize7bit_mergekey(char *buf, int skiparticle); +char *normalize7bit_mergekey(char *buf); char * normalize7bit_generic(char * str, const char * rm_chars); int extract7bit_dates(const char *buf, int *first, int *last, int longdate); diff --git a/src/relevance.c b/src/relevance.c index c88676e..0234d91 100644 --- a/src/relevance.c +++ b/src/relevance.c @@ -77,7 +77,7 @@ static struct word_entry *build_word_entries(pp2_charset_t pct, NMEM nmem, for (; *p; p++) { - pp2_relevance_token_t prt = pp2_relevance_tokenize(pct, *p); + pp2_relevance_token_t prt = pp2_relevance_tokenize(pct, *p, 0); const char *norm_str; while ((norm_str = pp2_relevance_token_next(prt))) @@ -93,7 +93,7 @@ static struct word_entry *build_word_entries(pp2_charset_t pct, NMEM nmem, void relevance_countwords(struct relevance *r, struct record_cluster *cluster, const char *words, int multiplier, const char *name) { - pp2_relevance_token_t prt = pp2_relevance_tokenize(r->pct, words); + pp2_relevance_token_t prt = pp2_relevance_tokenize(r->pct, words, 0); int *mult = cluster->term_frequency_vec_tmp; const char *norm_str; int i, length = 0; diff --git a/test/test_icu_4.res b/test/test_icu_4.res index 899610d..84feb43 100644 --- a/test/test_icu_4.res +++ b/test/test_icu_4.res @@ -38,6 +38,21 @@ +Reconstruction tomography in diagnostic radiology and nuclear medicine +proceedings of the workshop +1977 +Includes bibliographical references and 
index +Reconstruction tomography in diagnostic radiology and nuclear medicine +proceedings of the workshop +1977 +Includes bibliographical references and index +XXXXXXXXXX +test-usersetting-2 data: + YYYYYYYYY +title reconstruction tomography in diagnostic radiology and nuclear medicine author medium book + + + The Puget Sound Region a portfolio of thematic computer maps 1974 @@ -56,32 +71,16 @@ -The Computer Bible -1973-1980 -Vols. 2, 8: Missoula, Mont. : Published by Scholars Press for Biblical Research Associates -The Computer Bible -1973-1980 -Hebrew and Greek; introductions in English -Vols. 2, 8: Missoula, Mont. : Published by Scholars Press for Biblical Research Associates -XXXXXXXXXX -test-usersetting-2 data: - YYYYYYYYY -title the computer bible author medium book - - - -Reconstruction tomography in diagnostic radiology and nuclear medicine -proceedings of the workshop -1977 -Includes bibliographical references and index -Reconstruction tomography in diagnostic radiology and nuclear medicine -proceedings of the workshop -1977 -Includes bibliographical references and index +A plan for community college computer development +1971 +Cover title +A plan for community college computer development +1971 +Cover title XXXXXXXXXX test-usersetting-2 data: YYYYYYYYY -title reconstruction tomography in diagnostic radiology and nuclear medicine author medium book +title a plan for community college computer development author medium book @@ -131,15 +130,16 @@ -A plan for community college computer development -1971 -Cover title -A plan for community college computer development -1971 -Cover title +The Computer Bible +1973-1980 +Vols. 2, 8: Missoula, Mont. : Published by Scholars Press for Biblical Research Associates +The Computer Bible +1973-1980 +Hebrew and Greek; introductions in English +Vols. 2, 8: Missoula, Mont. 
: Published by Scholars Press for Biblical Research Associates XXXXXXXXXX test-usersetting-2 data: YYYYYYYYY -title a plan for community college computer development author medium book +title the computer bible author medium book diff --git a/test/test_icu_5.res b/test/test_icu_5.res index c4bc05d..407ac56 100644 --- a/test/test_icu_5.res +++ b/test/test_icu_5.res @@ -8,16 +8,17 @@ 9 -A plan for community college computer development -1971 -Cover title -A plan for community college computer development -1971 -Cover title +The Computer Bible +1973-1980 +Vols. 2, 8: Missoula, Mont. : Published by Scholars Press for Biblical Research Associates +The Computer Bible +1973-1980 +Hebrew and Greek; introductions in English +Vols. 2, 8: Missoula, Mont. : Published by Scholars Press for Biblical Research Associates XXXXXXXXXX test-usersetting-2 data: YYYYYYYYY -title a plan for community college computer development author medium book +title the computer bible author medium book @@ -67,32 +68,16 @@ -Reconstruction tomography in diagnostic radiology and nuclear medicine -proceedings of the workshop -1977 -Includes bibliographical references and index -Reconstruction tomography in diagnostic radiology and nuclear medicine -proceedings of the workshop -1977 -Includes bibliographical references and index -XXXXXXXXXX -test-usersetting-2 data: - YYYYYYYYY -title reconstruction tomography in diagnostic radiology and nuclear medicine author medium book - - - -The Computer Bible -1973-1980 -Vols. 2, 8: Missoula, Mont. : Published by Scholars Press for Biblical Research Associates -The Computer Bible -1973-1980 -Hebrew and Greek; introductions in English -Vols. 2, 8: Missoula, Mont. 
: Published by Scholars Press for Biblical Research Associates +A plan for community college computer development +1971 +Cover title +A plan for community college computer development +1971 +Cover title XXXXXXXXXX test-usersetting-2 data: YYYYYYYYY -title the computer bible author medium book +title a plan for community college computer development author medium book @@ -114,6 +99,21 @@ +Reconstruction tomography in diagnostic radiology and nuclear medicine +proceedings of the workshop +1977 +Includes bibliographical references and index +Reconstruction tomography in diagnostic radiology and nuclear medicine +proceedings of the workshop +1977 +Includes bibliographical references and index +XXXXXXXXXX +test-usersetting-2 data: + YYYYYYYYY +title reconstruction tomography in diagnostic radiology and nuclear medicine author medium book + + + The use of passwords for controlled access to computer resources 1977 Wood, Helen M