From: Adam Dickmeiss Date: Thu, 10 May 2007 11:46:09 +0000 (+0000) Subject: Factor relevance charset normalization out to a separate implementation X-Git-Tag: PAZPAR2.1.0.0~162 X-Git-Url: http://lists.indexdata.com/cgi-bin?a=commitdiff_plain;ds=inline;h=27cfb6d89ca9b02f63f8334b6b8e666cf7db2ff7;p=pazpar2-moved-to-github.git Factor relevance charset normalization out to a separate implementation in charsets.c. --- diff --git a/src/Makefile.am b/src/Makefile.am index 1bcc3cf..fde2a7f 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,4 +1,4 @@ -# $Id: Makefile.am,v 1.19 2007-04-30 13:56:52 marc Exp $ +# $Id: Makefile.am,v 1.20 2007-05-10 11:46:09 adam Exp $ bin_PROGRAMS = pazpar2 check_PROGRAMS = test_config \ @@ -28,6 +28,7 @@ libpazpar2_a_SOURCES = config.c config.h eventl.c eventl.h \ normalize7bit.h normalize7bit.c \ util.c util.h zeerex.c zeerex.h database.c database.h \ settings.h settings.c sel_thread.c sel_thread.h getaddrinfo.c \ + charsets.c charsets.h \ client.c client.h connection.c connection.h host.h parameters.h pazpar2_SOURCES = pazpar2.c diff --git a/src/charsets.c b/src/charsets.c new file mode 100644 index 0000000..3f45baa --- /dev/null +++ b/src/charsets.c @@ -0,0 +1,126 @@ +/* $Id: charsets.c,v 1.1 2007-05-10 11:46:09 adam Exp $ + Copyright (c) 2006-2007, Index Data. + +This file is part of Pazpar2. + +Pazpar2 is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Pazpar2; see the file LICENSE. 
If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. + */ + +/** \file charsets.c + \brief Pazpar2 Character set facilities +*/ + +#if HAVE_CONFIG_H +#include "cconfig.h" +#endif + +#include +#include +#include +#include +#include "charsets.h" + +/* charset handle: selects the tokenizer used by pp2_relevance_token_next() */ +struct pp2_charset_s { + const char *(*token_next_handler)(pp2_relevance_token_t prt); + /* other handlers will come as we see fit */ +}; + +static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt); +/* in the future : */ +// static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt); + +/* tokenizer handle */ +struct pp2_relevance_token_s { + const char *cp; /* unnormalized buffer we're tokenizing */ + pp2_charset_t pct; /* our main charset handle (type+config) */ + WRBUF norm_str; /* normalized string we return (temporarily) */ +}; +/* Allocate a charset handle that uses the a-z tokenizer */ +pp2_charset_t pp2_charset_create(void) +{ + pp2_charset_t pct = xmalloc(sizeof(*pct)); + + pct->token_next_handler = pp2_relevance_token_a_to_z; + return pct; +} +/* Free a handle obtained from pp2_charset_create() */ +void pp2_charset_destroy(pp2_charset_t pct) +{ + xfree(pct); +} +/* Start tokenizing buf; buf is referenced, not copied, so it must outlive prt */ +pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct, + const char *buf) +{ + pp2_relevance_token_t prt = xmalloc(sizeof(*prt)); + + assert(pct); + prt->norm_str = wrbuf_alloc(); + prt->cp = buf; + prt->pct = pct; + return prt; +} +/* Release the tokenizer handle together with its normalization buffer */ +void pp2_relevance_token_destroy(pp2_relevance_token_t prt) +{ + assert(prt); + wrbuf_destroy(prt->norm_str); + xfree(prt); +} +/* Return the next normalized token via the charset's handler, 0 when done */ +const char *pp2_relevance_token_next(pp2_relevance_token_t prt) +{ + assert(prt); + return (prt->pct->token_next_handler)(prt); +} +/* Map 'a'..'z' to 1..26 and everything else to -1 */ +#define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? 
(c) - 'a' + 1 : -1) +/* a-z tokenizer: lowercases the input, skips every byte outside a-z and + emits each letter as 1..26 (the +1 keeps '\0' out of the token). Returns + 0 at end of input; the token lives in prt->norm_str until the next call */ +static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt) +{ + const char *cp = prt->cp; + int c; + + /* skip non a-z bytes; the cast keeps tolower() defined for bytes >= 0x80 */ + while (*cp && (c = raw_char(tolower((unsigned char) *cp))) < 0) + cp++; + if (*cp == '\0') + { + prt->cp = cp; + return 0; + } + /* now read the term itself */ + wrbuf_rewind(prt->norm_str); + while (*cp && (c = raw_char(tolower((unsigned char) *cp))) >= 0) + { + wrbuf_putc(prt->norm_str, c); + cp++; + } + prt->cp = cp; + return wrbuf_cstr(prt->norm_str); +} + + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ diff --git a/src/charsets.h b/src/charsets.h new file mode 100644 index 0000000..d948811 --- /dev/null +++ b/src/charsets.h @@ -0,0 +1,48 @@ +/* $Id: charsets.h,v 1.1 2007-05-10 11:46:09 adam Exp $ + Copyright (c) 2006-2007, Index Data. + +This file is part of Pazpar2. + +Pazpar2 is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Pazpar2; see the file LICENSE. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. 
+ */ + +/** \file charsets.h + \brief Pazpar2 Character set facilities +*/ + +#ifndef PAZPAR_CHARSETS_H +#define PAZPAR_CHARSETS_H + +typedef struct pp2_charset_s *pp2_charset_t; +typedef struct pp2_relevance_token_s *pp2_relevance_token_t; + +pp2_charset_t pp2_charset_create(void); +void pp2_charset_destroy(pp2_charset_t pct); + +pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct, + const char *buf); +void pp2_relevance_token_destroy(pp2_relevance_token_t prt); +const char *pp2_relevance_token_next(pp2_relevance_token_t prt); + +#endif + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ diff --git a/src/client.c b/src/client.c index 10db126..0fd5706 100644 --- a/src/client.c +++ b/src/client.c @@ -1,4 +1,4 @@ -/* $Id: client.c,v 1.2 2007-04-24 07:04:36 adam Exp $ +/* $Id: client.c,v 1.3 2007-05-10 11:46:09 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -632,8 +632,9 @@ int client_parse_query(struct client *cl, const char *query) // Initialize relevance structure with query terms char *p[512]; extract_terms(se->nmem, cn, p); - se->relevance = relevance_create(se->nmem, (const char **) p, - se->expected_maxrecs); + se->relevance = relevance_create(client_get_database(cl)->pct, + se->nmem, (const char **) p, + se->expected_maxrecs); } ccl_rpn_delete(cn); diff --git a/src/logic.c b/src/logic.c index 5a014aa..79189e4 100644 --- a/src/logic.c +++ b/src/logic.c @@ -1,4 +1,4 @@ -/* $Id: logic.c,v 1.26 2007-04-27 12:17:04 marc Exp $ +/* $Id: logic.c,v 1.27 2007-05-10 11:46:09 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. 
@@ -511,6 +511,7 @@ static void session_init_databases_fun(void *context, struct database *db) new->database = db; new->yaz_marc = 0; + new->pct = pp2_charset_create(); new->map = 0; new->settings = nmem_malloc(se->session_nmem, sizeof(struct settings *) * num); @@ -534,6 +535,8 @@ static void session_database_destroy(struct session_database *sdb) xsltFreeStylesheet(m->stylesheet); if (sdb->yaz_marc) yaz_marc_destroy(sdb->yaz_marc); + if (sdb->pct) + pp2_charset_destroy(sdb->pct); } // Initialize session_database list -- this represents this session's view diff --git a/src/pazpar2.h b/src/pazpar2.h index 43f2226..5ba1584 100644 --- a/src/pazpar2.h +++ b/src/pazpar2.h @@ -1,4 +1,4 @@ -/* $Id: pazpar2.h,v 1.35 2007-04-24 08:03:03 adam Exp $ +/* $Id: pazpar2.h,v 1.36 2007-05-10 11:46:09 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -76,6 +76,7 @@ struct database_retrievalmap { // for that session struct session_database { + pp2_charset_t pct; struct database *database; struct setting **settings; yaz_marc_t yaz_marc; diff --git a/src/relevance.c b/src/relevance.c index 9d2f47d..ddf511f 100644 --- a/src/relevance.c +++ b/src/relevance.c @@ -1,4 +1,4 @@ -/* $Id: relevance.c,v 1.12 2007-05-10 09:26:19 adam Exp $ +/* $Id: relevance.c,v 1.13 2007-05-10 11:46:09 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -40,14 +40,15 @@ struct relevance struct word_trie *wt; #else struct word_entry *entries; + pp2_charset_t pct; #endif NMEM nmem; }; +#if USE_TRIE #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1) -#if USE_TRIE // We use this data structure to recognize terms in input records, // and map them to record term vectors for counting. struct word_trie @@ -137,6 +138,36 @@ static struct word_trie *build_word_trie(NMEM nmem, const char **terms) return res; } + +// FIXME. The definition of a word is crude here.. should support +// some form of localization mechanism? 
+void relevance_countwords(struct relevance *r, struct record_cluster *cluster, + const char *words, int multiplier) +{ + while (*words) + { + int c; /* int: raw_char() yields -1, which an unsigned plain char cannot hold */ + int res; + int skipped = 0; + while (*words && (c = raw_char(tolower((unsigned char) *words))) < 0) + words++; + if (!*words) + break; + res = word_trie_match(r->wt, words, &skipped); + if (res) + { + words += skipped; + cluster->term_frequency_vec[res] += multiplier; + } + else + { + while (*words && (c = raw_char(tolower((unsigned char) *words))) >= 0) + words++; + } + cluster->term_frequency_vec[0]++; + } +} + #else struct word_entry { @@ -169,46 +200,51 @@ int word_entry_match(struct word_entry *entries, const char *norm_str) return 0; } -static struct word_entry *build_word_entries(NMEM nmem, +static struct word_entry *build_word_entries(pp2_charset_t pct, NMEM nmem, const char **terms) { int termno = 1; /* >0 signals THERE is an entry */ struct word_entry *entries = 0; const char **p = terms; - WRBUF norm_str = wrbuf_alloc(); for (; *p; p++) { - const char *cp = *p; - for (; *cp; cp++) - { - int c = raw_char(*cp); - if (c >= 0) - wrbuf_putc(norm_str, c); - else - { - if (wrbuf_len(norm_str)) - add_word_entry(nmem, &entries, wrbuf_cstr(norm_str), - termno); - wrbuf_rewind(norm_str); - } - } - if (wrbuf_len(norm_str)) - add_word_entry(nmem, &entries, wrbuf_cstr(norm_str), termno); - wrbuf_rewind(norm_str); + pp2_relevance_token_t prt = pp2_relevance_tokenize(pct, *p); + const char *norm_str; + + while ((norm_str = pp2_relevance_token_next(prt))) + add_word_entry(nmem, &entries, norm_str, termno); + + pp2_relevance_token_destroy(prt); + termno++; } - wrbuf_destroy(norm_str); return entries; } - +void relevance_countwords(struct relevance *r, struct record_cluster *cluster, + const char *words, int multiplier) +{ + pp2_relevance_token_t prt = pp2_relevance_tokenize(r->pct, words); + + const char *norm_str; + + while ((norm_str = pp2_relevance_token_next(prt))) + { + int res = word_entry_match(r->entries, norm_str); + if (res) + 
cluster->term_frequency_vec[res] += multiplier; + cluster->term_frequency_vec[0]++; + } + pp2_relevance_token_destroy(prt); +} #endif -struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs) +struct relevance *relevance_create(pp2_charset_t pct, + NMEM nmem, const char **terms, int numrecs) { struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance)); const char **p; @@ -223,7 +259,8 @@ struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs) #if USE_TRIE res->wt = build_word_trie(nmem, terms); #else - res->entries = build_word_entries(nmem, terms); + res->entries = build_word_entries(pct, nmem, terms); + res->pct = pct; #endif return res; } @@ -238,55 +275,6 @@ void relevance_newrec(struct relevance *r, struct record_cluster *rec) } -// FIXME. The definition of a word is crude here.. should support -// some form of localization mechanism? -void relevance_countwords(struct relevance *r, struct record_cluster *cluster, - const char *words, int multiplier) -{ -#if !USE_TRIE - WRBUF norm_str = wrbuf_alloc(); -#endif - while (*words) - { - char c; - int res; -#if USE_TRIE - int skipped = 0; -#endif - while (*words && (c = raw_char(tolower(*words))) < 0) - words++; - if (!*words) - return; -#if USE_TRIE - res = word_trie_match(r->wt, words, &skipped); - if (res) - { - words += skipped; - cluster->term_frequency_vec[res] += multiplier; - } - else - { - while (*words && (c = raw_char(tolower(*words))) >= 0) - words++; - } -#else - while (*words && (c = raw_char(tolower(*words))) >= 0) - { - wrbuf_putc(norm_str, c); - words++; - } - res = word_entry_match(r->entries, wrbuf_cstr(norm_str)); - if (res) - cluster->term_frequency_vec[res] += multiplier; - wrbuf_rewind(norm_str); -#endif - cluster->term_frequency_vec[0]++; - } -#if !USE_TRIE - wrbuf_destroy(norm_str); -#endif -} - void relevance_donerecord(struct relevance *r, struct record_cluster *cluster) { int i; diff --git a/src/relevance.h b/src/relevance.h index 
231fbc5..b9da669 100644 --- a/src/relevance.h +++ b/src/relevance.h @@ -1,4 +1,4 @@ -/* $Id: relevance.h,v 1.5 2007-04-23 21:05:23 adam Exp $ +/* $Id: relevance.h,v 1.6 2007-05-10 11:46:09 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -23,12 +23,14 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #define RELEVANCE_H #include +#include "charsets.h" struct relevance; struct record_cluster; struct reclist; -struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs); +struct relevance *relevance_create(pp2_charset_t pct, + NMEM nmem, const char **terms, int numrecs); void relevance_newrec(struct relevance *r, struct record_cluster *cluster); void relevance_countwords(struct relevance *r, struct record_cluster *cluster, const char *words, int multiplier);