From 39b063c2816d8d221a0af5b6852af78035df7e4a Mon Sep 17 00:00:00 2001 From: Marc Cromme Date: Wed, 23 May 2007 14:44:18 +0000 Subject: [PATCH] First ICU chain integration in relevance ranking of pazpar2. Tokenization not working correctly, need more debugging. --- src/charsets.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- src/charsets.h | 7 +++-- src/logic.c | 14 ++++++++-- 3 files changed, 91 insertions(+), 9 deletions(-) diff --git a/src/charsets.c b/src/charsets.c index 3f45baa..2eb2b01 100644 --- a/src/charsets.c +++ b/src/charsets.c @@ -1,4 +1,4 @@ -/* $Id: charsets.c,v 1.1 2007-05-10 11:46:09 adam Exp $ +/* $Id: charsets.c,v 1.2 2007-05-23 14:44:18 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -29,19 +29,33 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include #include +#include #include #include + #include "charsets.h" +//#include "config.h" +//#include "parameters.h" + +#ifdef HAVE_ICU +#include "icu_I18N.h" +#endif // HAVE_ICU /* charset handle */ struct pp2_charset_s { const char *(*token_next_handler)(pp2_relevance_token_t prt); /* other handlers will come as we see fit */ +#ifdef HAVE_ICU + struct icu_chain * icu_chn; + UErrorCode icu_sts; +#endif // HAVE_ICU }; static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt); -/* in the future : */ -// static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt); + +#ifdef HAVE_ICU +static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt); +#endif // HAVE_ICU /* tokenzier handle */ struct pp2_relevance_token_s { @@ -50,11 +64,24 @@ struct pp2_relevance_token_s { WRBUF norm_str; /* normized string we return (temporarily) */ }; -pp2_charset_t pp2_charset_create(void) +pp2_charset_t pp2_charset_create(struct icu_chain * icu_chn) { pp2_charset_t pct = xmalloc(sizeof(*pct)); +#ifdef HAVE_ICU + if (icu_chn){ + pct->icu_chn = icu_chn; + pct->icu_sts = U_ZERO_ERROR; + pct->token_next_handler = pp2_relevance_token_icu; + } + else { + pct->icu_chn = 0; + pct->token_next_handler = pp2_relevance_token_a_to_z; + } +#else // HAVE_ICU pct->token_next_handler = pp2_relevance_token_a_to_z; +#endif // HAVE_ICU + return pct; } @@ -69,16 +96,37 @@ pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct, pp2_relevance_token_t prt = xmalloc(sizeof(*prt)); assert(pct); + +#ifdef HAVE_ICU + if (pct->icu_chn){ + pct->icu_sts = U_ZERO_ERROR; + int ok = 0; + ok = icu_chain_assign_cstr(pct->icu_chn, buf, &pct->icu_sts); + printf("\nfield ok: %d '%s'\n", ok, buf); + //prt->cp = buf; + prt->pct = pct; + prt->norm_str = 0; + return prt; + } + else { +#endif // HAVE_ICU + prt->norm_str = wrbuf_alloc(); prt->cp = buf; prt->pct = pct; return prt; + +#ifdef HAVE_ICU + } +#endif // HAVE_ICU } + void pp2_relevance_token_destroy(pp2_relevance_token_t prt) { assert(prt); - wrbuf_destroy(prt->norm_str); + if(prt->norm_str) + wrbuf_destroy(prt->norm_str); xfree(prt); } @@ -117,6 +165,27 @@ static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt) } +#ifdef HAVE_ICU +static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt) +{ + //&& U_SUCCESS(pct->icu_sts)) + if (icu_chain_next_token(prt->pct->icu_chn, &prt->pct->icu_sts)){ + printf("'%s' ", icu_chain_get_norm(prt->pct->icu_chn)); + if (U_FAILURE(prt->pct->icu_sts)) + { + printf("ICU status failure\n "); + return 0; + } + + return icu_chain_get_norm(prt->pct->icu_chn); + } + + return 0; +}; +#endif // HAVE_ICU + + + /* * Local variables: * c-basic-offset: 4 diff --git a/src/charsets.h b/src/charsets.h index d948811..9e350e7 100644 --- a/src/charsets.h +++ b/src/charsets.h @@ -1,4 +1,4 @@ -/* $Id: charsets.h,v 1.1 2007-05-10 11:46:09 adam Exp $ +/* $Id: charsets.h,v 1.2 2007-05-23 14:44:18 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -26,10 +26,13 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #ifndef PAZPAR_CHARSETS_H #define PAZPAR_CHARSETS_H + +struct icu_chain; + typedef struct pp2_charset_s *pp2_charset_t; typedef struct pp2_relevance_token_s *pp2_relevance_token_t; -pp2_charset_t pp2_charset_create(void); +pp2_charset_t pp2_charset_create(struct icu_chain * icu_chn); void pp2_charset_destroy(pp2_charset_t pct); pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct, diff --git a/src/logic.c b/src/logic.c index 2a39b42..261febc 100644 --- a/src/logic.c +++ b/src/logic.c @@ -1,4 +1,4 @@ -/* $Id: logic.c,v 1.30 2007-05-17 22:56:41 jakub Exp $ +/* $Id: logic.c,v 1.31 2007-05-23 14:44:18 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -493,7 +493,17 @@ static void session_init_databases_fun(void *context, struct database *db) new->database = db; new->yaz_marc = 0; - new->pct = pp2_charset_create(); + +#ifdef HAVE_ICU + if (global_parameters.server && global_parameters.server->icu_chn) + new->pct + = pp2_charset_create(global_parameters.server->icu_chn); + else + new->pct = pp2_charset_create(0); +#else // HAVE_ICU + new->pct = pp2_charset_create(0); +#endif // HAVE_ICU + new->map = 0; new->settings = nmem_malloc(se->session_nmem, sizeof(struct settings *) * num); -- 1.7.10.4