-/* $Id: icu_I18N.h,v 1.6 2007-05-07 12:18:34 marc Exp $
- Copyright (c) 2006-2007, Index Data.
-
-This file is part of Pazpar2.
+/* This file is part of Pazpar2.
+ Copyright (C) 2006-2008 Index Data
Pazpar2 is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
for more details.
You should have received a copy of the GNU General Public License
-along with Pazpar2; see the file LICENSE. If not, write to the
-Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
-02111-1307, USA.
- */
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*/
#ifndef ICU_I18NL_H
#define ICU_I18NL_H
-#ifdef HAVE_ICU
-
#include <yaz/nmem.h>
+#include <libxml/parser.h>
+#include <libxml/tree.h>
#include <unicode/utypes.h> /* Basic ICU data types */
#include <unicode/uchar.h> /* char names */
//#include <unicode/ucnv.h> /* C Converter API */
//#include <unicode/ustring.h> /* some more string fcns*/
//#include <unicode/uloc.h>
-//#include <unicode/ubrk.h>
+#include <unicode/ubrk.h>
//#include <unicode/unistr.h>
+#include <unicode/utrans.h>
+
+// declared structs and functions
+
int icu_check_status (UErrorCode status);
struct icu_buf_utf16
struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity);
struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
size_t capacity);
+struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
+ struct icu_buf_utf16 * src16);
void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
struct icu_buf_utf16 * src16,
UErrorCode * status);
+struct icu_casemap
+{
+ char locale[16];
+ char action;
+};
+
+struct icu_casemap * icu_casemap_create(const char *locale, char action,
+ UErrorCode *status);
+
+void icu_casemap_destroy(struct icu_casemap * casemap);
+
+int icu_casemap_casemap(struct icu_casemap * casemap,
+ struct icu_buf_utf16 * dest16,
+ struct icu_buf_utf16 * src16,
+ UErrorCode *status);
+
+int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
+ struct icu_buf_utf16 * src16,
+ const char *locale, char action,
+ UErrorCode *status);
UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
struct icu_buf_utf8 * dest8,
struct icu_buf_utf16 * src16,
UErrorCode * status);
+struct icu_tokenizer
+{
+ char locale[16];
+ char action;
+ UBreakIterator* bi;
+ struct icu_buf_utf16 * buf16;
+ int32_t token_count;
+ int32_t token_id;
+ int32_t token_start;
+ int32_t token_end;
+ // keep always invariant
+ // 0 <= token_start
+ // <= token_end
+ // <= buf16->utf16_len
+ // and invariant
+ // 0 <= token_id <= token_count
+};
+struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
+ UErrorCode *status);
+void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer);
+int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
+ struct icu_buf_utf16 * src16, UErrorCode *status);
+int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
+ struct icu_buf_utf16 * tkn16,
+ UErrorCode *status);
+int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer);
+int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer);
+int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer);
+int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer);
+int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
-// CRAP to Follow here ...
+struct icu_normalizer
+{
+ char action;
+ struct icu_buf_utf16 * rules16;
+ UParseError parse_error[256];
+ UTransliterator * trans;
+};
+
+struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
+ UErrorCode *status);
+
+
+void icu_normalizer_destroy(struct icu_normalizer * normalizer);
+
+int icu_normalizer_normalize(struct icu_normalizer * normalizer,
+ struct icu_buf_utf16 * dest16,
+ struct icu_buf_utf16 * src16,
+ UErrorCode *status);
+
#if 0
-struct icu_termmap
+struct icu_token
+{
+ int32_t token_id;
+ uint8_t * display8;
+ uint8_t * norm8;
+ uint8_t * sort8;
+}
+#endif
+
+
+enum icu_chain_step_type {
+ ICU_chain_step_type_none, //
+ ICU_chain_step_type_display, // convert to utf8 display format
+ ICU_chain_step_type_index, // convert to utf8 index format
+ ICU_chain_step_type_sortkey, // convert to utf8 sortkey format
+ ICU_chain_step_type_casemap, // apply utf16 charmap
+ ICU_chain_step_type_normalize, // apply utf16 normalization
+ ICU_chain_step_type_tokenize // apply utf16 tokenization
+};
+
+
+
+struct icu_chain_step
{
- char * sort_key; // standard C string '\0' terminated
- char * norm_term; // standard C utf-8 string
- char * disp_term; // standard C utf-8 string
+ // type and action object
+ enum icu_chain_step_type type;
+ union {
+ struct icu_casemap * casemap;
+ struct icu_normalizer * normalizer;
+ struct icu_tokenizer * tokenizer;
+ } u;
+ // temprary post-action utf16 buffer
+ struct icu_buf_utf16 * buf16;
+ struct icu_chain_step * previous;
+ int more_tokens;
+ int need_new_token;
};
-struct icu_termmap * icu_termmap_create(NMEM nmem);
-int icu_termmap_cmp(const void *vp1, const void *vp2);
+struct icu_chain;
+
+struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
+ enum icu_chain_step_type type,
+ const uint8_t * rule,
+ struct icu_buf_utf16 * buf16,
+ UErrorCode *status);
+
+
+void icu_chain_step_destroy(struct icu_chain_step * step);
+
+
+struct icu_chain
+{
+ uint8_t identifier[128];
+ uint8_t locale[16];
+
+ // number of tokens returned so far
+ int32_t token_count;
+
+ // utf8 output buffers
+ struct icu_buf_utf8 * display8;
+ struct icu_buf_utf8 * norm8;
+ struct icu_buf_utf8 * sort8;
+
+ // utf16 source buffer
+ struct icu_buf_utf16 * src16;
+
+ // linked list of chain steps
+ struct icu_chain_step * steps;
+};
+
+struct icu_chain * icu_chain_create(const uint8_t * identifier,
+ const uint8_t * locale);
+
+void icu_chain_destroy(struct icu_chain * chain);
+
+struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
+ UErrorCode * status);
+
+
+struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
+ enum icu_chain_step_type type,
+ const uint8_t * rule,
+ UErrorCode *status);
+
+
+int icu_chain_step_next_token(struct icu_chain * chain,
+ struct icu_chain_step * step,
+ UErrorCode *status);
+
+int icu_chain_assign_cstr(struct icu_chain * chain,
+ const char * src8cstr,
+ UErrorCode *status);
+
+int icu_chain_next_token(struct icu_chain * chain,
+ UErrorCode *status);
+
+int icu_chain_get_token_count(struct icu_chain * chain);
+
+const char * icu_chain_get_display(struct icu_chain * chain);
+
+const char * icu_chain_get_norm(struct icu_chain * chain);
+
+const char * icu_chain_get_sort(struct icu_chain * chain);
-char * icu_casemap(NMEM nmem, char *buf, size_t buf_cap,
- size_t *dest8_len, const char *src8,
- const char *locale, char action);
-char * icu_sortmap(NMEM nmem, char *buf, size_t buf_cap,
- size_t *dest8_len, const char *src8,
- const char *locale);
-#endif // 0
-#endif // HAVE_ICU
#endif // ICU_I18NL_H