ICU chain of normalizers and tokenizers half-way implemented
[pazpar2-moved-to-github.git] / src / icu_I18N.h
index 803d89b..2746f07 100644 (file)
@@ -1,4 +1,4 @@
-/* $Id: icu_I18N.h,v 1.7 2007-05-07 12:52:04 marc Exp $
+/* $Id: icu_I18N.h,v 1.12 2007-05-14 13:51:24 marc Exp $
    Copyright (c) 2006-2007, Index Data.
 
    This file is part of Pazpar2.
 //#include <unicode/ucnv.h>     /* C   Converter API    */
 //#include <unicode/ustring.h>  /* some more string fcns*/
 //#include <unicode/uloc.h>
-//#include <unicode/ubrk.h>
+#include <unicode/ubrk.h>
 //#include <unicode/unistr.h>
+#include <unicode/utrans.h>
 
 
+
+// declared structs and functions
+// Takes an ICU status code. NOTE(review): the meaning of the int result is not visible in this header — confirm in the implementation.
 int icu_check_status (UErrorCode status);
 
 struct icu_buf_utf16
@@ -51,6 +55,8 @@ struct icu_buf_utf16
 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity);
 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
                                             size_t capacity);
+struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
+                                          struct icu_buf_utf16 * src16);  // presumably copies src16's content into dest16; which pointer is returned is not visible here — confirm
 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
 
 
@@ -91,6 +97,132 @@ UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
                                    struct icu_buf_utf16 * src16,
                                    UErrorCode * status);
 
+struct icu_tokenizer  // tokenizer state: break iterator, the buffer it refers to, and the current token position
+{
+  char locale[16];  // fixed 16-byte buffer; presumably holds the locale name given at creation — confirm
+  char action;  // single-character selector; its valid values are not visible in this header — TODO confirm
+  UBreakIterator* bi;  // ICU break iterator handle
+  struct icu_buf_utf16 * buf16;  // UTF-16 buffer whose utf16_len bounds the token offsets (see invariant); ownership not visible here
+  int32_t token_count;  // upper bound for token_id (see invariant)
+  int32_t token_id;  // bounded by token_count (see invariant)
+  int32_t token_start;  // lower token offset (see invariant)
+  int32_t token_end;  // upper token offset (see invariant)
+  // Invariant that must hold at all times:
+  //   0 <= token_start
+  //     <= token_end
+  //     <= buf16->utf16_len
+  // together with the invariant:
+  //   0 <= token_id <= token_count
+};
+// Create a tokenizer from a locale name and an action character. ICU failures are presumably reported through *status — confirm.
+struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
+                                            UErrorCode *status);
+// Tear down a tokenizer; presumably the counterpart of icu_tokenizer_create() — confirm what it releases.
+void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer);
+// Presumably makes src16 the text to be tokenized. NOTE(review): meaning of the int result is not visible here — confirm.
+int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, 
+                         struct icu_buf_utf16 * src16, UErrorCode *status);
+// Presumably advances to the next token and stores it in tkn16; what the int32_t result encodes needs confirming in the implementation.
+int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, 
+                                 struct icu_buf_utf16 * tkn16, 
+                                 UErrorCode *status);
+// Accessors; they presumably report the struct icu_tokenizer fields of matching name (token_length likely token_end - token_start) — confirm.
+int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer);
+int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer);
+int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer);
+int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer);
+int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
+
+
+// Normalizer state: an action character, a UTF-16 rules buffer, parse-error storage, and an ICU transliterator handle.
+struct icu_normalizer
+{
+  char action;  // single-character selector; its valid values are not visible in this header — TODO confirm
+  struct icu_buf_utf16 * rules16;  // presumably the rule text given at creation, converted to UTF-16 — confirm
+  UParseError parse_error[256];  // NOTE(review): ICU's utrans_openU() takes a pointer to one UParseError; 256 entries looks oversized — confirm usage before shrinking
+  UTransliterator * trans;  // ICU transliterator handle
+};
+// Create a normalizer from a rules string and an action character. ICU failures are presumably reported through *status — confirm.
+struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
+                                              UErrorCode *status);
+
+// Tear down a normalizer; presumably the counterpart of icu_normalizer_create() — confirm what it releases.
+void icu_normalizer_destroy(struct icu_normalizer * normalizer);
+// Presumably transforms src16 into dest16 using the normalizer. NOTE(review): meaning of the int result is not visible here — confirm.
+int icu_normalizer_normalize(struct icu_normalizer * normalizer,
+                             struct icu_buf_utf16 * dest16,
+                             struct icu_buf_utf16 * src16,
+                             UErrorCode *status);
+
+// Disabled (#if 0) draft of a per-token record. NOTE(review): the closing brace has no ';' — add it before ever enabling this block.
+#if 0
+struct icu_token
+{
+  int32_t token_id;
+  uint8_t * display8;  // naming parallels icu_chain.display8 — presumably the display form
+  uint8_t * norm8;  // naming parallels icu_chain.norm8 — presumably the normalized form
+  uint8_t * sort8;  // naming parallels icu_chain.sort8 — presumably the sort form
+}
+#endif
+// Kind of a chain step; stored in icu_chain_step.type. Enumerators take the implicit values 0..6 in declaration order.
+enum icu_chain_step_type {
+    ICU_chain_step_type_none,      // no step kind (value 0)
+    ICU_chain_step_type_display,   // convert to UTF-8 display format
+    ICU_chain_step_type_norm,      // convert to UTF-8 norm format
+    ICU_chain_step_type_sort,      // convert to UTF-8 sort format
+    ICU_chain_step_type_charmap,   // apply a UTF-16 charmap
+    ICU_chain_step_type_normalize, // apply UTF-16 normalization
+    ICU_chain_step_type_tokenize   // apply UTF-16 tokenization
+};
+
+
+// One node in the singly linked list of steps that hangs off icu_chain.steps.
+struct icu_chain_step
+{
+  // kind of step, followed by the object that carries out the action
+  enum icu_chain_step_type type;
+  union {
+    struct icu_normalizer * normalizer;
+    struct icu_tokenizer * tokenizer;  
+  } u;  // presumably `type` determines which member is valid — confirm
+  // temporary UTF-16 buffer for the result of this step's action
+  struct icu_buf_utf16 * buf16;  
+  struct icu_chain_step * next;  // link to the following step in the list
+};
+
+// A chain: identifier and locale buffers, a running token count, UTF-8 output buffers, a UTF-16 source buffer, and a list of steps.
+struct icu_chain
+{
+  uint8_t identifier[128];  // fixed 128-byte buffer; presumably holds the identifier given to icu_chain_create() — confirm
+  uint8_t locale[16];  // fixed 16-byte buffer; presumably holds the locale given to icu_chain_create() — confirm
+
+  // count of tokens returned so far
+  int32_t token_count;
+
+  // output buffers in UTF-8 (display, norm and sort forms)
+  struct icu_buf_utf8 * display8;
+  struct icu_buf_utf8 * norm8;
+  struct icu_buf_utf8 * sort8;
+
+  // source buffer in UTF-16
+  struct icu_buf_utf16 * src16;
+
+  // head of the linked list of chain steps
+  struct icu_chain_step * steps;
+};
+// Create a chain from an identifier and a locale. Unlike the tokenizer/normalizer constructors, no UErrorCode is passed here.
+struct icu_chain * icu_chain_create(const uint8_t * identifier, 
+                                    const uint8_t * locale);
+// Tear down a chain; presumably the counterpart of icu_chain_create() — confirm whether it also releases the steps and buffers.
+void icu_chain_destroy(struct icu_chain * chain);
+// Presumably adds a step of the given type, configured by `rule`, to the chain's step list; result semantics on failure not visible here — confirm.
+struct icu_chain_step * icu_chain_append_step(struct icu_chain * chain,
+                                              enum icu_chain_step_type type,
+                                              const uint8_t * rule);
+// Tear down a single step; whether it also follows step->next is not visible in this header — confirm.
+void icu_chain_step_destroy(struct icu_chain_step * step);
+
+
 
 #endif // HAVE_ICU
 #endif // ICU_I18NL_H