1 /* This file is part of Pazpar2.
2 Copyright (C) 2006-2008 Index Data
4 Pazpar2 is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25 #include <libxml/parser.h>
26 #include <libxml/tree.h>
28 #include <unicode/utypes.h> /* Basic ICU data types */
29 #include <unicode/uchar.h> /* char names */
31 //#include <unicode/ustdio.h>
32 #include <unicode/ucol.h>
33 //#include <unicode/ucnv.h> /* C Converter API */
34 //#include <unicode/ustring.h> /* some more string fcns*/
35 //#include <unicode/uloc.h>
36 #include <unicode/ubrk.h>
37 //#include <unicode/unistr.h>
38 #include <unicode/utrans.h>
42 // declared structs and functions
// Examines an ICU status code and reports the outcome as an int.
// NOTE(review): the return convention (non-zero for success or for failure)
// and any logging side effect are not visible in this header - confirm
// against the implementation before relying on it in a condition.
44 int icu_check_status (UErrorCode status);
// Returns a pointer to a struct icu_buf_utf16 for the requested capacity.
// NOTE(review): presumably the result is owned by the caller and released
// with icu_buf_utf16_destroy(); whether capacity counts UTF-16 code units
// or bytes, and what is returned on allocation failure, needs confirming.
53 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity);
54 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
// Takes a destination and a source UTF-16 buffer and returns a UTF-16
// buffer pointer.
// NOTE(review): presumably copies the contents of src16 into dest16
// (growing dest16 if needed) and returns dest16 - confirm, including the
// behaviour when either argument is NULL.
56 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
57 struct icu_buf_utf16 * src16);
// Disposes of a UTF-16 buffer; returns nothing.
// NOTE(review): presumably the counterpart of icu_buf_utf16_create();
// whether a NULL buf16 is tolerated is not visible here - confirm.
58 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
// Returns a pointer to a struct icu_buf_utf8 for the requested capacity.
// NOTE(review): presumably the result is owned by the caller and released
// with icu_buf_utf8_destroy(); the unit of capacity (likely bytes) and the
// failure return value need confirming.
69 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity);
70 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
// Disposes of a UTF-8 buffer; returns nothing.
// NOTE(review): presumably the counterpart of icu_buf_utf8_create();
// whether a NULL buf8 is tolerated is not visible here - confirm.
72 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8);
75 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
76 struct icu_buf_utf8 * src8,
79 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
80 const char * src8cstr,
84 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
85 struct icu_buf_utf16 * src16,
94 struct icu_casemap * icu_casemap_create(const char *locale, char action,
// Disposes of a casemap object; returns nothing.
// NOTE(review): presumably releases what icu_casemap_create() allocated;
// NULL handling is not visible here - confirm.
97 void icu_casemap_destroy(struct icu_casemap * casemap);
99 int icu_casemap_casemap(struct icu_casemap * casemap,
100 struct icu_buf_utf16 * dest16,
101 struct icu_buf_utf16 * src16,
104 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
105 struct icu_buf_utf16 * src16,
106 const char *locale, char action,
// Takes an ICU collator, a UTF-8 destination buffer, a UTF-16 source
// buffer and a pointer to an ICU status code; returns a UErrorCode.
// NOTE(review): presumably writes the collation sort key of src16 into
// dest8; whether the returned code always equals *status, and whether
// dest8 is resized on overflow, must be confirmed in the implementation.
109 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
110 struct icu_buf_utf8 * dest8,
111 struct icu_buf_utf16 * src16,
112 UErrorCode * status);
119 struct icu_buf_utf16 * buf16;
124 // keep always invariant
127 // <= buf16->utf16_len
129 // 0 <= token_id <= token_count
132 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
// Disposes of a tokenizer object; returns nothing.
// NOTE(review): presumably releases what icu_tokenizer_create() allocated;
// NULL handling is not visible here - confirm.
135 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer);
// Takes a tokenizer, a UTF-16 source buffer and a pointer to an ICU status
// code; returns an int.
// NOTE(review): presumably makes src16 the text to be tokenized. Whether
// the tokenizer copies src16 or keeps a reference to it is not visible
// here - confirm before freeing or modifying src16 while it is attached.
// The meaning of the int result also needs confirming.
137 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
138 struct icu_buf_utf16 * src16, UErrorCode *status);
140 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
141 struct icu_buf_utf16 * tkn16,
// Accessors that each take a tokenizer and return an int32_t.
// NOTE(review): presumably these report the state of the most recently
// produced token: its id, start and end offsets, length, and the number of
// tokens seen so far. Offsets are likely in UTF-16 code units relative to
// the attached buffer - confirm, along with the value returned when the
// tokenizer is NULL or no token has been produced yet.
144 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer);
145 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer);
146 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer);
147 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer);
148 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
152 struct icu_normalizer
155 struct icu_buf_utf16 * rules16;
156 UParseError parse_error[256];
157 UTransliterator * trans;
160 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
// Disposes of a normalizer object; returns nothing.
// NOTE(review): presumably releases what icu_normalizer_create()
// allocated, including its rules16 buffer and transliterator; NULL
// handling is not visible here - confirm.
164 void icu_normalizer_destroy(struct icu_normalizer * normalizer);
166 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
167 struct icu_buf_utf16 * dest16,
168 struct icu_buf_utf16 * src16,
183 enum icu_chain_step_type {
184 ICU_chain_step_type_none, //
185 ICU_chain_step_type_display, // convert to utf8 display format
186 ICU_chain_step_type_index, // convert to utf8 index format
187 ICU_chain_step_type_sortkey, // convert to utf8 sortkey format
188 ICU_chain_step_type_casemap, // apply utf16 charmap
189 ICU_chain_step_type_normalize, // apply utf16 normalization
190 ICU_chain_step_type_tokenize // apply utf16 tokenization
195 struct icu_chain_step
197 // type and action object
198 enum icu_chain_step_type type;
200 struct icu_casemap * casemap;
201 struct icu_normalizer * normalizer;
202 struct icu_tokenizer * tokenizer;
204 // temporary post-action utf16 buffer
205 struct icu_buf_utf16 * buf16;
206 struct icu_chain_step * previous;
214 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
215 enum icu_chain_step_type type,
216 const uint8_t * rule,
217 struct icu_buf_utf16 * buf16,
// Disposes of a chain step; returns nothing.
// NOTE(review): presumably releases what icu_chain_step_create()
// allocated. Whether it also destroys the steps linked through
// step->previous is not visible here - confirm to avoid double frees.
221 void icu_chain_step_destroy(struct icu_chain_step * step);
226 uint8_t identifier[128];
229 // number of tokens returned so far
232 // utf8 output buffers
233 struct icu_buf_utf8 * display8;
234 struct icu_buf_utf8 * norm8;
235 struct icu_buf_utf8 * sort8;
237 // utf16 source buffer
238 struct icu_buf_utf16 * src16;
240 // linked list of chain steps
241 struct icu_chain_step * steps;
// Takes an identifier and a locale, both as uint8_t pointers, and returns
// a pointer to a struct icu_chain.
// NOTE(review): both arguments are presumably NUL-terminated strings, and
// the result is presumably released with icu_chain_destroy(); the failure
// return value (likely NULL) needs confirming.
244 struct icu_chain * icu_chain_create(const uint8_t * identifier,
245 const uint8_t * locale);
// Disposes of a chain; returns nothing.
// NOTE(review): presumably also releases the chain's steps and its
// display8/norm8/sort8/src16 buffers; NULL handling is not visible here -
// confirm.
247 void icu_chain_destroy(struct icu_chain * chain);
// Takes a libxml2 node and a pointer to an ICU status code; returns a
// pointer to a struct icu_chain.
// NOTE(review): presumably builds a chain from the XML configuration
// rooted at xml_node and reports ICU failures through *status. The
// expected element/attribute names and the failure return value (likely
// NULL) are not visible here - confirm.
249 struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
250 UErrorCode * status);
253 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
254 enum icu_chain_step_type type,
255 const uint8_t * rule,
259 int icu_chain_step_next_token(struct icu_chain * chain,
260 struct icu_chain_step * step,
263 int icu_chain_assign_cstr(struct icu_chain * chain,
264 const char * src8cstr,
267 int icu_chain_next_token(struct icu_chain * chain,
// Takes a chain and returns an int.
// NOTE(review): presumably the number of tokens returned so far for the
// currently assigned input; the value for a NULL chain is not visible
// here - confirm.
270 int icu_chain_get_token_count(struct icu_chain * chain);
// Accessors that each take a chain and return a const char pointer.
// NOTE(review): presumably these expose the chain's display8, norm8 and
// sort8 UTF-8 output buffers for the current token. If so, the returned
// pointers are owned by the chain and are only valid until the chain
// advances, is reassigned, or is destroyed - confirm before storing them.
272 const char * icu_chain_get_display(struct icu_chain * chain);
274 const char * icu_chain_get_norm(struct icu_chain * chain);
276 const char * icu_chain_get_sort(struct icu_chain * chain);
282 #endif // ICU_I18NL_H