1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) 1995-2009 Index Data
3 * See the file LICENSE for details.
16 #include <yaz/xmalloc.h>
18 #include <yaz/icu_I18N.h>
26 #include <unicode/ustring.h> /* some more string fcns*/
27 #include <unicode/uchar.h> /* char names */
/* Kinds of processing step that can be linked into an ICU chain. */
29 enum icu_chain_step_type {
/* first enumerator, hence value 0; the step constructors test `!type`,
   so this value is treated the same as a missing argument */
30 ICU_chain_step_type_none,
31 ICU_chain_step_type_display, /* convert to utf8 display format */
32 ICU_chain_step_type_casemap, /* apply utf16 casemap */
33 ICU_chain_step_type_transform, /* apply utf16 transform */
34 ICU_chain_step_type_tokenize, /* apply utf16 tokenization */
35 ICU_chain_step_type_transliterate /* apply utf16 transliteration rules */
40 /* type and action object */
41 enum icu_chain_step_type type;
/* worker object for the step; the member matching `type` is the one in
   use (accessed elsewhere as step->u.casemap, step->u.transform and
   step->u.tokenizer) */
43 struct icu_casemap * casemap;
44 struct icu_transform * transform;
45 struct icu_tokenizer * tokenizer;
47 /* temporary post-action utf16 buffer */
48 struct icu_buf_utf16 * buf16;
/* the step that was at the head of the chain when this one was inserted;
   its buf16 is read as this step's input */
49 struct icu_chain_step * previous;
/* UTF-8 input string handed to icu_chain_assign_cstr(); only the pointer
   is stored, the text is not copied */
59 const char * src8cstr;
63 /* number of tokens returned so far */
66 /* utf8 output buffers */
/* display8: filled by a display step; norm8: filled from the head step's
   buf16 for every token; sort8: receives the collation sort key */
67 struct icu_buf_utf8 * display8;
68 struct icu_buf_utf8 * norm8;
69 struct icu_buf_utf8 * sort8;
71 /* utf16 source buffer */
72 struct icu_buf_utf16 * src16;
74 /* linked list of chain steps */
/* NOTE(review): presumably points at the most recently inserted step,
   with earlier steps reachable through ->previous - confirm */
75 struct icu_chain_step * steps;
/* Inspects an ICU status code. When U_FAILURE(status) holds, a warning is
   logged containing the numeric code and its u_errorName() text.
   NOTE(review): confirm the return convention (which value means
   "status is OK") against the public header. */
78 int icu_check_status(UErrorCode status)
80 if (U_FAILURE(status))
82 yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status));
/* Allocates a chain step of the given type and creates the worker object
   that the type needs, driven by `rule`:
     casemap       - icu_casemap_create() with the first byte of rule
     transform     - icu_transform_create() with rule used as the ID
     tokenize      - icu_tokenizer_create() with the chain locale and the
                     first byte of rule
     transliterate - icu_transform_create() with a dummy "custom" ID and
                     rule used as the rule text
   `buf16` is the utf16 buffer chosen for the step by the caller.
   ICU failures are reported through `status`. */
88 static struct icu_chain_step *icu_chain_step_create(
89 struct icu_chain * chain, enum icu_chain_step_type type,
90 const uint8_t * rule, struct icu_buf_utf16 * buf16,
93 struct icu_chain_step * step = 0;
/* all three arguments are mandatory; ICU_chain_step_type_none (0) is
   caught by the !type test */
95 if(!chain || !type || !rule)
98 step = (struct icu_chain_step *) xmalloc(sizeof(struct icu_chain_step));
104 /* create auxiliary objects */
107 case ICU_chain_step_type_display:
109 case ICU_chain_step_type_casemap:
/* only the first character of the rule selects the case mapping */
110 step->u.casemap = icu_casemap_create(rule[0], status);
112 case ICU_chain_step_type_transform:
113 /* rule omitted. Only ID used */
114 step->u.transform = icu_transform_create((const char *) rule, 'f',
117 case ICU_chain_step_type_tokenize:
118 step->u.tokenizer = icu_tokenizer_create((char *) chain->locale,
119 (char) rule[0], status);
121 case ICU_chain_step_type_transliterate:
122 /* we pass a dummy ID to utrans_openU.. */
123 step->u.transform = icu_transform_create("custom", 'f',
124 (const char *) rule, status);
/* Tears down a chain step. The function first recurses into
   step->previous, so the whole list behind `step` is handled by a single
   call on the head. For casemap, transform/transliterate and tokenize
   steps the worker object and the step's utf16 buffer are destroyed. */
133 static void icu_chain_step_destroy(struct icu_chain_step * step)
/* earlier steps are destroyed before this one */
138 icu_chain_step_destroy(step->previous);
142 case ICU_chain_step_type_display:
144 case ICU_chain_step_type_casemap:
145 icu_casemap_destroy(step->u.casemap);
146 icu_buf_utf16_destroy(step->buf16);
/* transform and transliterate both store an icu_transform in u.transform,
   so they share one cleanup path */
148 case ICU_chain_step_type_transform:
149 case ICU_chain_step_type_transliterate:
150 icu_transform_destroy(step->u.transform);
151 icu_buf_utf16_destroy(step->buf16);
153 case ICU_chain_step_type_tokenize:
154 icu_tokenizer_destroy(step->u.tokenizer);
155 icu_buf_utf16_destroy(step->buf16);
/* Creates an ICU chain for `locale`. The locale string is duplicated, a
   collator is opened for it with ucol_open(), the token counter is reset
   and the three UTF-8 output buffers plus the UTF-16 source buffer are
   created. `*status` is reset to U_ZERO_ERROR on entry and afterwards
   carries the result of ucol_open(). The result is released with
   icu_chain_destroy(). */
163 struct icu_chain *icu_chain_create(const char *locale, int sort,
166 struct icu_chain * chain
167 = (struct icu_chain *) xmalloc(sizeof(struct icu_chain));
169 *status = U_ZERO_ERROR;
/* the chain keeps its own copy of the locale; freed in icu_chain_destroy() */
171 chain->locale = xstrdup(locale);
175 chain->coll = ucol_open((const char *) chain->locale, status);
/* NOTE(review): confirm that the failure path releases chain and
   chain->locale before bailing out */
177 if (U_FAILURE(*status))
180 chain->token_count = 0;
184 chain->display8 = icu_buf_utf8_create(0);
185 chain->norm8 = icu_buf_utf8_create(0);
186 chain->sort8 = icu_buf_utf8_create(0);
188 chain->src16 = icu_buf_utf16_create(0);
/* Releases everything owned by a chain: the collator, the UTF-8 output
   buffers, the UTF-16 source buffer, the list of steps and the duplicated
   locale string. */
195 void icu_chain_destroy(struct icu_chain * chain)
200 ucol_close(chain->coll);
202 icu_buf_utf8_destroy(chain->display8);
203 icu_buf_utf8_destroy(chain->norm8);
204 icu_buf_utf8_destroy(chain->sort8);
206 icu_buf_utf16_destroy(chain->src16);
/* one call on the head is enough: the step destructor recurses through
   ->previous */
208 icu_chain_step_destroy(chain->steps);
209 xfree(chain->locale);
/* Forward declaration: icu_chain_xml_config() below calls this function
   before its definition appears. */
214 static struct icu_chain_step *icu_chain_insert_step(
215 struct icu_chain * chain, enum icu_chain_step_type type,
216 const uint8_t * rule, UErrorCode *status);
/* Builds an ICU chain from an XML configuration element. The element's
   "locale" attribute selects the locale of the chain; each child element
   adds one step, chosen by the element name, with the child's "rule"
   attribute as the step rule:
     casemap, transform, transliterate, tokenize - step of the same name
     display   - display step (uses an empty rule)
     normalize - deprecated alias for transform (a warning is logged)
     index, sortkey - no longer needed (a warning is logged)
   Any other element name is logged as unknown and the chain is destroyed.
   The chain is also destroyed when a step was created but `*status`
   reports a failure. `*status` is reset to U_ZERO_ERROR on entry.
   NOTE(review): xmlGetProp() returns a newly allocated string; verify that
   xml_locale and xml_rule are released with xmlFree() on every path,
   including the paths that destroy the chain. */
218 struct icu_chain * icu_chain_xml_config(const xmlNode *xml_node,
223 struct icu_chain * chain = 0;
225 *status = U_ZERO_ERROR;
/* only element nodes can carry a chain configuration */
227 if (!xml_node ||xml_node->type != XML_ELEMENT_NODE)
231 xmlChar * xml_locale = xmlGetProp((xmlNode *) xml_node,
232 (xmlChar *) "locale");
236 chain = icu_chain_create((const char *) xml_locale, sort, status);
244 for (node = xml_node->children; node; node = node->next)
247 struct icu_chain_step * step = 0;
/* non-element children (text, comments, ...) are not step definitions */
249 if (node->type != XML_ELEMENT_NODE)
/* xmlGetProp() yields NULL when the attribute is absent; a NULL rule is
   rejected by the !rule check in icu_chain_insert_step() */
252 xml_rule = xmlGetProp(node, (xmlChar *) "rule");
254 if (!strcmp((const char *) node->name, "casemap"))
255 step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
256 (const uint8_t *) xml_rule, status);
257 else if (!strcmp((const char *) node->name, "transform"))
258 step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
259 (const uint8_t *) xml_rule, status);
260 else if (!strcmp((const char *) node->name, "transliterate"))
261 step = icu_chain_insert_step(chain, ICU_chain_step_type_transliterate,
262 (const uint8_t *) xml_rule, status);
263 else if (!strcmp((const char *) node->name, "tokenize"))
264 step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
265 (const uint8_t *) xml_rule, status);
266 else if (!strcmp((const char *) node->name, "display"))
/* display takes no rule; an empty string satisfies the non-NULL check */
267 step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
268 (const uint8_t *) "", status);
269 else if (!strcmp((const char *) node->name, "normalize"))
271 yaz_log(YLOG_WARN, "Element %s is deprecated. "
272 "Use transform instead", node->name);
273 step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
274 (const uint8_t *) xml_rule, status);
276 else if (!strcmp((const char *) node->name, "index")
277 || !strcmp((const char *) node->name, "sortkey"))
279 yaz_log(YLOG_WARN, "Element %s is no longer needed. "
280 "Remove it from the configuration", node->name);
284 yaz_log(YLOG_WARN, "Unknown element %s", node->name);
285 icu_chain_destroy(chain);
/* a step that was created with a failing ICU status invalidates the
   whole chain */
289 if (step && U_FAILURE(*status))
291 icu_chain_destroy(chain);
/* Creates a step of the given type and links it in front of the chain's
   current step list. The new step's input buffer is the previous head
   step's buf16 when there is one, otherwise the chain's src16. Casemap,
   transform/transliterate and tokenize steps get a freshly created utf16
   output buffer. `chain`, `type` and `rule` must all be non-zero. */
298 static struct icu_chain_step *icu_chain_insert_step(
299 struct icu_chain * chain, enum icu_chain_step_type type,
300 const uint8_t * rule, UErrorCode *status)
302 struct icu_chain_step * step = 0;
303 struct icu_buf_utf16 * src16 = 0;
304 struct icu_buf_utf16 * buf16 = 0;
306 if (!chain || !type || !rule)
309 /* assign utf16 src buffers as needed */
/* prefer the output of the current head step; fall back to the chain's
   own source buffer for the first step */
310 if (chain->steps && chain->steps->buf16)
311 src16 = chain->steps->buf16;
312 else if (chain->src16)
313 src16 = chain->src16;
317 /* create utf16 destination buffers as needed, or */
320 case ICU_chain_step_type_display:
323 case ICU_chain_step_type_casemap:
324 buf16 = icu_buf_utf16_create(0);
326 case ICU_chain_step_type_transform:
327 case ICU_chain_step_type_transliterate:
328 buf16 = icu_buf_utf16_create(0);
330 case ICU_chain_step_type_tokenize:
331 buf16 = icu_buf_utf16_create(0);
337 /* create actual chain step with this buffer */
338 step = icu_chain_step_create(chain, type, rule, buf16, status);
/* the former head becomes the predecessor of the new step */
340 step->previous = chain->steps;
/* Produces the next token for `step`. Input is pulled from
   step->previous (recursively) when there is one, otherwise from
   chain->src16. The step's own action is then applied:
     display                 - convert src16 to UTF-8 into chain->display8
     casemap                 - case-map src16 into step->buf16
     transform/transliterate - transform src16 into step->buf16
     tokenize                - split src16 into successive tokens in
                               step->buf16
   step->more_tokens and step->need_new_token carry the iteration state
   between calls; both are set to 1 by icu_chain_assign_cstr(). */
346 static int icu_chain_step_next_token(struct icu_chain * chain,
347 struct icu_chain_step * step,
350 struct icu_buf_utf16 * src16 = 0;
351 int got_new_token = 0;
/* nothing to do without a chain, a source buffer, a step, or once the
   step has been marked exhausted */
353 if (!chain || !chain->src16 || !step || !step->more_tokens)
356 /* assign utf16 src buffers as needed, advance in previous steps
357 tokens until non-zero token met, and setting stop condition */
361 src16 = step->previous->buf16;
362 /* tokens might be killed in previous steps, therefore looping */
364 while (step->need_new_token
365 && step->previous->more_tokens
368 = icu_chain_step_next_token(chain, step->previous, status);
371 { /* first step can only work once on chain->src16 input buffer */
372 src16 = chain->src16;
373 step->more_tokens = 0;
380 /* stop if nothing to process */
381 if (step->need_new_token && !got_new_token)
383 step->more_tokens = 0;
387 /* either an old token not finished yet, or a new token, thus
388 perform the work, eventually put this steps output in
389 step->buf16 or the chains UTF8 output buffers */
393 case ICU_chain_step_type_display:
394 icu_utf16_to_utf8(chain->display8, src16, status);
396 case ICU_chain_step_type_casemap:
397 icu_casemap_casemap(step->u.casemap,
398 step->buf16, src16, status,
401 case ICU_chain_step_type_transform:
402 case ICU_chain_step_type_transliterate:
403 icu_transform_trans(step->u.transform,
404 step->buf16, src16, status);
406 case ICU_chain_step_type_tokenize:
407 /* attach to new src16 token only first time during splitting */
408 if (step->need_new_token)
410 icu_tokenizer_attach(step->u.tokenizer, src16, status);
411 step->need_new_token = 0;
414 /* splitting one src16 token into multiple buf16 tokens */
416 = icu_tokenizer_next_token(step->u.tokenizer,
417 step->buf16, status);
419 /* make sure to get new previous token if this one had been used up
420 by recursive call to _same_ step */
422 if (!step->more_tokens)
424 step->more_tokens = icu_chain_step_next_token(chain, step, status);
425 return step->more_tokens; /* avoid one token count too much! */
433 if (U_FAILURE(*status))
436 /* if token disappeared into thin air, tell caller */
437 /* if (!step->buf16->utf16_len && !step->more_tokens) */
/* Attaches a new UTF-8 input string to the chain and rewinds the
   iteration state: the token counter is cleared and each visited step is
   marked as having more tokens and needing a new input token. The input
   is converted to UTF-16 into chain->src16 only when the chain has steps
   or sorting is enabled. The chain stores the `src8cstr` pointer itself,
   not a copy of the text. */
443 int icu_chain_assign_cstr(struct icu_chain * chain, const char * src8cstr,
446 struct icu_chain_step * stp = 0;
448 if (!chain || !src8cstr)
/* pointer is kept as-is; icu_chain_token_norm() can hand it back later */
451 chain->src8cstr = src8cstr;
455 /* clear token count */
456 chain->token_count = 0;
458 /* clear all steps stop states */
461 stp->more_tokens = 1;
462 stp->need_new_token = 1;
466 /* finally convert UTF8 to UTF16 string if needed */
467 if (chain->steps || chain->sort)
468 icu_utf16_from_utf8_cstr(chain->src16, chain->src8cstr, status);
470 if (U_FAILURE(*status))
/* Advances the chain to its next token. `*status` is reset to
   U_ZERO_ERROR first. In the usual case the head step is driven with
   icu_chain_step_next_token() until it yields a token or reports no more
   tokens; on success the token counter is incremented, the token is
   converted to UTF-8 into chain->norm8, the sort key is written to
   chain->sort8, and the updated token count is returned. */
476 int icu_chain_next_token(struct icu_chain * chain, UErrorCode *status)
480 *status = U_ZERO_ERROR;
485 /* special case with no steps - same as index type binary */
/* a non-zero count means the single token of this case was already
   delivered */
488 if (chain->token_count)
492 chain->token_count++;
/* NOTE(review): this branch is described as the "no steps" case, yet the
   call below reads chain->steps->buf16. If chain->steps can be NULL here
   this is a NULL dereference; chain->src16, which
   icu_chain_assign_cstr() fills when sorting is enabled, may be the
   intended source - confirm. */
495 icu_sortkey8_from_utf16(chain->coll,
496 chain->sort8, chain->steps->buf16,
498 return chain->token_count;
501 /* usual case, one or more icu chain steps existing */
/* steps may swallow tokens, so keep pulling until one comes through or
   the head step is exhausted */
504 while (!got_token && chain->steps && chain->steps->more_tokens)
505 got_token = icu_chain_step_next_token(chain, chain->steps, status);
509 chain->token_count++;
511 icu_utf16_to_utf8(chain->norm8, chain->steps->buf16, status);
514 icu_sortkey8_from_utf16(chain->coll,
515 chain->sort8, chain->steps->buf16,
517 return chain->token_count;
/* Returns the chain's token counter, i.e. the number of tokens delivered
   since the last icu_chain_assign_cstr(). */
524 int icu_chain_token_number(struct icu_chain * chain)
529 return chain->token_count;
/* Returns the contents of the chain's display8 buffer as a C string; the
   buffer is filled by a display step. */
532 const char * icu_chain_token_display(struct icu_chain * chain)
535 return icu_buf_utf8_to_cstr(chain->display8);
/* Returns the normalized form of the current token. There are two return
   paths: the original input string (chain->src8cstr) or the contents of
   the norm8 buffer.
   NOTE(review): presumably the input string is returned when the chain
   has no steps - confirm the selecting condition. */
540 const char * icu_chain_token_norm(struct icu_chain * chain)
543 return chain->src8cstr;
546 return icu_buf_utf8_to_cstr(chain->norm8);
/* Returns the contents of the chain's sort8 buffer as a C string; the
   buffer is filled with the collation sort key in icu_chain_next_token(). */
551 const char * icu_chain_token_sortkey(struct icu_chain * chain)
554 return icu_buf_utf8_to_cstr(chain->sort8);
559 #endif /* YAZ_HAVE_ICU */
564 * c-file-style: "Stroustrup"
565 * indent-tabs-mode: nil
567 * vim: shiftwidth=4 tabstop=8 expandtab