1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) 1995-2009 Index Data
3 * See the file LICENSE for details.
17 #include <yaz/timing.h>
21 #include <yaz/xmalloc.h>
23 #include <yaz/icu_I18N.h>
31 #include <unicode/ustring.h> /* some more string fcns*/
32 #include <unicode/uchar.h> /* char names */
35 #include <unicode/ucol.h>
/* Inspects an ICU status code: when U_FAILURE(status) holds, a warning
   containing the numeric code and its u_errorName() text is logged. */
38 int icu_check_status (UErrorCode status)
40 if (U_FAILURE(status))
42 yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status));
/* Allocates a new icu_buf_utf16 with xmalloc. The visible lines allocate
   storage for `capacity` UChars, store a 0 in the first slot and record
   the capacity in utf16_cap. */
51 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
53 struct icu_buf_utf16 * buf16
54 = (struct icu_buf_utf16 *) xmalloc(sizeof(struct icu_buf_utf16));
/* NOTE(review): writing utf16[0] needs capacity >= 1; any guard for a
   zero capacity is not visible in this view - confirm. */
61 buf16->utf16 = (UChar *) xmalloc(sizeof(UChar) * capacity);
62 buf16->utf16[0] = (UChar) 0;
63 buf16->utf16_cap = capacity;
/* Empties a UTF-16 buffer; the visible line stores a 0 UChar in the
   first slot of the buffer's storage. */
68 struct icu_buf_utf16 * icu_buf_utf16_clear(struct icu_buf_utf16 * buf16)
72 buf16->utf16[0] = (UChar) 0;
/* Changes the storage size of a UTF-16 buffer to `capacity` UChars.
   Storage is freshly xmalloc'ed when the buffer has none, otherwise it is
   xrealloc'ed. */
78 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
85 if (0 == buf16->utf16)
86 buf16->utf16 = (UChar *) xmalloc(sizeof(UChar) * capacity);
89 = (UChar *) xrealloc(buf16->utf16, sizeof(UChar) * capacity);
/* the buffer is cleared after (re)allocation, so callers must refill it */
91 icu_buf_utf16_clear(buf16);
92 buf16->utf16_cap = capacity;
/* Copies the content of src16 into dest16. dest16 is first resized to
   twice the source length when its capacity is smaller than that length;
   then utf16_len UChars are copied with u_strncpy and the destination
   length is set to the source length. */
105 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
106 struct icu_buf_utf16 * src16)
112 if (dest16->utf16_cap < src16->utf16_len)
113 icu_buf_utf16_resize(dest16, src16->utf16_len * 2);
115 u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len);
116 dest16->utf16_len = src16->utf16_len;
122 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16)
/* Allocates a new icu_buf_utf8 with xmalloc. The visible lines allocate
   `capacity` bytes of storage, store a 0 in the first byte and record the
   capacity in utf8_cap. */
131 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
133 struct icu_buf_utf8 * buf8
134 = (struct icu_buf_utf8 *) xmalloc(sizeof(struct icu_buf_utf8));
/* NOTE(review): writing utf8[0] needs capacity >= 1; any guard for a
   zero capacity is not visible in this view - confirm. */
141 buf8->utf8 = (uint8_t *) xmalloc(sizeof(uint8_t) * capacity);
142 buf8->utf8[0] = (uint8_t) 0;
143 buf8->utf8_cap = capacity;
/* Empties a UTF-8 buffer; the visible line stores a 0 byte in the first
   position of the buffer's storage. */
149 struct icu_buf_utf8 * icu_buf_utf8_clear(struct icu_buf_utf8 * buf8)
153 buf8->utf8[0] = (uint8_t) 0;
/* Changes the storage size of a UTF-8 buffer to `capacity` bytes, using
   either xmalloc or xrealloc on the existing storage, and records the new
   capacity in utf8_cap. */
160 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
168 buf8->utf8 = (uint8_t *) xmalloc(sizeof(uint8_t) * capacity);
171 = (uint8_t *) xrealloc(buf8->utf8, sizeof(uint8_t) * capacity);
173 buf8->utf8_cap = capacity;
/* Returns the bytes held in src8 as a NUL-terminated C string. A NULL
   buffer or a zero length is handled by a separate early branch. */
186 const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8)
188 if (!src8 || src8->utf8_len == 0)
/* a completely full buffer has no room for the terminator: grow it first */
191 if (src8->utf8_len == src8->utf8_cap)
192 src8 = icu_buf_utf8_resize(src8, src8->utf8_len * 2 + 1);
194 src8->utf8[src8->utf8_len] = '\0';
/* the returned pointer refers to the buffer's own storage */
196 return (const char *) src8->utf8;
200 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8)
/* Converts the UTF-8 bytes in src8 to UTF-16 in dest16 using
   u_strFromUTF8. If ICU reports U_BUFFER_OVERFLOW_ERROR, dest16 is
   resized to twice utf16_len, the status is reset and the conversion is
   attempted once more. The length is stored in dest16 only on success
   and when it fits the capacity; otherwise dest16 is cleared. */
209 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
210 struct icu_buf_utf8 * src8,
213 int32_t utf16_len = 0;
215 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
217 (const char *) src8->utf8, src8->utf8_len, status);
219 /* check for buffer overflow, resize and retry */
220 if (*status == U_BUFFER_OVERFLOW_ERROR)
222 icu_buf_utf16_resize(dest16, utf16_len * 2);
223 *status = U_ZERO_ERROR;
224 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
226 (const char *) src8->utf8, src8->utf8_len, status);
229 if (U_SUCCESS(*status)
230 && utf16_len <= dest16->utf16_cap)
231 dest16->utf16_len = utf16_len;
233 icu_buf_utf16_clear(dest16);
/* Converts the NUL-terminated UTF-8 string src8cstr to UTF-16 in dest16.
   *status is reset to U_ZERO_ERROR first. On U_BUFFER_OVERFLOW_ERROR the
   destination is resized to twice utf16_len and the conversion repeated
   once. The length is stored only on success and when it fits the
   capacity; otherwise dest16 is cleared. */
240 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
241 const char * src8cstr,
244 size_t src8cstr_len = 0;
245 int32_t utf16_len = 0;
247 *status = U_ZERO_ERROR;
/* strlen() is applied unconditionally: src8cstr must not be NULL */
248 src8cstr_len = strlen(src8cstr);
250 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
252 src8cstr, src8cstr_len, status);
254 /* check for buffer overflow, resize and retry */
255 if (*status == U_BUFFER_OVERFLOW_ERROR)
257 icu_buf_utf16_resize(dest16, utf16_len * 2);
258 *status = U_ZERO_ERROR;
259 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
261 src8cstr, src8cstr_len, status);
264 if (U_SUCCESS(*status)
265 && utf16_len <= dest16->utf16_cap)
266 dest16->utf16_len = utf16_len;
268 icu_buf_utf16_clear(dest16);
/* Converts the UTF-16 content of src16 to UTF-8 in dest8 using
   u_strToUTF8. On U_BUFFER_OVERFLOW_ERROR dest8 is resized to twice
   utf8_len, the status is reset and the conversion repeated once. The
   length is stored only on success and when it fits the capacity;
   otherwise dest8 is cleared. */
276 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
277 struct icu_buf_utf16 * src16,
280 int32_t utf8_len = 0;
282 u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
284 src16->utf16, src16->utf16_len, status);
286 /* check for buffer overflow, resize and retry */
287 if (*status == U_BUFFER_OVERFLOW_ERROR)
289 icu_buf_utf8_resize(dest8, utf8_len * 2);
290 *status = U_ZERO_ERROR;
291 u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
293 src16->utf16, src16->utf16_len, status);
297 if (U_SUCCESS(*status)
298 && utf8_len <= dest8->utf8_cap)
299 dest8->utf8_len = utf8_len;
301 icu_buf_utf8_clear(dest8);
/* Allocates an icu_casemap with xmalloc and stores the requested action
   character in it. The action is then examined in a switch; one branch
   of that switch destroys the freshly created object again. */
308 struct icu_casemap * icu_casemap_create(char action, UErrorCode *status)
310 struct icu_casemap * casemap
311 = (struct icu_casemap *) xmalloc(sizeof(struct icu_casemap));
312 casemap->action = action;
314 switch(casemap->action) {
/* NOTE(review): presumably the rejection path for unsupported actions;
   the case labels are not visible in this view - confirm. */
325 icu_casemap_destroy(casemap);
332 void icu_casemap_destroy(struct icu_casemap * casemap)
/* Applies the case mapping configured in `casemap` to src16, writing
   into dest16. The work is delegated to icu_utf16_casemap() using the
   action stored in the casemap object; its result is returned. */
338 int icu_casemap_casemap(struct icu_casemap * casemap,
339 struct icu_buf_utf16 * dest16,
340 struct icu_buf_utf16 * src16,
347 return icu_utf16_casemap(dest16, src16, locale,
348 casemap->action, status);
/* Case-maps src16 into dest16 according to `action`, using one of
   u_strToLower, u_strToUpper, u_strToTitle or u_strFoldCase (the latter
   with U_FOLD_CASE_DEFAULT). A further branch returns
   U_UNSUPPORTED_ERROR.

   An empty source empties the destination. If ICU reports a buffer
   overflow and the conversion is not in-place, dest16 is resized to
   twice the length ICU returned and the same call is repeated. The
   resulting length is stored only on success and when it fits the
   capacity; otherwise dest16 is emptied. */
352 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
353 struct icu_buf_utf16 * src16,
354 const char *locale, char action,
357 int32_t dest16_len = 0;
360 if (!src16->utf16_len){ /* guarding for empty source string */
362 dest16->utf16[0] = (UChar) 0;
363 dest16->utf16_len = 0;
/* first attempt: ICU returns the length the full result needs */
371 dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
372 src16->utf16, src16->utf16_len,
377 dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
378 src16->utf16, src16->utf16_len,
383 dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
384 src16->utf16, src16->utf16_len,
389 dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
390 src16->utf16, src16->utf16_len,
391 U_FOLD_CASE_DEFAULT, status);
395 return U_UNSUPPORTED_ERROR;
399 /* check for buffer overflow, resize and retry */
400 if (*status == U_BUFFER_OVERFLOW_ERROR
401 && dest16 != src16 /* do not resize if in-place conversion */
403 icu_buf_utf16_resize(dest16, dest16_len * 2);
404 *status = U_ZERO_ERROR;
/* second attempt with the enlarged destination */
410 dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
411 src16->utf16, src16->utf16_len,
416 dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
417 src16->utf16, src16->utf16_len,
422 dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
423 src16->utf16, src16->utf16_len,
428 dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
429 src16->utf16, src16->utf16_len,
430 U_FOLD_CASE_DEFAULT, status);
434 return U_UNSUPPORTED_ERROR;
439 if (U_SUCCESS(*status)
440 && dest16_len <= dest16->utf16_cap)
441 dest16->utf16_len = dest16_len;
444 dest16->utf16[0] = (UChar) 0;
445 dest16->utf16_len = 0;
/* Computes the collation sort key of src16 with ucol_getSortKey() and
   stores it in dest8. When the key needs more room than dest8 offers,
   dest8 is resized to twice the required length and the key is computed
   again. The key length is stored in dest8->utf8_len on the success
   path; otherwise dest8 is cleared. */
453 void icu_sortkey8_from_utf16(UCollator *coll,
454 struct icu_buf_utf8 * dest8,
455 struct icu_buf_utf16 * src16,
459 int32_t sortkey_len = 0;
461 sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
462 dest8->utf8, dest8->utf8_cap);
464 /* check for buffer overflow, resize and retry */
465 if (sortkey_len > dest8->utf8_cap) {
466 icu_buf_utf8_resize(dest8, sortkey_len * 2);
467 sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
468 dest8->utf8, dest8->utf8_cap);
/* *status is only read in the visible lines, so it carries whatever
   value the caller passed in */
471 if (U_SUCCESS(*status)
473 dest8->utf8_len = sortkey_len;
475 icu_buf_utf8_clear(dest8);
/* Allocates an icu_tokenizer, stores the action character, zeroes its
   text pointer and token bookkeeping, and opens an ICU break iterator
   for `locale`. Depending on the action the iterator kind is UBRK_LINE,
   UBRK_SENTENCE, UBRK_WORD, UBRK_CHARACTER or UBRK_TITLE; a further
   branch sets *status to U_UNSUPPORTED_ERROR. If the status is not a
   success afterwards, the tokenizer is destroyed again. */
480 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
483 struct icu_tokenizer * tokenizer
484 = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
486 tokenizer->action = action;
488 tokenizer->buf16 = 0;
489 tokenizer->token_count = 0;
490 tokenizer->token_id = 0;
491 tokenizer->token_start = 0;
492 tokenizer->token_end = 0;
495 switch(tokenizer->action) {
498 tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
502 tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status);
506 tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status);
510 tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status);
514 tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status);
517 *status = U_UNSUPPORTED_ERROR;
522 /* ICU error stuff is a very funny business */
523 if (U_SUCCESS(*status))
526 /* freeing if failed */
527 icu_tokenizer_destroy(tokenizer);
/* Releases a tokenizer; the visible line closes its ICU break iterator
   with ubrk_close(). */
531 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
535 ubrk_close(tokenizer->bi);
/* Binds the UTF-16 text in src16 to the tokenizer. A NULL tokenizer, a
   missing break iterator or a NULL src16 is rejected up front. Otherwise
   the tokenizer remembers src16 (it keeps the pointer, not a copy),
   resets its token bookkeeping to zero and hands the text to the break
   iterator with ubrk_setText(). */
540 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
541 struct icu_buf_utf16 * src16,
544 if (!tokenizer || !tokenizer->bi || !src16)
548 tokenizer->buf16 = src16;
549 tokenizer->token_count = 0;
550 tokenizer->token_id = 0;
551 tokenizer->token_start = 0;
552 tokenizer->token_end = 0;
554 ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
557 if (U_FAILURE(*status))
/* Extracts the next token from the text attached to the tokenizer.

   The token starts at ubrk_first() on the first call (token_end still 0)
   and at the previous token_end afterwards; it ends at ubrk_next(), with
   UBRK_DONE mapped to the end of the attached text. The tokenizer's
   token_start/token_end and counters are updated, and the token's UChars
   are copied into tkn16, which is resized to twice the token length when
   its capacity is smaller than that length. */
563 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
564 struct icu_buf_utf16 * tkn16,
567 int32_t tkn_start = 0;
572 if (!tokenizer || !tokenizer->bi
573 || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
577 never change tokenizer->buf16 and keep always invariant
578 0 <= tokenizer->token_start
579 <= tokenizer->token_end
580 <= tokenizer->buf16->utf16_len
581 returns length of token
584 if (0 == tokenizer->token_end) /* first call */
585 tkn_start = ubrk_first(tokenizer->bi);
586 else /* successive calls */
587 tkn_start = tokenizer->token_end;
589 /* get next position */
590 tkn_end = ubrk_next(tokenizer->bi);
592 /* repairing invariant at end of ubrk, which is UBRK_DONE = -1 */
593 if (UBRK_DONE == tkn_end)
594 tkn_end = tokenizer->buf16->utf16_len;
596 /* copy out if everything is well */
597 if(U_FAILURE(*status))
600 /* everything OK, now update internal state */
601 tkn_len = tkn_end - tkn_start;
604 tokenizer->token_count++;
605 tokenizer->token_id++;
607 tokenizer->token_id = 0;
609 tokenizer->token_start = tkn_start;
610 tokenizer->token_end = tkn_end;
613 /* copying into token buffer if it exists */
615 if (tkn16->utf16_cap < tkn_len)
616 icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
618 u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
621 tkn16->utf16_len = tkn_len;
/* Returns the tokenizer's stored token_id field. */
628 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer)
630 return tokenizer->token_id;
/* Returns the start offset (token_start) of the current token within the
   attached text. */
633 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer)
635 return tokenizer->token_start;
/* Returns the end offset (token_end) of the current token within the
   attached text. */
638 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer)
640 return tokenizer->token_end;
/* Returns the length of the current token, computed as token_end minus
   token_start. */
643 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer)
645 return (tokenizer->token_end - tokenizer->token_start);
/* Returns the tokenizer's stored token_count field. */
648 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
650 return tokenizer->token_count;
/* Creates an icu_transform wrapping an ICU transliterator.

   Two temporary UTF-16 buffers receive the converted `id` and `rules`
   strings. Depending on the action, the transliterator is opened with
   utrans_openU() in one of two variants; a further branch sets *status
   to U_UNSUPPORTED_ERROR. Parse problems are reported by ICU through
   transform->parse_error. The temporary buffers are released in every
   case, and the transform is destroyed again when the status is not a
   success. */
655 struct icu_transform * icu_transform_create(const char *id, char action,
659 struct icu_buf_utf16 *id16 = icu_buf_utf16_create(0);
660 struct icu_buf_utf16 *rules16 = icu_buf_utf16_create(0);
662 struct icu_transform * transform
663 = (struct icu_transform *) xmalloc(sizeof(struct icu_transform));
665 transform->action = action;
666 transform->trans = 0;
669 icu_utf16_from_utf8_cstr(id16, id, status);
671 icu_utf16_from_utf8_cstr(rules16, rules, status);
673 switch(transform->action)
678 = utrans_openU(id16->utf16,
683 &transform->parse_error, status);
688 = utrans_openU(id16->utf16,
693 &transform->parse_error, status);
696 *status = U_UNSUPPORTED_ERROR;
699 icu_buf_utf16_destroy(rules16);
700 icu_buf_utf16_destroy(id16);
702 if (U_SUCCESS(*status))
705 /* freeing if failed */
706 icu_transform_destroy(transform);
/* Releases a transform; when it owns a transliterator, that is closed
   with utrans_close(). */
711 void icu_transform_destroy(struct icu_transform * transform){
713 if (transform->trans)
714 utrans_close(transform->trans);
721 int icu_transform_trans(struct icu_transform * transform,
722 struct icu_buf_utf16 * dest16,
723 struct icu_buf_utf16 * src16,
726 if (!transform || !transform->trans
731 if (!src16->utf16_len){ /* guarding for empty source string */
732 icu_buf_utf16_clear(dest16);
736 if (!icu_buf_utf16_copy(dest16, src16))
740 utrans_transUChars (transform->trans,
741 dest16->utf16, &(dest16->utf16_len),
743 0, &(src16->utf16_len), status);
745 if (U_FAILURE(*status))
746 icu_buf_utf16_clear(dest16);
748 return dest16->utf16_len;
/* Allocates one processing step of a chain and creates the helper object
   its type needs. A NULL chain, a zero type or a NULL rule is rejected
   before anything is allocated.

   - casemap:       icu_casemap_create() with the first rule character
   - transform:     icu_transform_create() with the rule used as ID
   - tokenize:      icu_tokenizer_create() with the chain's locale and the
                    first rule character
   - transliterate: icu_transform_create() with the dummy ID "custom" and
                    the rule text passed as rules
   - display:       no helper object is created in the visible lines */
754 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
755 enum icu_chain_step_type type,
756 const uint8_t * rule,
757 struct icu_buf_utf16 * buf16,
760 struct icu_chain_step * step = 0;
762 if(!chain || !type || !rule)
765 step = (struct icu_chain_step *) xmalloc(sizeof(struct icu_chain_step));
771 /* create auxiliary objects */
773 case ICU_chain_step_type_display:
775 case ICU_chain_step_type_casemap:
776 step->u.casemap = icu_casemap_create(rule[0], status);
778 case ICU_chain_step_type_transform:
779 /* rule omitted. Only ID used */
780 step->u.transform = icu_transform_create((const char *) rule, 'f',
783 case ICU_chain_step_type_tokenize:
784 step->u.tokenizer = icu_tokenizer_create((char *) chain->locale,
785 (char) rule[0], status);
787 case ICU_chain_step_type_transliterate:
788 /* we pass a dummy ID to utrans_openU.. */
789 step->u.transform = icu_transform_create("custom", 'f',
790 (const char *) rule, status);
/* Destroys a chain step together with all steps linked behind it: the
   function first calls itself on step->previous, then releases the
   type-specific helper object and the step's UTF-16 buffer. No release
   call is visible for the display type. */
800 void icu_chain_step_destroy(struct icu_chain_step * step){
/* recursion depth equals the number of steps linked via `previous` */
805 icu_chain_step_destroy(step->previous);
808 case ICU_chain_step_type_display:
810 case ICU_chain_step_type_casemap:
811 icu_casemap_destroy(step->u.casemap);
812 icu_buf_utf16_destroy(step->buf16);
814 case ICU_chain_step_type_transform:
815 case ICU_chain_step_type_transliterate:
816 icu_transform_destroy(step->u.transform);
817 icu_buf_utf16_destroy(step->buf16);
819 case ICU_chain_step_type_tokenize:
820 icu_tokenizer_destroy(step->u.tokenizer);
821 icu_buf_utf16_destroy(step->buf16);
/* Allocates an icu_chain for `locale`. *status is reset to U_ZERO_ERROR,
   the locale string is duplicated with xstrdup(), and a collator is
   opened for it with ucol_open(). After the status check the token
   counter is zeroed and the three UTF-8 output buffers (display, norm,
   sort) plus the UTF-16 source buffer are created with capacity 0. */
831 struct icu_chain * icu_chain_create(const char *locale, int sort,
834 struct icu_chain * chain
835 = (struct icu_chain *) xmalloc(sizeof(struct icu_chain));
837 *status = U_ZERO_ERROR;
839 chain->locale = xstrdup(locale);
843 chain->coll = ucol_open((const char *) chain->locale, status);
/* NOTE(review): the body of this failure branch is not visible here;
   confirm that it releases `chain` and `chain->locale`, which are
   already allocated at this point. */
845 if (U_FAILURE(*status))
848 chain->token_count = 0;
852 chain->display8 = icu_buf_utf8_create(0);
853 chain->norm8 = icu_buf_utf8_create(0);
854 chain->sort8 = icu_buf_utf8_create(0);
856 chain->src16 = icu_buf_utf16_create(0);
/* Releases everything a chain owns: the collator, the three UTF-8 output
   buffers, the UTF-16 source buffer, the linked list of steps (through
   icu_chain_step_destroy on the newest step) and the duplicated locale
   string. */
864 void icu_chain_destroy(struct icu_chain * chain)
869 ucol_close(chain->coll);
871 icu_buf_utf8_destroy(chain->display8);
872 icu_buf_utf8_destroy(chain->norm8);
873 icu_buf_utf8_destroy(chain->sort8);
875 icu_buf_utf16_destroy(chain->src16);
877 icu_chain_step_destroy(chain->steps);
878 xfree(chain->locale);
/* Builds a chain from an XML configuration element.

   The chain is created with the value of the element's "locale"
   attribute. Every child element then contributes one step, built from
   its "rule" attribute:
     casemap, transform, transliterate, tokenize - step of the same name
     display   - display step with an empty rule
     normalize - deprecated alias, inserted as a transform step
     index, sortkey - accepted but ignored, with a warning
   Any other element name is logged as unknown and the chain is
   destroyed. The chain is also destroyed when inserting a step leaves a
   failure status. Non-element child nodes are skipped by a type check. */
885 struct icu_chain * icu_chain_xml_config(const xmlNode *xml_node,
890 struct icu_chain * chain = 0;
892 *status = U_ZERO_ERROR;
894 if (!xml_node ||xml_node->type != XML_ELEMENT_NODE)
898 xmlChar * xml_locale = xmlGetProp((xmlNode *) xml_node,
899 (xmlChar *) "locale");
903 chain = icu_chain_create((const char *) xml_locale, sort, status);
911 for (node = xml_node->children; node; node = node->next)
914 struct icu_chain_step * step = 0;
916 if (node->type != XML_ELEMENT_NODE)
919 xml_rule = xmlGetProp(node, (xmlChar *) "rule");
921 if (!strcmp((const char *) node->name, "casemap"))
922 step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
923 (const uint8_t *) xml_rule, status);
924 else if (!strcmp((const char *) node->name, "transform"))
925 step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
926 (const uint8_t *) xml_rule, status);
927 else if (!strcmp((const char *) node->name, "transliterate"))
928 step = icu_chain_insert_step(chain, ICU_chain_step_type_transliterate,
929 (const uint8_t *) xml_rule, status);
930 else if (!strcmp((const char *) node->name, "tokenize"))
931 step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
932 (const uint8_t *) xml_rule, status);
933 else if (!strcmp((const char *) node->name, "display"))
934 step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
935 (const uint8_t *) "", status);
936 else if (!strcmp((const char *) node->name, "normalize"))
938 yaz_log(YLOG_WARN, "Element %s is deprecated. "
939 "Use transform instead", node->name);
940 step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
941 (const uint8_t *) xml_rule, status);
943 else if (!strcmp((const char *) node->name, "index")
944 || !strcmp((const char *) node->name, "sortkey"))
946 yaz_log(YLOG_WARN, "Element %s is no longer needed. "
947 "Remove it from the configuration", node->name);
951 yaz_log(YLOG_WARN, "Unknown element %s", node->name);
952 icu_chain_destroy(chain);
956 if (step && U_FAILURE(*status))
958 icu_chain_destroy(chain);
/* Creates a new step of the given type and makes it the newest step of
   the chain. A NULL chain, a zero type or a NULL rule is rejected.

   The step's input buffer is the output buffer of the current newest
   step when there is one, otherwise the chain's own UTF-16 source
   buffer. Casemap, transform, transliterate and tokenize steps each get
   a fresh UTF-16 output buffer; none is created for a display step in
   the visible lines. The new step is linked in front of the existing
   steps via step->previous. */
967 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
968 enum icu_chain_step_type type,
969 const uint8_t * rule,
972 struct icu_chain_step * step = 0;
973 struct icu_buf_utf16 * src16 = 0;
974 struct icu_buf_utf16 * buf16 = 0;
976 if (!chain || !type || !rule)
979 /* assign utf16 src buffers as needed */
980 if (chain->steps && chain->steps->buf16)
981 src16 = chain->steps->buf16;
982 else if (chain->src16)
983 src16 = chain->src16;
988 /* create utf16 destination buffers as needed, or */
991 case ICU_chain_step_type_display:
994 case ICU_chain_step_type_casemap:
995 buf16 = icu_buf_utf16_create(0);
997 case ICU_chain_step_type_transform:
998 case ICU_chain_step_type_transliterate:
999 buf16 = icu_buf_utf16_create(0);
1001 case ICU_chain_step_type_tokenize:
1002 buf16 = icu_buf_utf16_create(0);
1009 /* create actual chain step with this buffer */
1010 step = icu_chain_step_create(chain, type, rule, buf16, status);
/* chain->steps always points at the most recently inserted step */
1012 step->previous = chain->steps;
1013 chain->steps = step;
/* Produces the next token of one chain step, pulling input from the
   steps before it as required.

   A step with a predecessor reads the predecessor's output buffer and,
   while it needs a new token and the predecessor still has some, calls
   itself on the predecessor. The first step reads chain->src16 and marks
   itself as having no further tokens. If a new token was needed but none
   arrived, the step is marked exhausted.

   The step then does its own work: display converts the input to UTF-8
   in chain->display8, casemap and transform/transliterate write into
   step->buf16, and tokenize attaches to the input once and then splits
   it into successive tokens, recursing on the same step when the current
   input is used up. */
1019 int icu_chain_step_next_token(struct icu_chain * chain,
1020 struct icu_chain_step * step,
1023 struct icu_buf_utf16 * src16 = 0;
1024 int got_new_token = 0;
1026 if (!chain || !chain->src16 || !step || !step->more_tokens)
1029 /* assign utf16 src buffers as needed, advance in previous steps
1030 tokens until non-zero token met, and setting stop condition */
1034 src16 = step->previous->buf16;
1035 /* tokens might be killed in previous steps, therefore looping */
1037 while (step->need_new_token
1038 && step->previous->more_tokens
1041 = icu_chain_step_next_token(chain, step->previous, status);
1044 { /* first step can only work once on chain->src16 input buffer */
1045 src16 = chain->src16;
1046 step->more_tokens = 0;
1053 /* stop if nothing to process */
1054 if (step->need_new_token && !got_new_token)
1056 step->more_tokens = 0;
1060 /* either an old token not finished yet, or a new token, thus
1061 perform the work, eventually put this steps output in
1062 step->buf16 or the chains UTF8 output buffers */
1066 case ICU_chain_step_type_display:
1067 icu_utf16_to_utf8(chain->display8, src16, status);
1069 case ICU_chain_step_type_casemap:
1070 icu_casemap_casemap(step->u.casemap,
1071 step->buf16, src16, status,
1074 case ICU_chain_step_type_transform:
1075 case ICU_chain_step_type_transliterate:
1076 icu_transform_trans(step->u.transform,
1077 step->buf16, src16, status);
1079 case ICU_chain_step_type_tokenize:
1080 /* attach to new src16 token only first time during splitting */
1081 if (step->need_new_token)
1083 icu_tokenizer_attach(step->u.tokenizer, src16, status);
1084 step->need_new_token = 0;
1087 /* splitting one src16 token into multiple buf16 tokens */
1089 = icu_tokenizer_next_token(step->u.tokenizer,
1090 step->buf16, status);
1092 /* make sure to get new previous token if this one had been used up
1093 by recursive call to _same_ step */
1095 if (!step->more_tokens)
1097 step->more_tokens = icu_chain_step_next_token(chain, step, status);
1098 return step->more_tokens; /* avoid one token count too much! */
1106 if (U_FAILURE(*status))
1109 /* if token disappeared into thin air, tell caller */
1110 /* if (!step->buf16->utf16_len && !step->more_tokens) */
/* Feeds a new UTF-8 input string into the chain. A NULL chain or string
   is rejected. The chain keeps the caller's pointer in src8cstr (no copy
   is made in the visible lines), resets its token counter, and walks the
   steps via `previous`, setting more_tokens and need_new_token to 1 on
   each. The input is converted to UTF-16 in chain->src16 only when the
   chain has steps or sorting is enabled. */
1117 int icu_chain_assign_cstr(struct icu_chain * chain,
1118 const char * src8cstr,
1121 struct icu_chain_step * stp = 0;
1123 if (!chain || !src8cstr)
1126 chain->src8cstr = src8cstr;
1130 /* clear token count */
1131 chain->token_count = 0;
1133 /* clear all steps stop states */
1136 stp->more_tokens = 1;
1137 stp->need_new_token = 1;
1138 stp = stp->previous;
1141 /* finally convert UTF8 to UTF16 string if needed */
1142 if (chain->steps || chain->sort)
1143 icu_utf16_from_utf8_cstr(chain->src16, chain->src8cstr, status);
1145 if (U_FAILURE(*status))
1153 int icu_chain_next_token(struct icu_chain * chain,
1158 *status = U_ZERO_ERROR;
1163 /* special case with no steps - same as index type binary */
1166 if (chain->token_count)
1170 chain->token_count++;
1173 icu_sortkey8_from_utf16(chain->coll,
1174 chain->sort8, chain->steps->buf16,
1176 return chain->token_count;
1179 /* usual case, one or more icu chain steps existing */
1182 while(!got_token && chain->steps && chain->steps->more_tokens)
1183 got_token = icu_chain_step_next_token(chain, chain->steps, status);
1187 chain->token_count++;
1189 icu_utf16_to_utf8(chain->norm8, chain->steps->buf16, status);
1192 icu_sortkey8_from_utf16(chain->coll,
1193 chain->sort8, chain->steps->buf16,
1196 return chain->token_count;
/* Returns the chain's current token counter (token_count). */
1203 int icu_chain_token_number(struct icu_chain * chain)
1208 return chain->token_count;
/* Returns the display form of the current token as a C string, taken
   from chain->display8 when that buffer exists. */
1212 const char * icu_chain_token_display(struct icu_chain * chain)
1214 if (chain->display8)
1215 return icu_buf_utf8_to_cstr(chain->display8);
/* Returns the normalized form of the current token as a C string: either
   the original input string (chain->src8cstr) or the content of
   chain->norm8. The conditions selecting between the two are not visible
   in this view. */
1220 const char * icu_chain_token_norm(struct icu_chain * chain)
1223 return chain->src8cstr;
1226 return icu_buf_utf8_to_cstr(chain->norm8);
/* Returns the sort key of the current token, i.e. the content of
   chain->sort8 passed through icu_buf_utf8_to_cstr(). */
1231 const char * icu_chain_token_sortkey(struct icu_chain * chain)
1234 return icu_buf_utf8_to_cstr(chain->sort8);
1239 const UCollator * icu_chain_get_coll(struct icu_chain * chain)
1244 #endif /* YAZ_HAVE_ICU */
1249 * c-file-style: "Stroustrup"
1250 * indent-tabs-mode: nil
1252 * vim: shiftwidth=4 tabstop=8 expandtab