1 /* $Id: icu_I18N.c,v 1.1 2007-10-22 12:21:39 adam Exp $
2 Copyright (c) 2006-2007, Index Data.
4 This file is part of Pazpar2.
6 Pazpar2 is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with Pazpar2; see the file LICENSE. If not, write to the
18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
28 #include <yaz/timing.h>
33 #include <yaz/icu_I18N.h>
41 #include <unicode/ustring.h> /* some more string fcns*/
42 #include <unicode/uchar.h> /* char names */
45 //#include <unicode/ustdio.h>
46 //#include <unicode/utypes.h> /* Basic ICU data types */
47 #include <unicode/ucol.h>
48 //#include <unicode/ucnv.h> /* C Converter API */
49 //#include <unicode/uloc.h>
50 //#include <unicode/ubrk.h>
51 /* #include <unicode/unistr.h> */
/* Tests an ICU status code. When U_FAILURE(status) holds, the numeric code
 * and the text from u_errorName(status) are supplied to a message with the
 * layout ICU: code name.
 * NOTE(review): the call that consumes the format string and the return
 * statements are not visible in this view -- confirm the return convention. */
56 int icu_check_status (UErrorCode status)
58 if(U_FAILURE(status)){
60 "ICU: %d %s\n", status, u_errorName(status));
/* Creates a UTF-16 buffer object. The struct is obtained from malloc, then
 * storage for `capacity` UChar elements is allocated, element 0 is set to 0
 * and utf16_cap records the capacity.
 * NOTE(review): no malloc result is checked on the visible lines, and whether
 * the storage allocation is guarded for capacity == 0 cannot be seen here. */
69 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
71 struct icu_buf_utf16 * buf16
72 = (struct icu_buf_utf16 *) malloc(sizeof(struct icu_buf_utf16));
79 buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
80 buf16->utf16[0] = (UChar) 0;
81 buf16->utf16_cap = capacity;
/* Changes the UChar storage of buf16 to `capacity` elements: malloc when the
 * buffer has no storage yet, realloc otherwise. Afterwards element 0 is set
 * to 0 and utf16_cap is updated, so the buffer does not keep its previous
 * content as a string.
 * NOTE(review): the conditions selecting between the resize path and the
 * trailing reset-to-zero path are not visible in this view, and neither
 * allocation result is checked on the visible lines. */
86 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
91 if (0 == buf16->utf16)
92 buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
/* existing storage is resized; the assignment target is on a line not shown */
95 = (UChar *) realloc(buf16->utf16, sizeof(UChar) * capacity);
96 buf16->utf16[0] = (UChar) 0;
98 buf16->utf16_cap = capacity;
/* alternate path: length and capacity are both reset to zero */
104 buf16->utf16_len = 0;
105 buf16->utf16_cap = 0;
/* Copies the content of src16 into dest16. If dest16 has less capacity than
 * the source length, it is first resized to twice that length. u_strncpy
 * then copies src16->utf16_len UChar elements and dest16->utf16_len is set
 * to the same length.
 * NOTE(review): argument checks and the return statement are not visible in
 * this view. */
113 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
114 struct icu_buf_utf16 * src16)
120 if (dest16->utf16_cap < src16->utf16_len)
121 icu_buf_utf16_resize(dest16, src16->utf16_len * 2);
123 u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len);
124 dest16->utf16_len = src16->utf16_len;
/* NOTE(review): only the signature is visible in this view. Presumably
 * releases the storage and struct obtained in icu_buf_utf16_create --
 * confirm against the full file. */
130 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16)
/* Creates a UTF-8 buffer object. The struct is obtained from malloc, then
 * storage for `capacity` uint8_t elements is allocated, byte 0 is set to 0
 * and utf8_cap records the capacity.
 * NOTE(review): no malloc result is checked on the visible lines, and whether
 * the storage allocation is guarded for capacity == 0 cannot be seen here. */
144 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
146 struct icu_buf_utf8 * buf8
147 = (struct icu_buf_utf8 *) malloc(sizeof(struct icu_buf_utf8));
154 buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
155 buf8->utf8[0] = (uint8_t) 0;
156 buf8->utf8_cap = capacity;
/* Changes the byte storage of buf8 to `capacity` elements, using malloc on
 * one path and realloc of the existing storage on the other, then records
 * the new capacity in utf8_cap.
 * NOTE(review): the conditions choosing between malloc and realloc, the
 * handling of utf8_len and the return statement are not visible in this
 * view; neither allocation result is checked on the visible lines. */
163 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
169 buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
/* existing storage is resized; the assignment target is on a line not shown */
172 = (uint8_t *) realloc(buf8->utf8,
173 sizeof(uint8_t) * capacity);
174 buf8->utf8_cap = capacity;
/* Copies the bytes of src8 into dest8. If dest8 has less capacity than the
 * source length, it is first resized to twice that length; strncpy then
 * copies up to src8->utf8_len bytes.
 * NOTE(review): no update of dest8->utf8_len and no return statement are
 * visible in this view -- confirm against the full file. */
189 struct icu_buf_utf8 * icu_buf_utf8_copy(struct icu_buf_utf8 * dest8,
190 struct icu_buf_utf8 * src8)
197 if (dest8->utf8_cap < src8->utf8_len)
198 icu_buf_utf8_resize(dest8, src8->utf8_len * 2);
200 strncpy((char*) dest8->utf8, (char*) src8->utf8, src8->utf8_len);
/* Returns the content of src8 as a NUL-terminated C string. The pointer
 * returned on the main path refers to the buffer's own storage, not a copy.
 * NOTE(review): the value returned for a null or empty buffer is on a line
 * not visible in this view. */
206 const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8)
208 if (!src8 || src8->utf8_len == 0)
/* buffer is exactly full: grow it so one more byte fits for the terminator */
210 if (src8->utf8_len == src8->utf8_cap)
211 src8 = icu_buf_utf8_resize(src8, src8->utf8_len * 2 + 1);
/* terminate directly behind the last content byte */
212 src8->utf8[src8->utf8_len] = '\0';
213 return (const char *) src8->utf8;
/* NOTE(review): only the signature is visible in this view. Presumably
 * releases the storage and struct obtained in icu_buf_utf8_create --
 * confirm against the full file. */
217 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8)
/* Converts the UTF-8 bytes held in src8 into UTF-16 stored in dest16 using
 * u_strFromUTF8. If the first attempt reports U_BUFFER_OVERFLOW_ERROR,
 * dest16 is resized to twice utf16_len, *status is reset to U_ZERO_ERROR and
 * the conversion is run once more. On success with a length that fits the
 * capacity, dest16->utf16_len is set; on the other path dest16 is emptied.
 * NOTE(review): the argument through which utf16_len is filled in and the
 * return statement are on lines not visible in this view. */
228 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
229 struct icu_buf_utf8 * src8,
232 int32_t utf16_len = 0;
234 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
236 (const char *) src8->utf8, src8->utf8_len, status);
238 // check for buffer overflow, resize and retry
239 if (*status == U_BUFFER_OVERFLOW_ERROR
240 //|| dest16->utf16_len > dest16->utf16_cap
242 icu_buf_utf16_resize(dest16, utf16_len * 2);
243 *status = U_ZERO_ERROR;
244 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
246 (const char *) src8->utf8, src8->utf8_len, status);
249 //if (*status != U_BUFFER_OVERFLOW_ERROR
/* accept the result only if ICU reported success and the length fits */
250 if (U_SUCCESS(*status)
251 && utf16_len <= dest16->utf16_cap)
252 dest16->utf16_len = utf16_len;
/* otherwise leave dest16 as an empty string */
254 dest16->utf16[0] = (UChar) 0;
255 dest16->utf16_len = 0;
/* Converts a NUL-terminated UTF-8 C string into UTF-16 stored in dest16.
 * The source length is taken with strlen, then the same convert / resize /
 * retry sequence as for buffer input is applied: on
 * U_BUFFER_OVERFLOW_ERROR dest16 is resized to twice utf16_len, *status is
 * reset and u_strFromUTF8 is run again. On success with a fitting length,
 * dest16->utf16_len is set; on the other path dest16 is emptied.
 * NOTE(review): src8cstr is passed to strlen without a null check on the
 * visible lines; the return statement is not visible in this view. */
263 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
264 const char * src8cstr,
267 size_t src8cstr_len = 0;
268 int32_t utf16_len = 0;
270 src8cstr_len = strlen(src8cstr);
272 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
274 src8cstr, src8cstr_len, status);
276 // check for buffer overflow, resize and retry
277 if (*status == U_BUFFER_OVERFLOW_ERROR
278 //|| dest16->utf16_len > dest16->utf16_cap
280 icu_buf_utf16_resize(dest16, utf16_len * 2);
281 *status = U_ZERO_ERROR;
282 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
284 src8cstr, src8cstr_len, status);
287 // if (*status != U_BUFFER_OVERFLOW_ERROR
/* accept the result only if ICU reported success and the length fits */
288 if (U_SUCCESS(*status)
289 && utf16_len <= dest16->utf16_cap)
290 dest16->utf16_len = utf16_len;
/* otherwise leave dest16 as an empty string */
292 dest16->utf16[0] = (UChar) 0;
293 dest16->utf16_len = 0;
/* Converts the UTF-16 content of src16 into UTF-8 stored in dest8 using
 * u_strToUTF8. If the first attempt reports U_BUFFER_OVERFLOW_ERROR, dest8
 * is resized to twice utf8_len, *status is reset to U_ZERO_ERROR and the
 * conversion is run once more. On success with a length that fits the
 * capacity, dest8->utf8_len is set; on the other path byte 0 is cleared.
 * NOTE(review): the argument through which utf8_len is filled in, any reset
 * of dest8->utf8_len on failure and the return statement are on lines not
 * visible in this view. */
302 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
303 struct icu_buf_utf16 * src16,
306 int32_t utf8_len = 0;
308 u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
310 src16->utf16, src16->utf16_len, status);
312 // check for buffer overflow, resize and retry
313 if (*status == U_BUFFER_OVERFLOW_ERROR
314 //|| dest8->utf8_len > dest8->utf8_cap
316 icu_buf_utf8_resize(dest8, utf8_len * 2);
317 *status = U_ZERO_ERROR;
318 u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
320 src16->utf16, src16->utf16_len, status);
324 //if (*status != U_BUFFER_OVERFLOW_ERROR
/* accept the result only if ICU reported success and the length fits */
325 if (U_SUCCESS(*status)
326 && utf8_len <= dest8->utf8_cap)
327 dest8->utf8_len = utf8_len;
329 dest8->utf8[0] = (uint8_t) 0;
/* Allocates an icu_casemap, copies `locale` into its locale field with
 * strcpy and stores the action character. A switch on the action follows;
 * one of its paths destroys the freshly created object again.
 * NOTE(review): strcpy is unbounded and the size of the locale field is not
 * visible here, so a long locale string may overflow it -- confirm. The case
 * labels, the malloc check and the return statements are not visible in this
 * view. */
338 struct icu_casemap * icu_casemap_create(const char *locale, char action,
341 struct icu_casemap * casemap
342 = (struct icu_casemap *) malloc(sizeof(struct icu_casemap));
343 strcpy(casemap->locale, locale);
344 casemap->action = action;
346 switch(casemap->action) {
356 icu_casemap_destroy(casemap);
/* NOTE(review): only the signature is visible in this view. Presumably
 * releases the struct obtained in icu_casemap_create -- confirm against the
 * full file. */
363 void icu_casemap_destroy(struct icu_casemap * casemap)
/* Applies the case mapping configured in `casemap` to src16, writing into
 * dest16. The work is delegated to icu_utf16_casemap with the locale and
 * action stored in the casemap object, and that call's result is returned.
 * NOTE(review): any argument checks before the call are not visible in this
 * view. */
370 int icu_casemap_casemap(struct icu_casemap * casemap,
371 struct icu_buf_utf16 * dest16,
372 struct icu_buf_utf16 * src16,
378 return icu_utf16_casemap(dest16, src16,
379 casemap->locale, casemap->action, status);
/* Case-maps src16 into dest16 with one of u_strToLower, u_strToUpper,
 * u_strToTitle or u_strFoldCase, selected by `action`; an unhandled action
 * returns U_UNSUPPORTED_ERROR. If the first attempt reports
 * U_BUFFER_OVERFLOW_ERROR and dest16 is a different buffer than src16,
 * dest16 is resized to twice the reported length, *status is reset and the
 * same mapping is run again. On success with a fitting length,
 * dest16->utf16_len is set; on the other path dest16 is emptied.
 * NOTE(review): the action characters (case labels), the trailing arguments
 * of the ICU calls and the final return are on lines not visible in this
 * view. */
383 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
384 struct icu_buf_utf16 * src16,
385 const char *locale, char action,
388 int32_t dest16_len = 0;
/* first attempt with the current capacity of dest16 */
392 dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
393 src16->utf16, src16->utf16_len,
397 dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
398 src16->utf16, src16->utf16_len,
402 dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
403 src16->utf16, src16->utf16_len,
407 dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
408 src16->utf16, src16->utf16_len,
409 U_FOLD_CASE_DEFAULT, status);
413 return U_UNSUPPORTED_ERROR;
417 // check for buffer overflow, resize and retry
418 if (*status == U_BUFFER_OVERFLOW_ERROR
419 && dest16 != src16 // do not resize if in-place conversion
420 //|| dest16_len > dest16->utf16_cap
422 icu_buf_utf16_resize(dest16, dest16_len * 2);
423 *status = U_ZERO_ERROR;
/* second attempt with the enlarged destination */
428 dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
429 src16->utf16, src16->utf16_len,
433 dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
434 src16->utf16, src16->utf16_len,
438 dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
439 src16->utf16, src16->utf16_len,
443 dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
444 src16->utf16, src16->utf16_len,
445 U_FOLD_CASE_DEFAULT, status);
449 return U_UNSUPPORTED_ERROR;
/* accept the result only if ICU reported success and the length fits */
454 if (U_SUCCESS(*status)
455 && dest16_len <= dest16->utf16_cap)
456 dest16->utf16_len = dest16_len;
/* otherwise leave dest16 as an empty string */
458 dest16->utf16[0] = (UChar) 0;
459 dest16->utf16_len = 0;
/* Computes the collation sort key of src16 with collator `coll` and stores
 * the key bytes in dest8. ucol_getSortKey is called with the current
 * capacity; if the reported key length exceeds it, dest8 is resized to twice
 * that length and the call is repeated. The key length is then stored in
 * dest8->utf8_len, or byte 0 is cleared on the other path.
 * The visible ucol_getSortKey calls take no status argument, so the
 * U_SUCCESS(*status) test reflects the value the caller passed in.
 * NOTE(review): the cast on the final clearing line uses UChar, whereas the
 * utf8 buffer is handled as uint8_t elsewhere in this file -- harmless for
 * the value 0, but confirm intent. The rest of the success condition and the
 * return statement are not visible in this view. */
467 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
468 struct icu_buf_utf8 * dest8,
469 struct icu_buf_utf16 * src16,
473 int32_t sortkey_len = 0;
475 sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
476 dest8->utf8, dest8->utf8_cap);
478 // check for buffer overflow, resize and retry
479 if (sortkey_len > dest8->utf8_cap) {
480 icu_buf_utf8_resize(dest8, sortkey_len * 2);
481 sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
482 dest8->utf8, dest8->utf8_cap);
485 if (U_SUCCESS(*status)
487 dest8->utf8_len = sortkey_len;
489 dest8->utf8[0] = (UChar) 0;
/* Allocates an icu_tokenizer, copies `locale` into it with strcpy, stores
 * the action character and zeroes the attached buffer pointer and all token
 * counters and offsets. A switch on the action opens an ICU break iterator
 * of type line, sentence, word, character or title for that locale; an
 * unhandled action sets *status to U_UNSUPPORTED_ERROR. If the status is not
 * a success afterwards, the tokenizer is destroyed again.
 * NOTE(review): strcpy is unbounded and the size of the locale field is not
 * visible here, so a long locale string may overflow it -- confirm. The case
 * labels (action characters), the assignment targets of ubrk_open, the malloc
 * check and the return statements are not visible in this view. */
498 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
501 struct icu_tokenizer * tokenizer
502 = (struct icu_tokenizer *) malloc(sizeof(struct icu_tokenizer));
504 strcpy(tokenizer->locale, locale);
505 tokenizer->action = action;
507 tokenizer->buf16 = 0;
508 tokenizer->token_count = 0;
509 tokenizer->token_id = 0;
510 tokenizer->token_start = 0;
511 tokenizer->token_end = 0;
514 switch(tokenizer->action) {
517 = ubrk_open(UBRK_LINE, tokenizer->locale,
522 = ubrk_open(UBRK_SENTENCE, tokenizer->locale,
527 = ubrk_open(UBRK_WORD, tokenizer->locale,
532 = ubrk_open(UBRK_CHARACTER, tokenizer->locale,
537 = ubrk_open(UBRK_TITLE, tokenizer->locale,
541 *status = U_UNSUPPORTED_ERROR;
546 // ICU error stuff is a very funny business
547 if (U_SUCCESS(*status))
/* failure path: release the partially set up tokenizer */
551 icu_tokenizer_destroy(tokenizer);
/* Tears down a tokenizer; the visible part closes its ICU break iterator
 * with ubrk_close.
 * NOTE(review): any null checks and the release of the struct itself are on
 * lines not visible in this view. */
555 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
559 ubrk_close(tokenizer->bi);
/* Attaches the text in src16 to the tokenizer. After checking that the
 * tokenizer, its break iterator and src16 are all non-null, the tokenizer
 * keeps the src16 pointer (no copy is made), resets its token counters and
 * offsets to 0 and hands the text to the break iterator with ubrk_setText.
 * NOTE(review): the values returned on the argument-check, failure and
 * success paths are on lines not visible in this view. */
564 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
565 struct icu_buf_utf16 * src16,
568 if (!tokenizer || !tokenizer->bi || !src16)
572 tokenizer->buf16 = src16;
573 tokenizer->token_count = 0;
574 tokenizer->token_id = 0;
575 tokenizer->token_start = 0;
576 tokenizer->token_end = 0;
578 ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
581 if (U_FAILURE(*status))
/* Advances the tokenizer to the next token of the attached buffer and copies
 * that token into tkn16. The token start is ubrk_first on the first call
 * (recognised by token_end == 0) and the previous token end afterwards; the
 * token end comes from ubrk_next, with UBRK_DONE replaced by the length of
 * the attached buffer. The tokenizer's counters and offsets are updated, and
 * tkn16 is resized to twice the token length when its capacity is too small
 * before the UChars are copied with u_strncpy.
 * NOTE(review): the declarations of tkn_end and tkn_len, the conditions
 * around the counter updates, the guard on tkn16, the length argument of
 * u_strncpy and all return statements are on lines not visible in this
 * view. */
587 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
588 struct icu_buf_utf16 * tkn16,
591 int32_t tkn_start = 0;
596 if (!tokenizer || !tokenizer->bi
597 || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
600 // never change tokenizer->buf16 and keep always invariant
601 // 0 <= tokenizer->token_start
602 // <= tokenizer->token_end
603 // <= tokenizer->buf16->utf16_len
604 // returns length of token
606 if (0 == tokenizer->token_end) // first call
607 tkn_start = ubrk_first(tokenizer->bi);
608 else //successive calls
609 tkn_start = tokenizer->token_end;
612 tkn_end = ubrk_next(tokenizer->bi);
614 // repairing invariant at end of ubrk, which is UBRK_DONE = -1
615 if (UBRK_DONE == tkn_end)
616 tkn_end = tokenizer->buf16->utf16_len;
618 // copy out if everything is well
619 if(U_FAILURE(*status))
622 // everything OK, now update internal state
623 tkn_len = tkn_end - tkn_start;
626 tokenizer->token_count++;
627 tokenizer->token_id++;
629 tokenizer->token_id = 0;
631 tokenizer->token_start = tkn_start;
632 tokenizer->token_end = tkn_end;
635 // copying into token buffer if it exists
637 if (tkn16->utf16_cap < tkn_len)
638 icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
640 u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
643 tkn16->utf16_len = tkn_len;
/* Returns the tokenizer's token_id field. The pointer is dereferenced
 * without a null check on the visible lines. */
650 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer)
652 return tokenizer->token_id;
/* Returns the tokenizer's token_start field, the start offset recorded for
 * the current token. The pointer is dereferenced without a null check on the
 * visible lines. */
655 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer)
657 return tokenizer->token_start;
/* Returns the tokenizer's token_end field, the end offset recorded for the
 * current token. The pointer is dereferenced without a null check on the
 * visible lines. */
660 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer)
662 return tokenizer->token_end;
/* Returns the length of the current token, computed as token_end minus
 * token_start. The pointer is dereferenced without a null check on the
 * visible lines. */
665 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer)
667 return (tokenizer->token_end - tokenizer->token_start);
/* Returns the tokenizer's token_count field. The pointer is dereferenced
 * without a null check on the visible lines. */
670 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
672 return tokenizer->token_count;
/* Allocates an icu_normalizer, stores the action character, converts the
 * UTF-8 `rules` string into a newly created UTF-16 buffer (rules16) and, in
 * a switch on the action, opens an ICU transliterator from those rules with
 * utrans_openU. An unhandled action sets *status to U_UNSUPPORTED_ERROR. If
 * the status is not a success afterwards, the normalizer is destroyed again.
 * NOTE(review): the case labels (action characters), the direction and rule
 * arguments of utrans_openU, the malloc check and the return statements are
 * on lines not visible in this view. */
677 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
681 struct icu_normalizer * normalizer
682 = (struct icu_normalizer *) malloc(sizeof(struct icu_normalizer));
684 normalizer->action = action;
685 normalizer->trans = 0;
686 normalizer->rules16 = icu_buf_utf16_create(0);
687 icu_utf16_from_utf8_cstr(normalizer->rules16, rules, status);
689 switch(normalizer->action) {
692 = utrans_openU(normalizer->rules16->utf16,
693 normalizer->rules16->utf16_len,
696 normalizer->parse_error, status);
697 // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans);
701 = utrans_openU(normalizer->rules16->utf16,
702 normalizer->rules16->utf16_len,
705 normalizer->parse_error, status);
706 // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans);
709 *status = U_UNSUPPORTED_ERROR;
714 if (U_SUCCESS(*status))
/* failure path: release the partially set up normalizer */
718 icu_normalizer_destroy(normalizer);
/* Tears down a normalizer: destroys its rules16 buffer when present and
 * closes its transliterator with utrans_close when one was opened.
 * NOTE(review): any null check on `normalizer` and the release of the struct
 * itself are on lines not visible in this view. */
723 void icu_normalizer_destroy(struct icu_normalizer * normalizer){
725 if (normalizer->rules16)
726 icu_buf_utf16_destroy(normalizer->rules16);
727 if (normalizer->trans)
729 // yaz_log(YLOG_LOG, "utrans_close %p", normalizer->trans);
730 utrans_close(normalizer->trans);
/* Runs the normalizer's transliterator over the text of src16 and leaves the
 * result in dest16. The source is first copied into dest16, then
 * utrans_transUChars works on dest16's storage and updates
 * dest16->utf16_len. On an ICU failure dest16 is emptied. The function
 * returns dest16->utf16_len.
 * NOTE(review): the address of src16->utf16_len is passed to
 * utrans_transUChars as well, so the source length field may be modified by
 * the call -- confirm against the ICU documentation. The capacity argument
 * of the call and the early-return values are on lines not visible in this
 * view. */
738 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
739 struct icu_buf_utf16 * dest16,
740 struct icu_buf_utf16 * src16,
743 if (!normalizer || !normalizer->trans || !src16 || !dest16)
746 if (!icu_buf_utf16_copy(dest16, src16))
749 utrans_transUChars (normalizer->trans,
750 dest16->utf16, &(dest16->utf16_len),
752 0, &(src16->utf16_len), status);
754 if (U_FAILURE(*status)){
755 dest16->utf16[0] = (UChar) 0;
756 dest16->utf16_len = 0;
759 return dest16->utf16_len;
/* Allocates one chain step of the given type and creates the helper object
 * that type needs: a casemap (chain locale, first byte of `rule` as action),
 * a normalizer (`rule` as transliteration rules, action 'f') or a tokenizer
 * (chain locale, first byte of `rule` as action). The display, index and
 * sortkey types create no helper object on the visible lines.
 * NOTE(review): the switch header, the initialisation of the step's fields
 * (including how buf16 is stored), the malloc check and the return
 * statements are on lines not visible in this view. */
765 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
766 enum icu_chain_step_type type,
767 const uint8_t * rule,
768 struct icu_buf_utf16 * buf16,
771 struct icu_chain_step * step = 0;
773 if(!chain || !type || !rule)
776 step = (struct icu_chain_step *) malloc(sizeof(struct icu_chain_step));
782 // create auxiliary objects
784 case ICU_chain_step_type_display:
786 case ICU_chain_step_type_index:
788 case ICU_chain_step_type_sortkey:
790 case ICU_chain_step_type_casemap:
791 step->u.casemap = icu_casemap_create((char *) chain->locale,
792 (char) rule[0], status);
794 case ICU_chain_step_type_normalize:
795 step->u.normalizer = icu_normalizer_create((char *) rule, 'f', status);
797 case ICU_chain_step_type_tokenize:
798 step->u.tokenizer = icu_tokenizer_create((char *) chain->locale,
799 (char) rule[0], status);
/* Destroys a chain step and, recursively, the steps reachable through its
 * `previous` link. For casemap, normalize and tokenize steps the helper
 * object and the step's UTF-16 buffer are destroyed; the display, index and
 * sortkey cases release nothing on the visible lines.
 * NOTE(review): the null check on `step`, the switch header and the release
 * of the step struct itself are on lines not visible in this view. */
809 void icu_chain_step_destroy(struct icu_chain_step * step){
814 icu_chain_step_destroy(step->previous);
817 case ICU_chain_step_type_display:
819 case ICU_chain_step_type_index:
821 case ICU_chain_step_type_sortkey:
823 case ICU_chain_step_type_casemap:
824 icu_casemap_destroy(step->u.casemap);
825 icu_buf_utf16_destroy(step->buf16);
827 case ICU_chain_step_type_normalize:
828 icu_normalizer_destroy(step->u.normalizer);
829 icu_buf_utf16_destroy(step->buf16);
831 case ICU_chain_step_type_tokenize:
832 icu_tokenizer_destroy(step->u.tokenizer);
833 icu_buf_utf16_destroy(step->buf16);
/* Allocates an icu_chain and initialises it. `identifier` is copied with a
 * limit of 128 bytes and `locale` with a limit of 16 bytes, each followed by
 * an explicit terminator in the last slot, so longer inputs are truncated.
 * The token count is zeroed and empty UTF-8 buffers (display8, norm8, sort8)
 * plus an empty UTF-16 source buffer (src16) are created.
 * NOTE(review): the literals 128 and 16 presumably match the field sizes in
 * the struct declaration -- confirm against the header. The malloc check,
 * the initialisation of chain->steps and the return statement are on lines
 * not visible in this view. */
843 struct icu_chain * icu_chain_create(const uint8_t * identifier,
844 const uint8_t * locale)
847 struct icu_chain * chain
848 = (struct icu_chain *) malloc(sizeof(struct icu_chain));
850 strncpy((char *) chain->identifier, (const char *) identifier, 128);
851 chain->identifier[128 - 1] = '\0';
852 strncpy((char *) chain->locale, (const char *) locale, 16);
853 chain->locale[16 - 1] = '\0';
855 chain->token_count = 0;
857 chain->display8 = icu_buf_utf8_create(0);
858 chain->norm8 = icu_buf_utf8_create(0);
859 chain->sort8 = icu_buf_utf8_create(0);
861 chain->src16 = icu_buf_utf16_create(0);
/* Destroys a chain: its three UTF-8 output buffers, its UTF-16 source buffer
 * and the list of steps starting at chain->steps.
 * NOTE(review): the null check on `chain` and the release of the chain
 * struct itself are on lines not visible in this view. */
869 void icu_chain_destroy(struct icu_chain * chain)
872 icu_buf_utf8_destroy(chain->display8);
873 icu_buf_utf8_destroy(chain->norm8);
874 icu_buf_utf8_destroy(chain->sort8);
876 icu_buf_utf16_destroy(chain->src16);
878 icu_chain_step_destroy(chain->steps);
/* Builds an icu_chain from an XML element named icu_chain. The element's id
 * and locale attributes must both be present and non-empty; they are used to
 * create the chain. Each child element then adds one step according to its
 * name: casemap, normalize and tokenize pass the child's rule attribute,
 * while display, index and sortkey pass an empty rule. If a step could not
 * be inserted or the ICU status reports failure, the chain is destroyed.
 * NOTE(review): xml_rule is passed on without a null check for children that
 * lack a rule attribute, and no xmlFree of the xmlGetProp results is visible
 * on these lines -- confirm whether they are released elsewhere. The return
 * statements and the handling of unknown child names are on lines not
 * visible in this view. */
885 struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
886 UErrorCode * status){
889 struct icu_chain * chain = 0;
892 ||xml_node->type != XML_ELEMENT_NODE
893 || strcmp((const char *) xml_node->name, "icu_chain"))
897 xmlChar *xml_id = xmlGetProp(xml_node, (xmlChar *) "id");
898 xmlChar *xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale");
900 if (!xml_id || !strlen((const char *) xml_id)
901 || !xml_locale || !strlen((const char *) xml_locale))
904 chain = icu_chain_create((const uint8_t *) xml_id,
905 (const uint8_t *) xml_locale);
/* walk the child nodes; each element child describes one chain step */
912 for (node = xml_node->children; node; node = node->next)
914 if (node->type != XML_ELEMENT_NODE)
917 xmlChar *xml_rule = xmlGetProp(node, (xmlChar *) "rule");
918 struct icu_chain_step * step = 0;
920 if (!strcmp((const char *) node->name,
921 (const char *) "casemap")){
922 step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
923 (const uint8_t *) xml_rule, status);
925 else if (!strcmp((const char *) node->name,
926 (const char *) "normalize")){
927 step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
928 (const uint8_t *) xml_rule, status);
930 else if (!strcmp((const char *) node->name,
931 (const char *) "tokenize")){
932 step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
933 (const uint8_t *) xml_rule, status);
935 else if (!strcmp((const char *) node->name,
936 (const char *) "display")){
937 step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
938 (const uint8_t *) "", status);
940 else if (!strcmp((const char *) node->name,
941 (const char *) "index")){
942 step = icu_chain_insert_step(chain, ICU_chain_step_type_index,
943 (const uint8_t *) "", status);
945 else if (!strcmp((const char *) node->name,
946 (const char *) "sortkey")){
947 step = icu_chain_insert_step(chain, ICU_chain_step_type_sortkey,
948 (const uint8_t *) "", status);
/* a missing step or an ICU failure invalidates the chain built so far */
952 if (!step || U_FAILURE(*status)){
953 icu_chain_destroy(chain);
/* Creates a new step of the given type and makes it the head of the chain's
 * step list: the new step's `previous` link points at the former head and
 * chain->steps is set to the new step. Casemap, normalize and tokenize steps
 * get their own empty UTF-16 output buffer; for display, index and sortkey
 * no buffer creation is visible. A source buffer is also selected (the head
 * step's buffer if it has one, otherwise chain->src16).
 * NOTE(review): how src16 is used afterwards, the switch header, the check
 * of the created step before it is dereferenced and the return statements
 * are on lines not visible in this view. */
965 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
966 enum icu_chain_step_type type,
967 const uint8_t * rule,
970 struct icu_chain_step * step = 0;
971 struct icu_buf_utf16 * src16 = 0;
972 struct icu_buf_utf16 * buf16 = 0;
974 if (!chain || !type || !rule)
977 // assign utf16 src buffers as needed
978 if (chain->steps && chain->steps->buf16)
979 src16 = chain->steps->buf16;
980 else if (chain->src16)
981 src16 = chain->src16;
986 // create utf16 destination buffers as needed, or
988 case ICU_chain_step_type_display:
991 case ICU_chain_step_type_index:
994 case ICU_chain_step_type_sortkey:
997 case ICU_chain_step_type_casemap:
998 buf16 = icu_buf_utf16_create(0);
1000 case ICU_chain_step_type_normalize:
1001 buf16 = icu_buf_utf16_create(0);
1003 case ICU_chain_step_type_tokenize:
1004 buf16 = icu_buf_utf16_create(0);
1010 // create actual chain step with this buffer
1011 step = icu_chain_step_create(chain, type, rule, buf16, status);
/* push the new step onto the head of the chain's step list */
1013 step->previous = chain->steps;
1014 chain->steps = step;
/* Produces the next token for one step of the chain. The step's input is the
 * output buffer of its previous step, which is asked for a new token first
 * when this step needs one, or chain->src16 when there is no previous step.
 * The step then does its own work: display, index and sortkey steps convert
 * the input to UTF-8 into the chain's display8, norm8 or sort8 buffer;
 * casemap and normalize steps transform the input into step->buf16; a
 * tokenize step attaches its tokenizer to the input when a new token is
 * needed and pulls the next token into step->buf16, fetching a further
 * input token from the previous step once the current one is used up.
 * NOTE(review): the assignment targets of several calls, the handling after
 * the 0 == more_tokens test and all return statements are on lines not
 * visible in this view, so the exact return convention cannot be stated
 * from here. */
1020 int icu_chain_step_next_token(struct icu_chain * chain,
1021 struct icu_chain_step * step,
1024 struct icu_buf_utf16 * src16 = 0;
1026 //printf("icu_chain_step_next_token %d\n", (int) step);
1028 if (!chain || !chain->src16 || !step || !step->more_tokens)
1031 // assign utf16 src buffers as needed, advance in previous steps
1032 // tokens until non-zero token met, and setting stop condition
1033 if (step->previous){
1034 src16 = step->previous->buf16;
1035 if (step->need_new_token)
1036 //while (step->more_tokens && !src16->utf16_len)
1038 = icu_chain_step_next_token(chain, step->previous, status);
1040 else { // first step can only work once on chain->src16 input buffer
1041 src16 = chain->src16;
1042 step->more_tokens = 1;
1045 // stop if nothing to process
1046 // i.e new token source was not properly assigned
1047 if (!step->more_tokens || !src16) // || !src16->utf16_len
1050 //printf("icu_chain_step_next_token %d working\n", (int) step);
1053 // perform the work, eventually put this step's output in
1054 // step->buf16 or the chain's UTF8 output buffers
1055 switch(step->type) {
1056 case ICU_chain_step_type_display:
1057 icu_utf16_to_utf8(chain->display8, src16, status);
1059 case ICU_chain_step_type_index:
1060 icu_utf16_to_utf8(chain->norm8, src16, status);
1062 case ICU_chain_step_type_sortkey:
1063 icu_utf16_to_utf8(chain->sort8, src16, status);
1065 case ICU_chain_step_type_casemap:
1066 icu_casemap_casemap(step->u.casemap,
1067 step->buf16, src16, status);
1069 case ICU_chain_step_type_normalize:
1070 icu_normalizer_normalize(step->u.normalizer,
1071 step->buf16, src16, status);
1073 case ICU_chain_step_type_tokenize:
1074 // attach to new src16 token only first time during splitting
1075 if (step->need_new_token){
1076 icu_tokenizer_attach(step->u.tokenizer, src16, status);
1077 step->need_new_token = 0;
1079 // splitting one src16 token into multiple buf16 tokens
1081 = icu_tokenizer_next_token(step->u.tokenizer,
1082 step->buf16, status);
1083 // make sure to get new previous token if this one had been used up
1084 if (step->previous && !step->more_tokens){
1085 if (icu_chain_step_next_token(chain, step->previous, status)){
1086 icu_tokenizer_attach(step->u.tokenizer, src16, status);
1087 step->need_new_token = 0;
1089 = icu_tokenizer_next_token(step->u.tokenizer,
1090 step->buf16, status);
1093 if (0 == step->more_tokens)
1103 // stop further token processing if last step and
1104 // new tokens are needed from previous (non-existing) step
1105 if (!step->previous && step->need_new_token)
1106 step->more_tokens = 0;
1108 //printf("%d %d %d\n",
1109 // step->more_tokens, src16->utf16_len, step->buf16->utf16_len);
1112 if (U_FAILURE(*status))
/* Loads a new UTF-8 input string into the chain. The chain's token count is
 * cleared, every step reached through the `previous` links gets more_tokens
 * and need_new_token set to 1, and the input is converted to UTF-16 into
 * chain->src16.
 * NOTE(review): the initialisation of the step cursor and its loop header,
 * and the values returned on the argument-check, failure and success paths,
 * are on lines not visible in this view. */
1120 int icu_chain_assign_cstr(struct icu_chain * chain,
1121 const char * src8cstr,
1124 struct icu_chain_step * stp = 0;
1126 if (!chain || !src8cstr)
1131 // clear token count
1132 chain->token_count = 0;
1134 // clear all steps stop states
1137 stp->more_tokens = 1;
1138 stp->need_new_token = 1;
1139 stp = stp->previous;
1142 // finally convert UTF8 to UTF16 string
1143 icu_utf16_from_utf8_cstr(chain->src16, src8cstr, status);
1145 if (U_FAILURE(*status))
/* Advances the whole chain by one token by asking the head step
 * (chain->steps) for its next token. On the visible counting path the
 * chain's token_count is incremented and the new count is returned.
 * NOTE(review): the declaration of `success`, the test on it and the values
 * returned when there is no chain, no step or no further token are on lines
 * not visible in this view. */
1153 int icu_chain_next_token(struct icu_chain * chain,
1158 if (!chain || !chain->steps)
1161 success = icu_chain_step_next_token(chain, chain->steps, status);
1164 chain->token_count++;
1165 return chain->token_count;
/* Returns the chain's token_count field.
 * NOTE(review): any null check on `chain` before the dereference is on lines
 * not visible in this view. */
1171 int icu_chain_get_token_count(struct icu_chain * chain)
1176 return chain->token_count;
/* Returns the chain's display8 buffer as a C string via
 * icu_buf_utf8_to_cstr when that buffer exists.
 * NOTE(review): the value returned when display8 is null is on a line not
 * visible in this view. */
1181 const char * icu_chain_get_display(struct icu_chain * chain)
1183 if (chain->display8)
1184 return icu_buf_utf8_to_cstr(chain->display8);
/* Returns the chain's norm8 buffer as a C string via icu_buf_utf8_to_cstr.
 * NOTE(review): any guard before this return and the fallback value are on
 * lines not visible in this view. */
1189 const char * icu_chain_get_norm(struct icu_chain * chain)
1192 return icu_buf_utf8_to_cstr(chain->norm8);
/* Returns the chain's sort8 buffer as a C string via icu_buf_utf8_to_cstr.
 * NOTE(review): any guard before this return and the fallback value are on
 * lines not visible in this view. */
1197 const char * icu_chain_get_sort(struct icu_chain * chain)
1200 return icu_buf_utf8_to_cstr(chain->sort8);
1216 * indent-tabs-mode: nil
1218 * vim: shiftwidth=4 tabstop=8 expandtab