2 * Copyright (C) 2005-2007, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: tst_icu_I18N.c,v 1.15 2007-11-12 11:11:16 adam Exp $
9 /* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
18 #include <yaz/timing.h>
24 #include <yaz/icu_I18N.h>
29 /* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
/* Size of each of the two fixed-length arrays declared below. */
32 #define MAX_KEY_SIZE 256
/*
 * Both members are accessed through struct icu_termmap pointers further
 * down in this file (sort_key in icu_termmap_cmp, disp_term in
 * test_icu_sortmap). The opening line of the struct is not visible in
 * this excerpt.
 */
35 uint8_t sort_key[MAX_KEY_SIZE]; /* standard C string '\0' terminated */
36 char disp_term[MAX_KEY_SIZE]; /* standard C utf-8 string */
/*
 * Comparison callback that test_icu_sortmap hands to qsort().
 * The sorted array holds struct icu_termmap pointers, so vp1 and vp2
 * each point at an array slot and need one extra dereference to reach
 * the actual term map.
 */
41 int icu_termmap_cmp(const void *vp1, const void *vp2)
43 struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1;
44 struct icu_termmap *itmp2 = *(struct icu_termmap **) vp2;
/*
 * cmp receives the strcmp() result of the two sort_key buffers, so the
 * keys are compared as NUL-terminated byte strings. The declaration of
 * cmp and the return statement are not visible in this excerpt.
 */
48 cmp = strcmp((const char *)itmp1->sort_key,
49 (const char *)itmp2->sort_key);
/*
 * Runs one case-mapping check: src8cstr is converted from UTF-8 to
 * UTF-16, case-mapped with the given locale and action, converted back
 * to UTF-8 and compared against chk8cstr.
 *
 * locale   - locale name passed through to icu_utf16_casemap
 * action   - single-character selector passed through to
 *            icu_utf16_casemap; callers in this file pass l, u, f and t
 *            (presumably lower, upper, fold and title -- confirm against
 *            the icu_utf16_casemap implementation)
 * src8cstr - input text
 * chk8cstr - expected text after case mapping
 *
 * The returned value is not visible in this excerpt; callers wrap the
 * call in YAZ_CHECK, so presumably non-zero means the strings matched.
 */
56 int test_icu_casemap(const char * locale, char action,
57 const char * src8cstr, const char * chk8cstr)
60 UErrorCode status = U_ZERO_ERROR;
/*
 * NOTE(review): src8 is created here and destroyed at the end, but no
 * other use of it is visible in this excerpt.
 */
62 struct icu_buf_utf8 * src8 = icu_buf_utf8_create(0);
63 struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0);
64 struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
65 struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0);
/* Lengths are kept only for the diagnostic output further down. */
68 int src8cstr_len = strlen(src8cstr);
69 int chk8cstr_len = strlen(chk8cstr);
71 /* converting to UTF16 */
72 icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
74 /* perform case mapping */
75 icu_utf16_casemap(dest16, src16, locale, action, &status);
77 /* converting to UTF8 */
78 icu_utf16_to_utf8(dest8, dest16, &status);
/*
 * The visible operands require equal length and identical bytes; the
 * first operand of the condition is not visible in this excerpt.
 * NOTE(review): strlen(chk8cstr) is recomputed here although
 * chk8cstr_len already holds it.
 */
82 /* determine success */
84 && (dest8->utf8_len == strlen(chk8cstr))
85 && !strcmp(chk8cstr, (const char *) dest8->utf8))
/*
 * Diagnostic dump of the input, the actual result and the expected
 * result together with their lengths. The condition guarding this
 * output is not visible in this excerpt.
 */
93 printf("original string: '%s' (%d)\n", src8cstr, src8cstr_len);
94 printf("icu_casemap '%s:%c' '%s' (%d)\n",
95 locale, action, dest8->utf8, dest8->utf8_len);
96 printf("expected string: '%s' (%d)\n", chk8cstr, chk8cstr_len);
99 /* clean the buffers */
100 icu_buf_utf8_destroy(src8);
101 icu_buf_utf8_destroy(dest8);
102 icu_buf_utf16_destroy(src16);
103 icu_buf_utf16_destroy(dest16);
111 /* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
/*
 * Case-mapping test suite: feeds one mixed-case sentence per locale
 * (en, da, de) through test_icu_casemap with each of the actions
 * l, u, f and t and checks the result against a hand-written expected
 * string. No use of argc or argv is visible in this excerpt.
 */
113 void test_icu_I18N_casemap(int argc, char **argv)
118 /* successful tests */
119 YAZ_CHECK(test_icu_casemap("en", 'l',
120 "A ReD fOx hunTS sQUirriLs",
121 "a red fox hunts squirrils"));
123 YAZ_CHECK(test_icu_casemap("en", 'u',
124 "A ReD fOx hunTS sQUirriLs",
125 "A RED FOX HUNTS SQUIRRILS"));
127 YAZ_CHECK(test_icu_casemap("en", 'f',
128 "A ReD fOx hunTS sQUirriLs",
129 "a red fox hunts squirrils"));
131 YAZ_CHECK(test_icu_casemap("en", 't',
132 "A ReD fOx hunTS sQUirriLs",
133 "A Red Fox Hunts Squirrils"));
/* Danish sentence exercising the non-ASCII letters of that alphabet. */
138 /* success expected */
139 YAZ_CHECK(test_icu_casemap("da", 'l',
140 "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
141 "åh æble, øs fløde i åen efter blåbærgrøden"));
143 YAZ_CHECK(test_icu_casemap("da", 'u',
144 "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
145 "ÅH ÆBLE, ØS FLØDE I ÅEN EFTER BLÅBÆRGRØDEN"));
147 YAZ_CHECK(test_icu_casemap("da", 'f',
148 "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
149 "åh æble, øs fløde i åen efter blåbærgrøden"));
151 YAZ_CHECK(test_icu_casemap("da", 't',
152 "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
153 "Åh Æble, Øs Fløde I Åen Efter Blåbærgrøden"));
/*
 * German sentence. The expected strings differ per action for the
 * sharp s: it is kept for l and t, becomes SS for u and ss for f.
 */
157 /* success expected */
158 YAZ_CHECK(test_icu_casemap("de", 'l',
159 "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
160 "zwölf ärgerliche würste rollen über die straße"));
162 YAZ_CHECK(test_icu_casemap("de", 'u',
163 "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
164 "ZWÖLF ÄRGERLICHE WÜRSTE ROLLEN ÜBER DIE STRASSE"));
166 YAZ_CHECK(test_icu_casemap("de", 'f',
167 "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
168 "zwölf ärgerliche würste rollen über die strasse"));
170 YAZ_CHECK(test_icu_casemap("de", 't',
171 "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
172 "Zwölf Ärgerliche Würste Rollen Über Die Straße"));
177 /* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
/*
 * Runs one collation check: builds a struct icu_termmap for every entry
 * of src_list, sorts the entries with qsort() and icu_termmap_cmp, and
 * compares the resulting order of display terms with chk_list.
 *
 * locale       - locale name used to open the ICU collator
 * src_list_len - number of entries in both src_list and chk_list
 * src_list     - terms in arbitrary order
 * chk_list     - the same terms in the expected sorted order
 *
 * The returned value is not visible in this excerpt; callers wrap the
 * call in YAZ_CHECK, so presumably non-zero means the order matched.
 */
179 int test_icu_sortmap(const char * locale, int src_list_len,
180 const char ** src_list, const char ** chk_list)
184 UErrorCode status = U_ZERO_ERROR;
/* Scratch buffers reused for every term in the loop below. */
186 struct icu_buf_utf8 * buf8 = icu_buf_utf8_create(0);
187 struct icu_buf_utf16 * buf16 = icu_buf_utf16_create(0);
/* Variable-length array sized by the caller-supplied element count. */
191 struct icu_termmap * list[src_list_len];
193 UCollator *coll = ucol_open(locale, &status);
194 icu_check_status(status);
/* The body taken when opening the collator failed is not visible here. */
196 if(U_FAILURE(status))
199 /* assigning display terms and sort keys using buf 8 and buf16 */
200 for( i = 0; i < src_list_len; i++)
/* NOTE(review): no check of the malloc result is visible in this excerpt. */
203 list[i] = (struct icu_termmap *) malloc(sizeof(struct icu_termmap));
/*
 * NOTE(review): strcpy does not bound the copy; disp_term holds
 * MAX_KEY_SIZE bytes, so longer input terms would overflow it.
 */
205 /* copy display term */
206 strcpy(list[i]->disp_term, src_list[i]);
208 /* transforming to UTF16 */
209 icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status);
210 icu_check_status(status);
212 /* computing sortkeys */
213 icu_sortkey8_from_utf16(coll, buf8, buf16, &status);
214 icu_check_status(status);
/*
 * NOTE(review): utf8_len bytes are copied without comparing against
 * MAX_KEY_SIZE. icu_termmap_cmp later reads sort_key with strcmp, so
 * the copy presumably has to include a terminating NUL -- confirm that
 * utf8_len covers it.
 */
216 /* assigning sortkeys */
217 memcpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
/* Sort the pointer array in place by sort key. */
222 qsort(list, src_list_len,
223 sizeof(struct icu_termmap *), icu_termmap_cmp);
225 /* checking correct sorting */
226 for (i = 0; i < src_list_len; i++){
227 if (0 != strcmp(list[i]->disp_term, chk_list[i])){
/*
 * Diagnostic dump. NOTE(review): list has already been sorted by the
 * qsort call above, so this "Input str" loop prints the same sequence
 * as the "ICU sort" loop that follows; src_list[i] was presumably
 * intended here.
 */
234 printf("Input str: '%s' : ", locale);
235 for (i = 0; i < src_list_len; i++) {
236 printf(" '%s'", list[i]->disp_term);
239 printf("ICU sort: '%s' : ", locale);
240 for (i = 0; i < src_list_len; i++) {
241 printf(" '%s'", list[i]->disp_term);
244 printf("Expected: '%s' : ", locale);
245 for (i = 0; i < src_list_len; i++) {
246 printf(" '%s'", chk_list[i]);
/*
 * Cleanup loop over all entries; its body is not visible in this
 * excerpt (presumably it frees each list[i] -- confirm). No call that
 * closes coll is visible either.
 */
253 for( i = 0; i < src_list_len; i++)
259 icu_buf_utf8_destroy(buf8);
260 icu_buf_utf16_destroy(buf16);
266 /* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
/*
 * Collation test suite: for each language a shuffled list of
 * single-letter terms is sorted by test_icu_sortmap under several
 * locale variants and compared with a hand-written expected order.
 * The declarations of en_1_len, da_1_len and de_1_len are not visible
 * in this excerpt; they presumably equal the array sizes 6, 6 and 9.
 * No use of argc or argv is visible in this excerpt.
 */
268 void test_icu_I18N_sortmap(int argc, char **argv)
271 /* successful tests */
/* English: expected order puts each lower-case letter before its upper-case form. */
273 const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"};
274 const char * en_1_cck[6] = {"a", "A", "k", "K", "z", "Z"};
275 YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck));
276 YAZ_CHECK(test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck));
277 YAZ_CHECK(test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck));
278 YAZ_CHECK(test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck));
279 YAZ_CHECK(test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck));
281 /* successful tests */
/* Danish: expected order places the three extra letters after z. */
284 const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"};
285 const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"};
286 YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
287 YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));
289 /* successful tests */
/* German: expected order places each umlaut right after its base letter and sharp s after s. */
292 const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"};
293 const char * de_1_cck[9] = {"a","ä", "o", "ö", "s", "ß", "t", "u", "ü"};
294 YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck));
295 YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
296 YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));
302 /* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
/*
 * Runs one normalization check: builds a normalizer from rules8cstr,
 * applies it to src8cstr (via UTF-16) and compares the UTF-8 result
 * with chk8cstr.
 *
 * rules8cstr - rule text handed to icu_normalizer_create
 * src8cstr   - input text
 * chk8cstr   - expected text after normalization
 *
 * The returned value is not visible in this excerpt; callers wrap the
 * call in YAZ_CHECK, so presumably non-zero means the strings matched.
 */
307 int test_icu_normalizer(const char * rules8cstr,
308 const char * src8cstr,
309 const char * chk8cstr)
313 UErrorCode status = U_ZERO_ERROR;
315 struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
316 struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0);
317 struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0);
/*
 * NOTE(review): the meaning of the second argument is not shown in
 * this file -- confirm against icu_normalizer_create.
 */
318 struct icu_normalizer * normalizer
319 = icu_normalizer_create(rules8cstr, 'f', &status);
320 icu_check_status(status);
/* UTF-8 input to UTF-16, normalize, then back to UTF-8 for comparison. */
322 icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
323 icu_check_status(status);
325 icu_normalizer_normalize(normalizer, dest16, src16, &status);
326 icu_check_status(status);
328 icu_utf16_to_utf8(dest8, dest16, &status);
329 icu_check_status(status);
/* Byte-wise equality of normalized and expected text decides the outcome. */
332 if(!strcmp((const char *) dest8->utf8,
333 (const char *) chk8cstr))
/*
 * Diagnostic dump of rules, input, actual and expected text. The
 * condition guarding this output is not visible in this excerpt.
 */
337 printf("Normalization\n");
338 printf("Rules: '%s'\n", rules8cstr);
339 printf("Input: '%s'\n", src8cstr);
340 printf("Normalized: '%s'\n", dest8->utf8);
341 printf("Expected: '%s'\n", chk8cstr);
/* Release the normalizer and all conversion buffers. */
345 icu_normalizer_destroy(normalizer);
346 icu_buf_utf16_destroy(src16);
347 icu_buf_utf16_destroy(dest16);
348 icu_buf_utf8_destroy(dest8);
354 /* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
/*
 * Normalization test suite: each YAZ_CHECK passes one rule string to
 * test_icu_normalizer. The visible rule strings remove punctuation,
 * control characters, decimal numbers, non-letters (after lower-casing),
 * non-numbers, whitespace plus punctuation (after lower-casing), and
 * nonspacing marks between an NFD and an NFC step.
 * Most input and expected-output arguments are not visible in this
 * excerpt; only the last case shows both, where accented French text is
 * expected to come out with the accents removed.
 * No use of argc or argv is visible in this excerpt.
 */
356 void test_icu_I18N_normalizer(int argc, char **argv)
359 YAZ_CHECK(test_icu_normalizer("[:Punctuation:] Any-Remove",
363 YAZ_CHECK(test_icu_normalizer("[:Control:] Any-Remove",
367 YAZ_CHECK(test_icu_normalizer("[:Decimal_Number:] Any-Remove",
371 YAZ_CHECK(test_icu_normalizer("Lower; [:^Letter:] Remove",
375 YAZ_CHECK(test_icu_normalizer("[:^Number:] Remove",
376 "Monday 15th of April",
379 YAZ_CHECK(test_icu_normalizer("Lower;"
380 "[[:WhiteSpace:][:Punctuation:]] Remove",
385 YAZ_CHECK(test_icu_normalizer("NFD; [:Nonspacing Mark:] Remove; NFC",
386 "à côté de l'alcôve ovoïde",
387 "a cote de l'alcove ovoide"));
392 /* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
/*
 * Runs one tokenizer check: tokenizes src8cstr with a tokenizer built
 * for the given locale and action, then compares the tokenizer token
 * count with the expected count.
 *
 * locale   - locale name passed to icu_tokenizer_create
 * action   - single-character selector passed to icu_tokenizer_create;
 *            callers in this file pass s, l, w and c (presumably
 *            sentence, line, word and character breaking -- confirm
 *            against the icu_tokenizer_create implementation)
 * src8cstr - input text
 * count    - expected number of tokens
 *
 * The returned value is not visible in this excerpt; callers wrap the
 * call in YAZ_CHECK, so presumably non-zero means the count matched.
 */
394 int test_icu_tokenizer(const char * locale, char action,
395 const char * src8cstr, int count)
399 UErrorCode status = U_ZERO_ERROR;
400 struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
401 struct icu_buf_utf16 * tkn16 = icu_buf_utf16_create(0);
402 struct icu_buf_utf8 * tkn8 = icu_buf_utf8_create(0);
403 struct icu_tokenizer * tokenizer = 0;
405 /* transforming to UTF16 */
406 icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
407 icu_check_status(status);
409 /* set up tokenizer */
410 tokenizer = icu_tokenizer_create(locale, action, &status);
411 icu_check_status(status);
412 YAZ_CHECK(tokenizer);
/* After attaching, the bi member of the tokenizer must be non-null. */
414 /* attach text buffer to tokenizer */
415 icu_tokenizer_attach(tokenizer, src16, &status);
416 icu_check_status(status);
417 YAZ_CHECK(tokenizer->bi);
/*
 * Drain all tokens; each one is converted to UTF-8 into tkn8.
 * NOTE(review): no status check after icu_utf16_to_utf8 is visible in
 * this excerpt, unlike the other conversion calls in this function.
 */
419 /* perform work on tokens */
420 while(icu_tokenizer_next_token(tokenizer, tkn16, &status)){
421 icu_check_status(status);
423 /* converting to UTF8 */
424 icu_utf16_to_utf8(tkn8, tkn16, &status);
/* On a count mismatch, print locale, action, input and both counts. */
427 if (count != icu_tokenizer_token_count(tokenizer)){
429 printf("\nTokenizer '%s:%c' Error: \n", locale, action);
430 printf("Input: '%s'\n", src8cstr);
431 printf("Tokens: %d", icu_tokenizer_token_count(tokenizer));
432 printf(", expected: %d\n", count);
/* Release the tokenizer and all buffers. */
435 icu_tokenizer_destroy(tokenizer);
436 icu_buf_utf16_destroy(src16);
437 icu_buf_utf16_destroy(tkn16);
438 icu_buf_utf8_destroy(tkn8);
444 /* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
/*
 * Tokenizer test suite: runs test_icu_tokenizer over one English and one
 * Danish text with each of the actions s, l, w and c and a hand-counted
 * expected token total. The declarations that the two string literals
 * below initialize are not visible in this excerpt; the calls refer to
 * them as en_str and da_str.
 * No use of argc or argv is visible in this excerpt.
 */
446 void test_icu_I18N_tokenizer(int argc, char **argv)
451 = "O Romeo, Romeo! wherefore art thou Romeo?";
453 YAZ_CHECK(test_icu_tokenizer("en", 's', en_str, 2));
454 YAZ_CHECK(test_icu_tokenizer("en", 'l', en_str, 7));
455 YAZ_CHECK(test_icu_tokenizer("en", 'w', en_str, 16));
456 YAZ_CHECK(test_icu_tokenizer("en", 'c', en_str, 41));
462 = "Blåbærtærte. Denne kage stammer fra Finland. "
463 "Den er med blåbær, men alle sommerens forskellige bær kan bruges.";
/*
 * NOTE(review): the second call below passes locale "dar" while its
 * three neighbours pass "da" -- confirm whether this is a typo or a
 * deliberate probe of an unknown locale.
 */
465 YAZ_CHECK(test_icu_tokenizer("da", 's', da_str, 3));
466 YAZ_CHECK(test_icu_tokenizer("dar", 'l', da_str, 17));
467 YAZ_CHECK(test_icu_tokenizer("da", 'w', da_str, 37));
468 YAZ_CHECK(test_icu_tokenizer("da", 'c', da_str, 110));
/*
 * Chain test: builds an ICU chain from an inline XML configuration,
 * feeds it two texts one after the other and checks the token number
 * reported after each text has been fully consumed.
 * The visible configuration steps are: remove control characters,
 * tokenize with rule "l", remove whitespace and punctuation, casemap
 * with rule "l". Some lines of the XML literal, including its end, are
 * not visible in this excerpt.
 * No use of argc or argv is visible in this excerpt.
 */
474 void test_icu_I18N_chain(int argc, char **argv)
/* First input; note the embedded tab character before the last word. */
477 = "O Romeo, Romeo! wherefore art thou\t Romeo?";
479 UErrorCode status = U_ZERO_ERROR;
480 struct icu_chain * chain = 0;
483 const char * xml_str = "<icu locale=\"en\">"
484 "<transform rule=\"[:Control:] Any-Remove\"/>"
485 "<tokenize rule=\"l\"/>"
486 "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
488 "<casemap rule=\"l\"/>"
/*
 * Parse the configuration and hand its root element to the chain
 * factory. NOTE(review): no call releasing doc is visible in this
 * excerpt.
 */
492 xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
493 xmlNode *xml_node = xmlDocGetRootElement(doc);
496 chain = icu_chain_xml_config(xml_node, 0, &status);
501 YAZ_CHECK(icu_chain_assign_cstr(chain, en_str, &status));
/* Consume every token; the loop body only holds a disabled debug print. */
503 while (icu_chain_next_token(chain, &status)){
505 /* printf("%d '%s' '%s'\n",
506 icu_chain_token_number(chain),
507 icu_chain_token_norm(chain),
508 icu_chain_token_display(chain)); */
/* The first text contains seven words. */
511 YAZ_CHECK_EQ(icu_chain_token_number(chain), 7);
/* Re-use the same chain for a second text. */
514 YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status));
516 while (icu_chain_next_token(chain, &status)){
518 /* printf("%d '%s' '%s'\n",
519 icu_chain_token_number(chain),
520 icu_chain_token_norm(chain),
521 icu_chain_token_display(chain)); */
/* The second text contains three words. */
525 YAZ_CHECK_EQ(icu_chain_token_number(chain), 3);
527 icu_chain_destroy(chain);
/*
 * Regression test named after bug 1140: builds a chain whose first
 * visible rule is a transform that removes control characters, then
 * runs the same two texts and token-number expectations as
 * test_icu_I18N_chain. Some lines of the XML literal, including its
 * end, are not visible in this excerpt.
 */
531 void test_bug_1140(void)
533 UErrorCode status = U_ZERO_ERROR;
534 struct icu_chain * chain = 0;
536 const char * xml_str = "<icu locale=\"en\">"
538 /* if the first rule is normalize instead. Then it works */
540 "<transform rule=\"[:Control:] Any-Remove\"/>"
542 "<tokenize rule=\"l\"/>"
543 "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
545 "<casemap rule=\"l\"/>"
/*
 * Parse the configuration and hand its root element to the chain
 * factory. NOTE(review): no call releasing doc is visible in this
 * excerpt.
 */
549 xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
550 xmlNode *xml_node = xmlDocGetRootElement(doc);
553 chain = icu_chain_xml_config(xml_node, 0, &status);
558 YAZ_CHECK(icu_chain_assign_cstr(
559 chain, "O Romeo, Romeo! wherefore art thou\t Romeo?",
/* Consume every token; the loop body only holds a disabled debug print. */
562 while (icu_chain_next_token(chain, &status)){
564 /* printf("%d '%s' '%s'\n",
565 icu_chain_token_number(chain),
566 icu_chain_token_norm(chain),
567 icu_chain_token_display(chain)); */
/* The first text contains seven words. */
572 YAZ_CHECK_EQ(icu_chain_token_number(chain), 7);
574 YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status));
576 while (icu_chain_next_token(chain, &status)){
578 /* printf("%d '%s' '%s'\n",
579 icu_chain_token_number(chain),
580 icu_chain_token_norm(chain),
581 icu_chain_token_display(chain)); */
584 /* we expect 'what' 'is' 'this', i.e. 3 tokens */
585 YAZ_CHECK_EQ(icu_chain_token_number(chain), 3);
587 icu_chain_destroy(chain);
/*
 * Chain test for tokens that end up empty: the configuration tokenizes
 * with rule "w" and then removes whitespace and punctuation from each
 * token. The input has eight words separated by seven single spaces and
 * the test expects a token number of 15, so presumably the seven
 * whitespace tokens are still counted although the transform leaves
 * them empty -- confirm against the chain implementation.
 * The end of the XML literal is not visible in this excerpt.
 */
592 void test_chain_empty_token(void)
594 UErrorCode status = U_ZERO_ERROR;
595 struct icu_chain * chain = 0;
597 const char * xml_str = "<icu locale=\"en\">"
598 "<tokenize rule=\"w\"/>"
599 "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
/*
 * Parse the configuration and hand its root element to the chain
 * factory. NOTE(review): no call releasing doc is visible in this
 * excerpt.
 */
602 xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
603 xmlNode *xml_node = xmlDocGetRootElement(doc);
606 chain = icu_chain_xml_config(xml_node, 0, &status);
611 YAZ_CHECK(icu_chain_assign_cstr(
612 chain, "a string with 15 tokenss and 8 displays",
/* Consume every token; the loop body only holds a disabled debug print. */
615 while (icu_chain_next_token(chain, &status)){
617 /* printf("%d '%s' '%s'\n",
618 icu_chain_token_number(chain),
619 icu_chain_token_norm(chain),
620 icu_chain_token_display(chain)); */
623 YAZ_CHECK_EQ(icu_chain_token_number(chain), 15);
625 icu_chain_destroy(chain);
/*
 * Chain test for a configuration without any visible rule elements:
 * the test expects the whole input to come back as exactly one token
 * whose normalized text is byte-identical to the input.
 * The end of the XML literal, the arguments of the assign call and the
 * declaration of dest8 are not visible in this excerpt.
 */
628 void test_chain_empty_chain(void)
630 UErrorCode status = U_ZERO_ERROR;
631 struct icu_chain * chain = 0;
633 const char * xml_str = "<icu locale=\"en\">"
/* Input mixing letters, digits, punctuation, a non-ASCII symbol and mixed case. */
636 const char * src8 = "some 5487 weired !¤%&(/& sTuFf";
/*
 * Parse the configuration and hand its root element to the chain
 * factory. NOTE(review): no call releasing doc is visible in this
 * excerpt.
 */
639 xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
640 xmlNode *xml_node = xmlDocGetRootElement(doc);
643 chain = icu_chain_xml_config(xml_node, 0, &status);
648 YAZ_CHECK(icu_chain_assign_cstr(
/* Consume every token; the loop body only holds a disabled debug print. */
652 while (icu_chain_next_token(chain, &status)){
654 /* printf("%d '%s' '%s'\n",
655 icu_chain_token_number(chain),
656 icu_chain_token_norm(chain),
657 icu_chain_token_display(chain)); */
660 YAZ_CHECK_EQ(icu_chain_token_number(chain), 1);
/* The normalized form of the single token must equal the input text. */
662 dest8 = (char *) icu_chain_token_norm(chain);
663 YAZ_CHECK_EQ(strcmp(src8, dest8), 0);
666 icu_chain_destroy(chain);
669 #endif /* YAZ_HAVE_ICU */
671 /* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
/*
 * Test driver entry point: initializes the YAZ check framework with the
 * command-line arguments and runs the test groups in a fixed order.
 * The preprocessor lines below show an alternative branch that only
 * prints a notice; per the trailing comments it applies when
 * YAZ_HAVE_ICU is not defined (the opening conditional is not visible
 * in this excerpt).
 */
673 int main(int argc, char **argv)
676 YAZ_CHECK_INIT(argc, argv);
/*
 * NOTE(review): test_bug_1140 is defined in this file but is not among
 * the calls visible here -- confirm whether that is intentional.
 */
681 test_icu_I18N_casemap(argc, argv);
682 test_icu_I18N_sortmap(argc, argv);
683 test_icu_I18N_normalizer(argc, argv);
684 test_icu_I18N_tokenizer(argc, argv);
685 test_icu_I18N_chain(argc, argv);
686 test_chain_empty_token();
687 test_chain_empty_chain();
690 #else /* YAZ_HAVE_ICU */
692 printf("ICU unit tests omitted.\n"
693 "Please install libicu36-dev and icu-doc or similar\n");
696 #endif /* YAZ_HAVE_ICU */
702 /* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
709 * indent-tabs-mode: nil
711 * vim: shiftwidth=4 tabstop=8 expandtab