1 /* $Id: test_icu_I18N.c,v 1.9 2007-05-07 12:18:34 marc Exp $
2 Copyright (c) 2006-2007, Index Data.
4 This file is part of Pazpar2.
6 Pazpar2 is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with Pazpar2; see the file LICENSE. If not, write to the
18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
22 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
31 #include <yaz/timing.h>
44 #include <unicode/ustring.h>
45 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
// Capacity, in bytes, of the fixed-size sort_key and disp_term arrays below.
48 #define MAX_KEY_SIZE 256
// NOTE(review): the two fields below are presumably members of the local
// struct icu_termmap (the struct header line is not visible here) - confirm.
// sort_key is what icu_termmap_cmp() compares with strcmp().
52 uint8_t sort_key[MAX_KEY_SIZE]; // standard C string '\0' terminated
// disp_term is the term text printed by the diagnostic output of the sort test.
53 char disp_term[MAX_KEY_SIZE]; // standard C utf-8 string
// Comparator passed to qsort() by the sort tests in this file.
//   vp1, vp2 - each points to an array element of type `struct icu_termmap *`
// The two entries are ordered by strcmp() on their sort_key bytes; the result
// is stored in cmp (presumably returned - the return line is not visible here).
58 int icu_termmap_cmp(const void *vp1, const void *vp2)
// qsort hands over pointers to the array elements, and the elements are
// themselves pointers, hence the cast to pointer-to-pointer and dereference
60 struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1;
61 struct icu_termmap *itmp2 = *(struct icu_termmap **) vp2;
// the keys are treated as NUL-terminated byte strings
65 cmp = strcmp((const char *)itmp1->sort_key,
66 (const char *)itmp2->sort_key);
// Case-maps the UTF-16 text of src16 into dest16 with one of the ICU calls
// u_strToLower, u_strToUpper, u_strToTitle or u_strFoldCase.
//   dest16 - output buffer; its utf16, utf16_cap and utf16_len members are
//            used, and it is enlarged with icu_buf_utf16_resize() on overflow
//   src16  - input text; its utf16 and utf16_len members are read
//   locale - locale name (the tests in this file pass "en", "da" and "de")
//   action - selects the mapping; the tests in this file pass 'l', 'u', 't'
//            and 'f' (the dispatching switch/case lines are not visible here)
// Errors are reported through *status.
// NOTE(review): `return U_UNSUPPORTED_ERROR` is presumably the fallback for an
// action that matches none of the mappings - confirm against the full source.
72 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
73 struct icu_buf_utf16 * src16,
74 const char *locale, char action,
77 int32_t dest16_len = 0;
// first attempt, using whatever capacity dest16 currently has
81 dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
82 src16->utf16, src16->utf16_len,
86 dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
87 src16->utf16, src16->utf16_len,
91 dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
92 src16->utf16, src16->utf16_len,
96 dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
97 src16->utf16, src16->utf16_len,
98 U_FOLD_CASE_DEFAULT, status);
102 return U_UNSUPPORTED_ERROR;
// On overflow the buffer is grown to twice the length returned by the first
// attempt, the error is cleared, and the same mapping is run a second time.
106 // check for buffer overflow, resize and retry
107 if (*status == U_BUFFER_OVERFLOW_ERROR
108 //|| dest16_len > dest16->utf16_cap
110 icu_buf_utf16_resize(dest16, dest16_len * 2);
111 *status = U_ZERO_ERROR;
// second attempt with the enlarged buffer
116 dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
117 src16->utf16, src16->utf16_len,
121 dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
122 src16->utf16, src16->utf16_len,
126 dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
127 src16->utf16, src16->utf16_len,
131 dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
132 src16->utf16, src16->utf16_len,
133 U_FOLD_CASE_DEFAULT, status);
137 return U_UNSUPPORTED_ERROR;
// The new length is accepted only on success and only when it is strictly
// below the capacity.
// NOTE(review): the strict '<' presumably keeps room for a terminator - confirm.
142 if (U_SUCCESS(*status)
143 && dest16_len < dest16->utf16_cap)
144 dest16->utf16_len = dest16_len;
// NOTE(review): the two lines below presumably form the else branch (the
// `else` line is not visible here): dest16 is reset to an empty string.
146 dest16->utf16[0] = (UChar) 0;
147 dest16->utf16_len = 0;
// Test helper (icu_buf based variant): converts src8cstr to UTF-16, applies
// icu_utf16_casemap() with the given locale and action, converts the result
// back to UTF-8 and compares it with chk8cstr.
//   locale   - locale name forwarded to icu_utf16_casemap()
//   action   - mapping selector forwarded to icu_utf16_casemap()
//   src8cstr - UTF-8 input, NUL-terminated
//   chk8cstr - expected UTF-8 output, NUL-terminated
// NOTE(review): a second test_icu_casemap() is defined further down; presumably
// only one of the two is compiled (preprocessor lines are not visible here).
155 int test_icu_casemap(const char * locale, char action,
156 const char * src8cstr, const char * chk8cstr)
159 UErrorCode status = U_ZERO_ERROR;
// NOTE(review): src8 is created and destroyed but not otherwise used in the
// lines visible here.
161 struct icu_buf_utf8 * src8 = icu_buf_utf8_create(0);
162 struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0);
163 struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
164 struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0);
// byte lengths, used by the diagnostic output below
167 int src8cstr_len = strlen(src8cstr);
168 int chk8cstr_len = strlen(chk8cstr);
170 // converting to UTF16
171 icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
173 // perform case mapping
174 icu_utf16_casemap(dest16, src16, locale, action, &status);
176 // converting to UTF8
177 icu_utf16_to_utf8(dest8, dest16, &status);
// The result matches when both the byte length and the bytes agree.
// NOTE(review): strlen(chk8cstr) is recomputed here although chk8cstr_len
// already holds it.
183 && (dest8->utf8_len == strlen(chk8cstr))
184 && !strcmp(chk8cstr, (const char *) dest8->utf8))
// diagnostic dump of input, actual result and expected result
// (presumably printed only on mismatch - the guarding line is not visible)
192 printf("original string: '%s' (%d)\n", src8cstr, src8cstr_len);
193 printf("icu_casemap '%s:%c' '%s' (%d)\n",
194 locale, action, dest8->utf8, dest8->utf8_len);
195 printf("expected string: '%s' (%d)\n", chk8cstr, chk8cstr_len);
// release all four buffers created above
199 icu_buf_utf8_destroy(src8);
200 icu_buf_utf8_destroy(dest8);
201 icu_buf_utf16_destroy(src16);
202 icu_buf_utf16_destroy(dest16);
// Test helper (NMEM based variant): runs icu_casemap() on src8 with the given
// locale and action and compares the returned string with check8.
//   locale - locale name forwarded to icu_casemap()
//   action - mapping selector forwarded to icu_casemap()
//   src8   - UTF-8 input, NUL-terminated
//   check8 - expected UTF-8 output, NUL-terminated
// NOTE(review): shares its name with the icu_buf based variant above;
// presumably only one of the two is compiled (preprocessor lines not visible).
212 int test_icu_casemap(const char * locale, char action,
213 const char * src8, const char * check8)
215 NMEM nmem = nmem_create();
// capacity handed to icu_casemap() together with buf (buf itself is declared
// on a line that is not visible here)
216 size_t buf_cap = 128;
218 const char * dest8 = 0;
219 size_t dest8_len = 0;
220 //size_t src8_len = strlen(src8);
223 //printf("original string: '%s' (%d)\n", src8, (int) src8_len);
225 //these shall succeed
226 dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len,
227 src8, locale, action);
230 //printf("icu_casemap '%s:%c' '%s' (%d)\n",
231 // locale, action, dest8, (int) dest8_len);
// the result matches when both the reported length and the bytes agree
234 && (dest8_len == strlen(check8))
235 && !strcmp(check8, dest8))
246 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
// Runs test_icu_casemap() for the locales "en", "da" and "de", each with the
// actions 'l', 'u', 'f' and 't', and records every result with YAZ_CHECK.
// Judging by the expected strings: 'l' yields all-lowercase text, 'u'
// all-uppercase text and 't' text with each word capitalised; 'f' yields the
// same as 'l' except that German "ß" is expected as "ss".
248 void test_icu_I18N_casemap(int argc, char **argv)
// locale "en": plain ASCII text
254 YAZ_CHECK(test_icu_casemap("en", 'l',
255 "A ReD fOx hunTS sQUirriLs",
256 "a red fox hunts squirrils"));
258 YAZ_CHECK(test_icu_casemap("en", 'u',
259 "A ReD fOx hunTS sQUirriLs",
260 "A RED FOX HUNTS SQUIRRILS"));
262 YAZ_CHECK(test_icu_casemap("en", 'f',
263 "A ReD fOx hunTS sQUirriLs",
264 "a red fox hunts squirrils"));
266 YAZ_CHECK(test_icu_casemap("en", 't',
267 "A ReD fOx hunTS sQUirriLs",
268 "A Red Fox Hunts Squirrils"));
// locale "da": text containing the Danish letters æ, ø and å
274 YAZ_CHECK(test_icu_casemap("da", 'l',
275 "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
276 "åh æble, øs fløde i åen efter blåbærgrøden"));
278 YAZ_CHECK(test_icu_casemap("da", 'u',
279 "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
280 "ÅH ÆBLE, ØS FLØDE I ÅEN EFTER BLÅBÆRGRØDEN"));
282 YAZ_CHECK(test_icu_casemap("da", 'f',
283 "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
284 "åh æble, øs fløde i åen efter blåbærgrøden"));
286 YAZ_CHECK(test_icu_casemap("da", 't',
287 "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
288 "Åh Æble, Øs Fløde I Åen Efter Blåbærgrøden"));
// locale "de": text containing umlauts and ß; ß is expected unchanged for 'l'
// and 't', as "SS" for 'u' and as "ss" for 'f'
293 YAZ_CHECK(test_icu_casemap("de", 'l',
294 "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
295 "zwölf ärgerliche würste rollen über die straße"));
297 YAZ_CHECK(test_icu_casemap("de", 'u',
298 "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
299 "ZWÖLF ÄRGERLICHE WÜRSTE ROLLEN ÜBER DIE STRASSE"));
301 YAZ_CHECK(test_icu_casemap("de", 'f',
302 "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
303 "zwölf ärgerliche würste rollen über die strasse"));
305 YAZ_CHECK(test_icu_casemap("de", 't',
306 "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
307 "Zwölf Ärgerliche Würste Rollen Über Die Straße"));
314 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
// Calls icu_casemap() in ways that are expected to fail and checks with
// YAZ_CHECK that the reported length is 0; one call with an unknown locale is
// expected to succeed. The call to this function in main() is commented out.
// NOTE(review): the trailing arguments of several icu_casemap() calls are on
// lines that are not visible here.
316 void test_icu_I18N_casemap_failures(int argc, char **argv)
319 size_t buf_cap = 128;
321 size_t dest8_len = 0;
322 NMEM nmem = nmem_create();
325 const char * src8 = "A ReD fOx hunTS sQUirriLs";
326 //size_t src8_len = strlen(src8);
328 //printf("original string: '%s' (%d)\n", src8, (int) src8_len);
330 // some calling error needs investigation
331 dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len,
333 YAZ_CHECK(0 == dest8_len);
334 //printf("icu_casemap 'en:t' '%s' (%d)\n", dest8, (int) dest8_len);
337 // attention: does not fail even if no locale 'zz_abc' is defined;
338 // it seems to default to the English locale
339 dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len,
340 src8, "zz_abc", 'l');
// a non-zero length is expected here, i.e. the call is expected to succeed
341 YAZ_CHECK(dest8_len);
342 //printf("icu_casemap 'zz:l' '%s' (%d)\n", dest8, (int) dest8_len);
345 // shall fail - no buf buffer defined
346 dest8 = icu_casemap(nmem, 0, buf_cap, &dest8_len,
348 YAZ_CHECK(0 == dest8_len);
349 //printf("icu_casemap 'en:l' '%s' (%d)\n", dest8, (int) dest8_len);
351 // shall fail - no buf_cap defined
352 dest8 = icu_casemap(nmem, buf, 0, &dest8_len,
354 YAZ_CHECK(0 == dest8_len);
355 //printf("icu_casemap 'en:l' '%s' (%d)\n", dest8, (int) dest8_len);
357 // shall fail - no action 'x' defined
358 dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len,
360 YAZ_CHECK(0 == dest8_len);
361 //printf("icu_casemap 'en:x' '%s' (%d)\n", dest8, (int) dest8_len);
370 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
// Test helper (NMEM based variant): builds one icu_termmap per input term,
// derives its sort key with icu_sortmap() for the given locale, sorts the
// entries with qsort()/icu_termmap_cmp() and compares the resulting order
// with check8_list.
//   locale      - locale name forwarded to icu_sortmap()
//   list_len    - number of entries in src8_list and check8_list
//   src8_list   - input terms
//   check8_list - the same terms in the expected order
// NOTE(review): shares its name with the icu_buf based variant below;
// presumably only one of the two is compiled (preprocessor lines not visible).
373 int test_icu_sortmap(const char * locale, size_t list_len,
374 const char ** src8_list, const char ** check8_list)
381 NMEM nmem = nmem_create();
382 size_t buf_cap = 128;
// array of list_len entry pointers, allocated from nmem
384 struct icu_termmap ** dest8_list
385 = nmem_malloc(nmem, sizeof(struct icu_termmap *) * list_len);
386 //size_t dest8_len = 0;
387 //size_t src8_len = strlen(src8);
389 // initializing icu_termmap
390 for (i = 0; i < list_len; i++){
391 dest8_list[i] = icu_termmap_create(nmem);
// both term fields start out as copies of the input term
392 dest8_list[i]->norm_term = nmem_strdup(nmem, src8_list[i]);
393 dest8_list[i]->disp_term = nmem_strdup(nmem, src8_list[i]);
394 //dest8_list[i]->sort_key = nmem_strdup(nmem, src8_list[i]);
395 //dest8_list[i]->sort_len = strlen(src8_list[i]);
// 0 is passed in the position where the commented-out alternative below
// passes a pointer to sort_len
396 dest8_list[i]->sort_key
397 = icu_sortmap(nmem, buf, buf_cap, 0, src8_list[i], locale);
398 // = icu_sortmap(nmem, buf, buf_cap, &(dest8_list[i]->sort_len),
399 // src8_list[i], locale);
// order the entry pointers by their sort keys
403 qsort(dest8_list, list_len,
404 sizeof(struct icu_termmap *), icu_termmap_cmp);
406 // checking correct sorting
407 for (i = 0; i < list_len; i++){
408 if (0 != strcmp(dest8_list[i]->disp_term, check8_list[i])){
// diagnostic dump: original input order, order after sorting, expected order
// (presumably printed only on mismatch - the guarding line is not visible)
415 printf("Input '%s':", locale);
416 for (i = 0; i < list_len; i++)
417 printf(" '%s'", src8_list[i]);
419 printf("ICU sort '%s':", locale);
420 for (i = 0; i < list_len; i++)
421 printf(" '%s'", dest8_list[i]->disp_term);
427 printf("Expected '%s':", locale);
428 for (i = 0; i < list_len; i++)
429 printf(" '%s'", check8_list[i]);
// Test helper (icu_buf based variant): opens an ICU collator for the locale,
// builds one icu_termmap per input term with a sort key computed by
// icu_sortkey8_from_utf16(), sorts the entries with qsort()/icu_termmap_cmp()
// and compares the resulting order with chk_list.
//   locale       - locale name passed to ucol_open()
//   src_list_len - number of entries in src_list and chk_list
//   src_list     - input terms
//   chk_list     - the same terms in the expected order
// NOTE(review): shares its name with the NMEM based variant above; presumably
// only one of the two is compiled (preprocessor lines not visible).
440 int test_icu_sortmap(const char * locale, int src_list_len,
441 const char ** src_list, const char ** chk_list)
445 UErrorCode status = U_ZERO_ERROR;
// scratch buffers reused for every term in the loop below
447 struct icu_buf_utf8 * buf8 = icu_buf_utf8_create(0);
448 struct icu_buf_utf16 * buf16 = icu_buf_utf16_create(0);
// variable-length array, one entry pointer per input term
452 struct icu_termmap * list[src_list_len];
454 UCollator *coll = ucol_open(locale, &status);
455 icu_check_status(status);
457 if(!U_SUCCESS(status))
460 // assigning display terms and sort keys using buf 8 and buf16
461 for( i = 0; i < src_list_len; i++)
// NOTE(review): the malloc result is not checked in the lines visible here.
464 list[i] = (struct icu_termmap *) malloc(sizeof(struct icu_termmap));
// NOTE(review): unbounded copy; if disp_term is the MAX_KEY_SIZE array
// declared near the top of this file, this is safe only while every term fits.
// The terms passed by test_icu_I18N_sortmap() are single characters.
467 strcpy(list[i]->disp_term, src_list[i]);
469 // transforming to UTF16
470 icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status);
471 icu_check_status(status);
473 // computing sortkeys
474 icu_sortkey8_from_utf16(coll, buf8, buf16, &status);
475 icu_check_status(status);
// NOTE(review): utf8_len bytes are copied without a check against the size of
// sort_key, and icu_termmap_cmp() later reads the key with strcmp(), so the
// copied bytes presumably include a terminating NUL - confirm.
477 // assigning sortkeys
478 memcpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
479 //strncpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
480 //strcpy((char *) list[i]->sort_key, (const char *) buf8->utf8);
// order the entry pointers by their sort keys
485 qsort(list, src_list_len,
486 sizeof(struct icu_termmap *), icu_termmap_cmp);
488 // checking correct sorting
489 for (i = 0; i < src_list_len; i++){
490 if (0 != strcmp(list[i]->disp_term, chk_list[i])){
// diagnostic dump (presumably printed only on mismatch - the guarding line is
// not visible here)
// NOTE(review): list has already been sorted by the qsort() call above, so
// this "Input str" loop prints the same order as the "ICU sort" loop below;
// presumably src_list[i] was intended - confirm.
497 printf("Input str: '%s' : ", locale);
498 for (i = 0; i < src_list_len; i++) {
499 printf(" '%s'", list[i]->disp_term);
502 printf("ICU sort: '%s' : ", locale);
503 for (i = 0; i < src_list_len; i++) {
504 printf(" '%s'", list[i]->disp_term);
505 //printf("(%d|%d)", list[i]->sort_key[0],list[i]->sort_key[1]);
508 printf("Expected: '%s' : ", locale);
509 for (i = 0; i < src_list_len; i++) {
510 printf(" '%s'", chk_list[i]);
// release the scratch buffers
518 icu_buf_utf8_destroy(buf8);
519 icu_buf_utf16_destroy(buf16);
530 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
// Runs test_icu_sortmap() over small term lists for English, Danish and German
// locales and records every result with YAZ_CHECK. Each *_src array is the
// unsorted input and each *_cck array the expected order; the *_len values
// are declared on lines that are not visible here.
532 void test_icu_I18N_sortmap(int argc, char **argv)
// English: for each letter the lowercase form is expected before the uppercase
537 const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"};
538 const char * en_1_cck[6] = {"a", "A", "k", "K", "z", "Z"};
539 YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck));
540 YAZ_CHECK(test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck));
541 YAZ_CHECK(test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck));
542 YAZ_CHECK(test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck));
543 YAZ_CHECK(test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck));
// Danish: æ, ø and å are expected after z, in that order
547 const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"};
548 const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"};
549 YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
550 YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));
// German: each umlaut is expected directly after its base vowel, ß after s
554 const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"};
555 const char * de_1_cck[9] = {"a","ä", "o", "ö", "s", "ß", "t", "u", "ü"};
556 YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck));
557 YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
558 YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));
565 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
// Test entry point: calls YAZ_CHECK_INIT with the command line, then runs the
// case mapping tests and the sort tests.
567 int main(int argc, char **argv)
570 YAZ_CHECK_INIT(argc, argv);
// the failure-case test is currently disabled
575 //test_icu_I18N_casemap_failures(argc, argv);
576 test_icu_I18N_casemap(argc, argv);
577 test_icu_I18N_sortmap(argc, argv);
// NOTE(review): presumably the branch that is compiled when ICU is not
// available (the preprocessor lines are not visible here) - confirm.
581 printf("ICU unit tests omitted.\n"
582 "Please install libicu36-dev and icu-doc or similar\n");
591 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
596 * indent-tabs-mode: nil
598 * vim: shiftwidth=4 tabstop=8 expandtab