2 * Copyright (C) 1995-2007, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: tsticonv.c,v 1.30 2007-09-17 19:15:22 adam Exp $
17 #include <yaz/yaz-util.h>
/*
 * Compare the bytes a conversion produced (got_buf/got_len) with the bytes
 * that were expected (expect_buf/expect_len).
 *
 * msg and no are only used to label the diagnostic dump: the name of the
 * test and the index of the case within it.
 *
 * NOTE(review): the return statements are not visible in this excerpt.
 * tst_convert_l() returns this value and its callers wrap it in YAZ_CHECK,
 * so presumably non-zero means "buffers match" -- confirm.
 */
20 static int compare_buffers(char *msg, int no,
21 int expect_len, const char *expect_buf,
22 int got_len, const char *got_buf)
/* A match requires both the same length and the same content. */
24 if (expect_len == got_len
25 && !memcmp(expect_buf, got_buf, expect_len))
/* The side-by-side hex dump below is compiled in but disabled by default. */
28 if (0) /* use 1 to see how the buffers differ (for debug purposes) */
31 printf("tsticonv test=%s i=%d failed\n", msg, no);
32 printf("off got exp\n");
/* Walk as far as the longer of the two buffers. */
33 for (i = 0; i<got_len || i<expect_len; i++)
/*
 * Each column shows the byte in hex, or "? " as the alternative.
 * NOTE(review): the buffers are plain char; where char is signed, bytes
 * >= 0x80 are promoted to negative ints and "%02X" then prints more than
 * two digits -- confirm the scratch buffers are large enough.
 */
39 sprintf(got_char, "%02X", got_buf[i]);
41 sprintf(got_char, "? ");
44 sprintf(expect_char, "%02X", expect_buf[i]);
46 sprintf(expect_char, "? ");
/*
 * NOTE(review): the comparison on the next line indexes both buffers with
 * i, although the loop keeps running while i is inside either one, so the
 * shorter buffer may be read past its length -- confirm.
 */
48 printf("%02d %s %s %c\n",
49 i, got_char, expect_char, got_buf[i] == expect_buf[i] ?
/*
 * Convert in_buf with the already opened converter cd and compare the
 * output with expect_buf/expect_len.
 *
 * in_len is the number of input bytes; when it is 0 the length is taken
 * with strlen(in_buf) instead.  The explicit length is what allows inputs
 * and expectations that contain NUL bytes (e.g. UCS4 data).
 *
 * Returns whatever compare_buffers() returns for the collected output.
 */
57 static int tst_convert_l(yaz_iconv_t cd, size_t in_len, const char *in_buf,
58 size_t expect_len, const char *expect_buf)
/* yaz_iconv() takes a non-const input pointer, hence the cast. */
61 char *inbuf= (char*) in_buf;
62 size_t inbytesleft = in_len > 0 ? in_len : strlen(in_buf);
64 char *outbuf = outbuf0;
/* Room still free in outbuf0 behind the current write position. */
68 size_t outbytesleft = outbuf0 + sizeof(outbuf0) - outbuf;
/*
 * NOTE(review): the statement guarded by this test is not visible here;
 * presumably it caps the window offered to yaz_iconv() at 12 bytes so the
 * "output buffer full" path gets exercised -- confirm.
 */
69 if (outbytesleft > 12)
71 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
/* (size_t)-1 signals an error; YAZ_ICONV_E2BIG is treated differently
 * from every other error code. */
72 if (r == (size_t) (-1))
74 int e = yaz_iconv_error(cd);
75 if (e != YAZ_ICONV_E2BIG)
/* Call with no input so the converter can emit any pending output. */
80 yaz_iconv(cd, 0, 0, &outbuf, &outbytesleft);
/* outbuf - outbuf0 is the number of bytes written in total. */
85 return compare_buffers("tsticonv 22", 0,
86 expect_len, expect_buf,
87 outbuf - outbuf0, outbuf0);
/*
 * Convert the NUL-terminated string buf with converter cd, collecting the
 * output piecewise in a WRBUF, and compare the result with the
 * NUL-terminated string cmpbuf.  On the mismatch path the input, the
 * actual output and the expected output are written to the YAZ log.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * callers wrap the result in YAZ_CHECK, so presumably non-zero means
 * success -- confirm.
 */
90 static int tst_convert(yaz_iconv_t cd, const char *buf, const char *cmpbuf)
/* b accumulates everything the converter produces. */
93 WRBUF b = wrbuf_alloc();
95 size_t inbytesleft = strlen(buf);
96 const char *inp = buf;
/* Keep converting while input remains; rounds is an additional upper
 * bound on the number of iterations. */
98 for (rounds = 0; inbytesleft && rounds < sizeof(outbuf); rounds++)
/* Each round offers the whole of outbuf to the converter again. */
100 size_t outbytesleft = sizeof(outbuf);
102 size_t r = yaz_iconv(cd, (char**) &inp, &inbytesleft,
103 &outp, &outbytesleft);
/* Append whatever was produced this round, even if the call failed. */
104 wrbuf_write(b, outbuf, outp - outbuf);
105 if (r == (size_t) (-1))
107 int e = yaz_iconv_error(cd);
108 if (e != YAZ_ICONV_E2BIG)
/* Final call without input lets the converter emit pending output. */
113 size_t outbytesleft = sizeof(outbuf);
115 r = yaz_iconv(cd, 0, 0, &outp, &outbytesleft);
116 wrbuf_write(b, outbuf, outp - outbuf);
/* Success requires the same length and the same bytes as cmpbuf. */
120 if (wrbuf_len(b) == strlen(cmpbuf)
121 && !memcmp(cmpbuf, wrbuf_buf(b), wrbuf_len(b)))
/* Diagnostics: w holds a printable rendering of each buffer in turn. */
125 WRBUF w = wrbuf_alloc();
128 wrbuf_verbose_str(w, buf, strlen(buf));
129 yaz_log(YLOG_LOG, "input %s", wrbuf_cstr(w));
132 wrbuf_verbose_str(w, wrbuf_buf(b), wrbuf_len(b));
133 yaz_log(YLOG_LOG, "got %s", wrbuf_cstr(w));
136 wrbuf_verbose_str(w, cmpbuf, strlen(cmpbuf));
137 yaz_log(YLOG_LOG, "expected %s", wrbuf_cstr(w));
147 /* some test strings in ISO-8859-1 format */
/* dconvert() walks this array until it reaches a null entry.
 * NOTE(review): the initialiser is not visible in this excerpt. */
148 static const char *iso_8859_1_a[] = {
/*
 * MARC8 -> UCS4 conversion cases.  Every expected value is written as
 * four bytes per character, most significant byte first.
 *
 * NOTE(review): several argument lines of the calls below (the converter
 * handle and the length arguments) are not visible in this excerpt.
 */
157 static void tst_marc8_to_ucs4b(void)
159 yaz_iconv_t cd = yaz_iconv_open("UCS4", "MARC8");
/*
 * NOTE(review): "\033$1" appears to select a set in which three bytes
 * encode one character and "\033(B" appears to switch back to ASCII --
 * confirm against the MARC-8 specification.
 */
/* One three-byte character (U+FF1F) followed by ASCII 'o'. */
164 YAZ_CHECK(tst_convert_l(
167 "\033$1" "\x21\x2B\x3B" /* FF1F */ "\033(B" "o",
169 "\x00\x00\xFF\x1F" "\x00\x00\x00o"));
/* Two three-byte characters: U+AE0E and U+C0F4. */
170 YAZ_CHECK(tst_convert_l(
173 "\033$1" "\x6F\x77\x29" /* AE0E */
174 "\x6F\x52\x7C" /* C0F4 */ "\033(B",
176 "\x00\x00\xAE\x0E" "\x00\x00\xC0\xF4"));
/* A run of six three-byte characters; the last expected one is U+6790. */
177 YAZ_CHECK(tst_convert_l(
181 "\x21\x50\x6E" /* UCS 7CFB */
182 "\x21\x51\x31" /* UCS 7D71 */
183 "\x21\x3A\x67" /* UCS 5B89 */
184 "\x21\x33\x22" /* UCS 5168 */
185 "\x21\x33\x53" /* UCS 5206 */
186 "\x21\x44\x2B" /* UCS 6790 */
194 "\x00\x00\x67\x90"));
/* Single high bytes: 0xB0 -> U+02BB, 0xB2 -> U+00F8. */
196 YAZ_CHECK(tst_convert_l(
199 "\xB0\xB2", /* AYN and oSLASH */
201 "\x00\x00\x02\xBB" "\x00\x00\x00\xF8"));
/* The modifier byte 0xF6 precedes 'a' in the input; the expected output
 * has 'a' first and then U+0332. */
202 YAZ_CHECK(tst_convert_l(
205 "\xF6\x61", /* a underscore */
207 "\x00\x00\x00\x61" "\x00\x00\x03\x32"));
/* 'a' followed by 0xC2, expected as U+2117. */
209 YAZ_CHECK(tst_convert_l(
212 "\x61\xC2", /* a, phonorecord mark */
214 "\x00\x00\x00\x61" "\x00\x00\x21\x17"));
/* A word with two 0xE8 modifier bytes; only the tail of the expected
 * value is visible in this excerpt. */
217 YAZ_CHECK(tst_convert_l(
220 "el" "\xe8" "am\xe8" "an", /* elaman where a is a" */
229 "\x00\x00\x00" "n"));
/* Expected: 'A', U+0304, U+0308 (input line not visible here). */
231 YAZ_CHECK(tst_convert_l(
236 "\x00\x00\x00\x41" "\x00\x00\x03\x04" "\x00\x00\x03\x08"));
/* Expected: 't', U+0361, 's' (input line not visible here). */
238 YAZ_CHECK(tst_convert_l(
243 "\x00\x00\x00\x74" "\x00\x00\x03\x61" "\x00\x00\x00\x73"));
/* Expected: 't', U+0360, 's' (input line not visible here). */
245 YAZ_CHECK(tst_convert_l(
250 "\x00\x00\x00\x74" "\x00\x00\x03\x60" "\x00\x00\x00\x73"));
/*
 * UCS4 -> UTF8 conversion cases.  The inputs are four bytes per character
 * and contain NUL bytes, which is why tst_convert_l() (explicit lengths)
 * is used.
 *
 * NOTE(review): the converter handle and the length arguments of the
 * calls are not visible in this excerpt.
 */
255 static void tst_ucs4b_to_utf8(void)
257 yaz_iconv_t cd = yaz_iconv_open("UTF8", "UCS4");
/* U+FF1F and 'o' -> EF BC 9F, 6F. */
261 YAZ_CHECK(tst_convert_l(
264 "\x00\x00\xFF\x1F\x00\x00\x00o",
266 "\xEF\xBC\x9F\x6F"));
/* U+AE0E and U+C0F4 -> EA B8 8E, EC 83 B4. */
268 YAZ_CHECK(tst_convert_l(
271 "\x00\x00\xAE\x0E\x00\x00\xC0\xF4",
273 "\xEA\xB8\x8E\xEC\x83\xB4"));
/*
 * Round-trip test: every string in iso_8859_1_a is converted from
 * ISO-8859-1 to the encoding named by tmpcode and back again, and the
 * result is compared with the original string.
 *
 * mandatory: when non-zero, failing to open a converter for tmpcode is a
 * test failure; when zero, a missing converter is tolerated.
 */
277 static void dconvert(int mandatory, const char *tmpcode)
/* The array is processed up to its first null entry. */
282 for (i = 0; iso_8859_1_a[i]; i++)
285 char *inbuf = (char*) iso_8859_1_a[i];
286 size_t inbytesleft = strlen(inbuf);
289 char *outbuf = outbuf0;
290 size_t outbytesleft = sizeof(outbuf0);
/* First leg: ISO-8859-1 -> tmpcode, written into outbuf0. */
292 cd = yaz_iconv_open(tmpcode, "ISO-8859-1");
293 YAZ_CHECK(cd || !mandatory);
296 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
297 YAZ_CHECK(r != (size_t) (-1));
/* Call without input so pending output is emitted. */
299 r = yaz_iconv(cd, 0, 0, &outbuf, &outbytesleft);
300 YAZ_CHECK(r != (size_t) (-1));
302 if (r == (size_t) (-1))
/* Second leg: tmpcode -> ISO-8859-1, written into outbuf1. */
305 cd = yaz_iconv_open("ISO-8859-1", tmpcode);
306 YAZ_CHECK(cd || !mandatory);
/* Input length for the way back = bytes produced by the first leg. */
310 inbytesleft = sizeof(outbuf0) - outbytesleft;
313 outbytesleft = sizeof(outbuf1);
314 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
315 YAZ_CHECK(r != (size_t) (-1));
317 r = yaz_iconv(cd, 0, 0, &outbuf, &outbytesleft);
318 if (r == (size_t)(-1))
320 fprintf(stderr, "failed\n");
322 YAZ_CHECK(r != (size_t) (-1));
/* Only compare when the way back succeeded; the round-trip result must
 * equal the original ISO-8859-1 string byte for byte. */
324 if (r != (size_t)(-1))
326 ret = compare_buffers("dconvert", i,
327 strlen(iso_8859_1_a[i]), iso_8859_1_a[i],
328 sizeof(outbuf1) - outbytesleft, outbuf1);
/*
 * Round-trip a single character code c: a four-byte UCS4LE source buffer
 * is converted to UTF-8 and the UTF-8 bytes are converted back to UCS4LE;
 * the four resulting bytes are then compared with the source.
 *
 * NOTE(review): the declarations, the body of the four-iteration loop and
 * the return statements are not visible in this excerpt.  Presumably the
 * loop stores c into src one byte at a time, and the function returns
 * non-zero on a successful round trip (callers wrap it in YAZ_CHECK) --
 * confirm.
 */
335 int utf8_check(unsigned c)
/* One UCS4 character is exactly four input bytes. */
344 size_t inbytesleft = 4;
345 char *outbuf = utf8buf;
346 size_t outbytesleft = sizeof(utf8buf);
348 yaz_iconv_t cd = yaz_iconv_open("UTF-8", "UCS4LE");
351 for (i = 0; i<4; i++)
354 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
357 if (r == (size_t)(-1))
/* Way back: decode the UTF-8 bytes just produced. */
360 cd = yaz_iconv_open("UCS4LE", "UTF-8");
/* Number of UTF-8 bytes written by the first conversion. */
363 inbytesleft = sizeof(utf8buf) - outbytesleft;
369 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
370 if (r == (size_t)(-1))
/* memcmp() is non-zero when the round trip altered the four bytes. */
375 if (memcmp(src, dst, 4))
/*
 * MARC8 -> UTF-8 conversion cases.
 *
 * NOTE(review): the "â" in the input literals is presumably the single
 * byte 0xE2 of the original file shown as Latin-1 -- confirm the file
 * encoding before editing these strings.
 */
381 static void tst_marc8_to_utf8(void)
383 yaz_iconv_t cd = yaz_iconv_open("UTF-8", "MARC8");
/* Plain ASCII (expected value not visible in this excerpt). */
389 YAZ_CHECK(tst_convert(cd, "Cours de math",
391 /* COMBINING ACUTE ACCENT */
/* The accent comes before 'e' in the input; in the expected UTF-8 the
 * combining mark CC 81 comes after 'e'. */
392 YAZ_CHECK(tst_convert(cd, "Cours de mathâe",
393 "Cours de mathe\xcc\x81"));
/* 0xEA followed by 0x1E: the expected output has 0x1E first and the
 * combining mark CC 8A after it. */
396 YAZ_CHECK(tst_convert(cd, "a\xea\x1e", "a\x1e\xcc\x8a"));
/* A trailing 0xEA with nothing after it yields no output for that byte. */
398 YAZ_CHECK(tst_convert(cd, "a\xea", "a"));
/*
 * MARC8s -> UTF-8 conversion cases.  Compared with the "MARC8" source
 * encoding, the accented case here expects the single precomposed
 * character C3 A9 instead of 'e' followed by a combining mark.
 */
402 static void tst_marc8s_to_utf8(void)
404 yaz_iconv_t cd = yaz_iconv_open("UTF-8", "MARC8s");
/* Plain ASCII (expected value not visible in this excerpt). */
410 YAZ_CHECK(tst_convert(cd, "Cours de math",
412 /* E9: LATIN SMALL LETTER E WITH ACUTE */
413 YAZ_CHECK(tst_convert(cd, "Cours de mathâe",
414 "Cours de math\xc3\xa9"));
/*
 * MARC8 -> ISO-8859-1 conversion cases.
 *
 * NOTE(review): the expected values of most of the "Cours de math" /
 * digit-string cases are not visible in this excerpt.  The inputs differ
 * in how many bytes precede the accent, presumably so that the accented
 * character lands at different offsets relative to tst_convert()'s output
 * buffer -- confirm.
 */
420 static void tst_marc8_to_latin1(void)
422 yaz_iconv_t cd = yaz_iconv_open("ISO-8859-1", "MARC8");
/* Plain ASCII passes through unchanged. */
428 YAZ_CHECK(tst_convert(cd, "ax", "ax"));
430 /* latin capital letter o with stroke */
431 YAZ_CHECK(tst_convert(cd, "\xa2", "\xd8"));
433 /* with latin small letter ae */
434 YAZ_CHECK(tst_convert(cd, "eneb\xb5r", "eneb\346r"));
/* 0xEA followed by 'a' is expected as the single byte 0xE5. */
436 YAZ_CHECK(tst_convert(cd, "\xea" "a\xa2", "\xe5" "\xd8"));
438 YAZ_CHECK(tst_convert(cd, "\xea" "a\xa2" "b", "\xe5" "\xd8" "b"));
440 YAZ_CHECK(tst_convert(cd, "\xea" "a" "\xea" "a", "\xe5" "\xe5"));
442 YAZ_CHECK(tst_convert(cd, "Cours de math",
444 YAZ_CHECK(tst_convert(cd, "Cours de mathâe",
446 YAZ_CHECK(tst_convert(cd, "12345678âe",
448 YAZ_CHECK(tst_convert(cd, "123456789âe",
450 YAZ_CHECK(tst_convert(cd, "1234567890âe",
452 YAZ_CHECK(tst_convert(cd, "12345678901âe",
454 YAZ_CHECK(tst_convert(cd, "Cours de mathâem",
456 YAZ_CHECK(tst_convert(cd, "Cours de mathâematiques",
457 "Cours de mathématiques"));
/*
 * UTF-8 -> MARC8 conversion cases, followed by a hand-driven conversion
 * of U+2070 that checks the output both before and after the final flush.
 */
462 static void tst_utf8_to_marc8(void)
464 yaz_iconv_t cd = yaz_iconv_open("MARC8", "UTF-8");
470 YAZ_CHECK(tst_convert(cd, "Cours ", "Cours "));
/*
 * NOTE(review): the literals in the next three cases are 12, 13 and 14
 * characters long, one more than the counts given in their comments --
 * confirm against the size of outbuf in tst_convert().
 */
472 /** Pure ASCII. 11 characters (sizeof(outbuf)-1) */
473 YAZ_CHECK(tst_convert(cd, "Cours de mat", "Cours de mat"));
475 /** Pure ASCII. 12 characters (sizeof(outbuf)) */
476 YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math"));
478 /** Pure ASCII. 13 characters (sizeof(outbuf)+1) */
479 YAZ_CHECK(tst_convert(cd, "Cours de math.", "Cours de math."));
481 /** UPPERCASE SCANDINAVIAN O */
482 YAZ_CHECK(tst_convert(cd, "S\xc3\x98", "S\xa2"));
/* 'A' followed by the combining mark CC 8A (U+030A); in the expected
 * MARC-8 the modifier byte 0xEA comes before the base letter. */
485 YAZ_CHECK(tst_convert(cd, "A" "\xCC\x8A", "\xEA" "A"));
487 /** A MACRON + UMLAUT, DIAERESIS */
/* Two combining marks on one base letter (expected value not visible in
 * this excerpt). */
488 YAZ_CHECK(tst_convert(cd, "A" "\xCC\x84" "\xCC\x88",
491 /* Ligature spanning two characters */
/* CD A1 is U+0361; the expected output brackets the pair with EB .. EC. */
492 YAZ_CHECK(tst_convert(cd,
493 "\x74" "\xCD\xA1" "\x73", /* UTF-8 */
494 "\xEB\x74\xEC\x73")); /* MARC-8 */
496 /* Double tilde spanning two characters */
/* CD A0 is U+0360; the expected output brackets the pair with FA .. FB. */
497 YAZ_CHECK(tst_convert(cd,
498 "\x74" "\xCD\xA0" "\x73", /* UTF-8 */
499 "\xFA\x74\xFB\x73")); /* MARC-8 */
501 /** Ideographic question mark (Unicode FF1F) */
502 YAZ_CHECK(tst_convert(cd,
503 "\xEF\xBC\x9F" "o", /* UTF-8 */
504 "\033$1" "\x21\x2B\x3B" "\033(B" "o" ));
507 /** Superscript 0 . bug #642 */
/* E2 81 B0 is U+2070, wrapped in parentheses (expected value not visible
 * in this excerpt). */
508 YAZ_CHECK(tst_convert(cd,
509 "(\xe2\x81\xb0)", /* UTF-8 */
/*
 * Same character again, this time calling yaz_iconv() directly so the
 * output can be logged before the flush and checked after it.
 */
514 char *inbuf0 = "\xe2\x81\xb0";
515 char *inbuf = inbuf0;
516 size_t inbytesleft = strlen(inbuf);
518 char *outbuf = outbuf0;
/* One byte is held back so a terminating NUL always fits. */
519 size_t outbytesleft = sizeof(outbuf0)-1;
524 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
525 YAZ_CHECK(r != (size_t) (-1));
528 *outbuf = '\0'; /* so we know when to stop printing */
/* Log each output byte produced so far; bytes below ' ' are shown as '?'. */
529 for (i = 0; outbuf0[i]; i++)
531 int ch = outbuf0[i] & 0xff;
532 yaz_log(YLOG_LOG, "ch%d %02X %c", i, ch, ch >= ' ' ? ch : '?');
/* Flush: call without input so pending output is emitted. */
536 r = yaz_iconv(cd, 0, 0, &outbuf, &outbytesleft);
537 YAZ_CHECK(r != (size_t) (-1));
538 *outbuf = '\0'; /* for strcmp test below and printing */
540 for (i = 0; outbuf0[i]; i++)
542 int ch = outbuf0[i] & 0xff;
543 yaz_log(YLOG_LOG, "ch%d %02X %c", i, ch, ch >= ' ' ? ch : '?');
/* Complete expected output after the flush: ESC 'p', '0', ESC 's'.
 * NOTE(review): presumably the two escapes switch into and out of the
 * MARC-8 superscript set -- confirm against the MARC-8 specification. */
546 YAZ_CHECK(strcmp("\033p0\x1bs", outbuf0) == 0);
/* advancegreek -> utf-8: the visible case checks that a plain ASCII
 * string is passed through unchanged. */
551 static void tst_advance_to_utf8(void)
553 yaz_iconv_t cd = yaz_iconv_open("utf-8", "advancegreek");
559 YAZ_CHECK(tst_convert(cd, "Cours ", "Cours "));
/* utf-8 -> advancegreek: the visible case checks that a plain ASCII
 * string is passed through unchanged. */
563 static void tst_utf8_to_advance(void)
565 yaz_iconv_t cd = yaz_iconv_open("advancegreek", "utf-8");
571 YAZ_CHECK(tst_convert(cd, "Cours ", "Cours "));
/*
 * ISO-8859-1 -> MARC8 conversion cases.
 *
 * NOTE(review): the "é" in the inputs and the "â" in the expected values
 * are presumably the single bytes 0xE9 and 0xE2 of the original file
 * shown as Latin-1 (the "\xe9" case below expects the same output as the
 * "é" cases) -- confirm the file encoding before editing these strings.
 */
575 static void tst_latin1_to_marc8(void)
577 yaz_iconv_t cd = yaz_iconv_open("MARC8", "ISO-8859-1");
583 YAZ_CHECK(tst_convert(cd, "Cours ", "Cours "));
/*
 * NOTE(review): the literals in the next three cases are 12, 13 and 14
 * characters long, one more than the counts given in their comments --
 * confirm against the size of outbuf in tst_convert().
 */
585 /** Pure ASCII. 11 characters (sizeof(outbuf)-1) */
586 YAZ_CHECK(tst_convert(cd, "Cours de mat", "Cours de mat"));
588 /** Pure ASCII. 12 characters (sizeof(outbuf)) */
589 YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math"));
591 /** Pure ASCII. 13 characters (sizeof(outbuf)+1) */
592 YAZ_CHECK(tst_convert(cd, "Cours de math.", "Cours de math."));
594 /** D8: UPPERCASE SCANDINAVIAN O */
595 YAZ_CHECK(tst_convert(cd, "S\xd8", "S\xa2"));
597 /** E9: LATIN SMALL LETTER E WITH ACUTE */
/* The precomposed letter is expected as an accent byte followed by 'e'. */
598 YAZ_CHECK(tst_convert(cd, "Cours de math\xe9", "Cours de mathâe"));
599 YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math"
/* The same accented letter after prefixes of different lengths. */
601 YAZ_CHECK(tst_convert(cd, "Cours de mathé", "Cours de mathâe" ));
602 YAZ_CHECK(tst_convert(cd, "12345678é","12345678âe"));
603 YAZ_CHECK(tst_convert(cd, "123456789é", "123456789âe"));
604 YAZ_CHECK(tst_convert(cd, "1234567890é","1234567890âe"));
605 YAZ_CHECK(tst_convert(cd, "12345678901é", "12345678901âe"));
606 YAZ_CHECK(tst_convert(cd, "Cours de mathém", "Cours de mathâem"));
607 YAZ_CHECK(tst_convert(cd, "Cours de mathématiques",
608 "Cours de mathâematiques"));
/*
 * Run utf8_check() on a spread of character codes.  127/128 and 255/256
 * sit either side of the 7-bit and 8-bit limits; the remaining values
 * grow by roughly a factor of ten up to 100000000.
 */
612 static void tst_utf8_codes(void)
614 YAZ_CHECK(utf8_check(3));
615 YAZ_CHECK(utf8_check(127));
616 YAZ_CHECK(utf8_check(128));
617 YAZ_CHECK(utf8_check(255));
618 YAZ_CHECK(utf8_check(256));
619 YAZ_CHECK(utf8_check(900));
620 YAZ_CHECK(utf8_check(1000));
621 YAZ_CHECK(utf8_check(10000));
622 YAZ_CHECK(utf8_check(100000));
623 YAZ_CHECK(utf8_check(1000000));
624 YAZ_CHECK(utf8_check(10000000));
625 YAZ_CHECK(utf8_check(100000000));
/*
 * Test driver: initialises the YAZ test framework and runs the conversion
 * test groups.
 *
 * NOTE(review): several calls and the final return are not visible in
 * this excerpt.
 */
628 int main (int argc, char **argv)
630 YAZ_CHECK_INIT(argc, argv);
636 tst_marc8s_to_utf8();
638 tst_marc8_to_latin1();
640 tst_advance_to_utf8();
641 tst_utf8_to_advance();
645 tst_latin1_to_marc8();
647 tst_marc8_to_ucs4b();
/* Round trips through ISO-8859-1; the first argument says whether a
 * converter for that encoding must be available (1) or may be missing (0). */
650 dconvert(1, "UTF-8");
651 dconvert(1, "ISO-8859-1");
653 dconvert(1, "UCS4LE");
654 dconvert(0, "CP865");
661 * indent-tabs-mode: nil
663 * vim: shiftwidth=4 tabstop=8 expandtab