2 * Copyright (C) 1995-2007, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: tsticonv.c,v 1.35 2008-03-12 08:53:28 adam Exp $
17 #include <yaz/yaz-util.h>
/*
 * Compare the bytes a conversion produced with the bytes that were expected.
 *
 *   msg, no                - test label and index, used only by the debug dump
 *   expect_len, expect_buf - length and content of the expected result
 *   got_len, got_buf       - length and content of the actual result
 *
 * The buffers match when both lengths are equal and memcmp() finds no
 * difference. The return statements are not visible in this excerpt; the
 * result reaches YAZ_CHECK through tst_convert_l, so a non-zero value
 * presumably signals a match - TODO confirm.
 */
22 static int compare_buffers(char *msg, int no,
23 int expect_len, const char *expect_buf,
24 int got_len, const char *got_buf)
/* Fast path: identical length and identical content. */
26 if (expect_len == got_len
27 && !memcmp(expect_buf, got_buf, expect_len))
/* Side-by-side hex dump of both buffers; disabled by the constant 0. */
30 if (0) /* use 1 see how the buffers differ (for debug purposes) */
33 printf("tsticonv test=%s i=%d failed\n", msg, no);
34 printf("off got exp\n");
/* Walk as far as the longer of the two buffers. */
35 for (i = 0; i<got_len || i<expect_len; i++)
/*
 * Each column shows the byte in hex, or a question mark placeholder.
 * NOTE(review): the byte is a plain char handed to %02X; on platforms where
 * char is signed, values above 0x7F would sign-extend and print more than
 * two digits - confirm the scratch buffers are large enough for that.
 */
41 sprintf(got_char, "%02X", got_buf[i]);
43 sprintf(got_char, "? ");
46 sprintf(expect_char, "%02X", expect_buf[i]);
48 sprintf(expect_char, "? ");
/*
 * NOTE(review): both buffers are indexed at i here even when i lies past the
 * end of the shorter one - verify that this cannot read out of bounds.
 */
50 printf("%02d %s %s %c\n",
51 i, got_char, expect_char, got_buf[i] == expect_buf[i] ?
/*
 * Run in_buf through the converter cd and compare the output with the
 * expected bytes.
 *
 *   cd                     - open conversion handle
 *   in_len, in_buf         - input bytes; an in_len of 0 means that the
 *                            length is taken with strlen(in_buf)
 *   expect_len, expect_buf - expected output bytes
 *
 * Returns whatever compare_buffers() reports for the collected output.
 * Explicit lengths allow inputs and outputs that contain zero bytes.
 */
59 static int tst_convert_l(yaz_iconv_t cd, size_t in_len, const char *in_buf,
60 size_t expect_len, const char *expect_buf)
/* yaz_iconv wants a writable pointer, hence the cast away from const. */
63 char *inbuf= (char*) in_buf;
64 size_t inbytesleft = in_len > 0 ? in_len : strlen(in_buf);
/* Output accumulates in outbuf0 (declared on a line outside this excerpt). */
66 char *outbuf = outbuf0;
/* Space that is still free behind the bytes written so far. */
70 size_t outbytesleft = outbuf0 + sizeof(outbuf0) - outbuf;
/*
 * The statement guarded by this test is not visible; presumably it limits
 * the window to 12 bytes so that the conversion needs several calls -
 * TODO confirm.
 */
71 if (outbytesleft > 12)
73 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
/* (size_t) -1 signals an error; E2BIG is treated differently from others. */
74 if (r == (size_t) (-1))
76 int e = yaz_iconv_error(cd);
77 if (e != YAZ_ICONV_E2BIG)
/*
 * Call without input. NOTE(review): presumably this flushes output that the
 * converter still holds back - confirm against the yaz_iconv documentation.
 */
82 yaz_iconv(cd, 0, 0, &outbuf, &outbytesleft);
/* outbuf - outbuf0 is the number of bytes produced in total. */
87 return compare_buffers("tsticonv 22", 0,
88 expect_len, expect_buf,
89 outbuf - outbuf0, outbuf0);
/*
 * Convert the NUL-terminated string buf with cd and compare the result with
 * the NUL-terminated string cmpbuf.
 *
 * The conversion writes into a small scratch buffer (outbuf, declared on a
 * line outside this excerpt) and every chunk is appended to a WRBUF, so an
 * input may need several rounds. The return statements are not visible
 * here; callers pass the result to YAZ_CHECK, so non-zero presumably means
 * that the output matched - TODO confirm.
 */
92 static int tst_convert(yaz_iconv_t cd, const char *buf, const char *cmpbuf)
/* Collects the converted output across all rounds. */
95 WRBUF b = wrbuf_alloc();
97 size_t inbytesleft = strlen(buf);
98 const char *inp = buf;
/* Stop when the input is consumed; the round limit bounds the loop. */
100 for (rounds = 0; inbytesleft && rounds < sizeof(outbuf); rounds++)
/* Every round starts with the whole scratch buffer available. */
102 size_t outbytesleft = sizeof(outbuf);
104 size_t r = yaz_iconv(cd, (char**) &inp, &inbytesleft,
105 &outp, &outbytesleft);
/* Keep whatever this round produced, also when the call reported an error. */
106 wrbuf_write(b, outbuf, outp - outbuf);
107 if (r == (size_t) (-1))
109 int e = yaz_iconv_error(cd);
/* E2BIG is singled out; the handling itself is not visible here. */
110 if (e != YAZ_ICONV_E2BIG)
115 size_t outbytesleft = sizeof(outbuf);
/*
 * Call without input. NOTE(review): presumably flushes output that the
 * converter still holds back - confirm against the yaz_iconv documentation.
 */
117 r = yaz_iconv(cd, 0, 0, &outp, &outbytesleft);
118 wrbuf_write(b, outbuf, outp - outbuf);
/* Match requires equal length and equal bytes. */
122 if (wrbuf_len(b) == strlen(cmpbuf)
123 && !memcmp(cmpbuf, wrbuf_buf(b), wrbuf_len(b)))
/*
 * Log input, actual output and expected output in escaped form. The branch
 * structure around this part is not visible; the log labels suggest that it
 * is the mismatch path - TODO confirm.
 */
127 WRBUF w = wrbuf_alloc();
130 wrbuf_puts_escaped(w, buf);
131 yaz_log(YLOG_LOG, "input %s", wrbuf_cstr(w));
134 wrbuf_write_escaped(w, wrbuf_buf(b), wrbuf_len(b));
135 yaz_log(YLOG_LOG, "got %s", wrbuf_cstr(w));
138 wrbuf_puts_escaped(w, cmpbuf);
139 yaz_log(YLOG_LOG, "exp %s", wrbuf_cstr(w));
149 /* some test strings in ISO-8859-1 format */
/*
 * The entries are outside this excerpt. dconvert() walks the array until it
 * meets an entry that evaluates false, so the list is expected to end with
 * a null pointer.
 */
150 static const char *iso_8859_1_a[] = {
/*
 * MARC-8 to UCS-4 conversion checks.
 *
 * Every case feeds MARC-8 bytes to tst_convert_l() and lists the expected
 * output as four bytes per character with the most significant byte first.
 * Several argument lines of the calls (lengths among them) are outside this
 * excerpt.
 */
159 static void tst_marc8_to_ucs4b(void)
161 yaz_iconv_t cd = yaz_iconv_open("UCS4", "MARC8");
/*
 * A three-byte character wrapped in escape sequences, then a plain o.
 * NOTE(review): ESC $ 1 and ESC ( B presumably switch to the multi-byte set
 * and back to ASCII - confirm against the MARC-8 specification.
 */
166 YAZ_CHECK(tst_convert_l(
169 "\033$1" "\x21\x2B\x3B" /* FF1F */ "\033(B" "o",
171 "\x00\x00\xFF\x1F" "\x00\x00\x00o"));
/* Two three-byte characters in one escaped run. */
172 YAZ_CHECK(tst_convert_l(
175 "\033$1" "\x6F\x77\x29" /* AE0E */
176 "\x6F\x52\x7C" /* c0F4 */ "\033(B",
178 "\x00\x00\xAE\x0E" "\x00\x00\xC0\xF4"));
/* A longer run of three-byte characters; only its last expected code unit
 * is visible in this excerpt. */
179 YAZ_CHECK(tst_convert_l(
183 "\x21\x50\x6E" /* UCS 7CFB */
184 "\x21\x51\x31" /* UCS 7D71 */
185 "\x21\x3A\x67" /* UCS 5B89 */
186 "\x21\x33\x22" /* UCS 5168 */
187 "\x21\x33\x53" /* UCS 5206 */
188 "\x21\x44\x2B" /* UCS 6790 */
196 "\x00\x00\x67\x90"));
/* Single-byte characters from the upper half: expected U+02BB and U+00F8. */
198 YAZ_CHECK(tst_convert_l(
201 "\xB0\xB2", /* AYN and oSLASH */
203 "\x00\x00\x02\xBB" "\x00\x00\x00\xF8"));
/* The mark byte precedes the letter in the input but follows it in the
 * expected output (U+0061 then U+0332). */
204 YAZ_CHECK(tst_convert_l(
207 "\xF6\x61", /* a underscore */
209 "\x00\x00\x00\x61" "\x00\x00\x03\x32"));
/* Letter followed by a symbol: expected U+0061 then U+2117. */
211 YAZ_CHECK(tst_convert_l(
214 "\x61\xC2", /* a, phonorecord mark */
216 "\x00\x00\x00\x61" "\x00\x00\x21\x17"));
/* Word with two marked letters; only the final expected unit is visible. */
219 YAZ_CHECK(tst_convert_l(
222 "el" "\xe8" "am\xe8" "an", /* elaman where a is a" */
231 "\x00\x00\x00" "n"));
/* Inputs of the next three cases are outside this excerpt. Expected here:
 * U+0041 followed by the two marks U+0304 and U+0308. */
233 YAZ_CHECK(tst_convert_l(
238 "\x00\x00\x00\x41" "\x00\x00\x03\x04" "\x00\x00\x03\x08"));
/* Expected: t, U+0361, s - a mark between two letters. */
240 YAZ_CHECK(tst_convert_l(
245 "\x00\x00\x00\x74" "\x00\x00\x03\x61" "\x00\x00\x00\x73"));
/* Expected: t, U+0360, s. */
247 YAZ_CHECK(tst_convert_l(
252 "\x00\x00\x00\x74" "\x00\x00\x03\x60" "\x00\x00\x00\x73"));
/*
 * UCS-4 to UTF-8 conversion checks. Inputs are given as four bytes per
 * character, most significant byte first; the length arguments of the calls
 * are outside this excerpt.
 */
257 static void tst_ucs4b_to_utf8(void)
259 yaz_iconv_t cd = yaz_iconv_open("UTF8", "UCS4");
/* U+FF1F becomes a three-byte sequence, the letter o stays one byte. */
263 YAZ_CHECK(tst_convert_l(
266 "\x00\x00\xFF\x1F\x00\x00\x00o",
268 "\xEF\xBC\x9F\x6F"));
/* U+AE0E and U+C0F4, each expected as a three-byte sequence. */
270 YAZ_CHECK(tst_convert_l(
273 "\x00\x00\xAE\x0E\x00\x00\xC0\xF4",
275 "\xEA\xB8\x8E\xEC\x83\xB4"));
/*
 * Round-trip test: convert every string of iso_8859_1_a from ISO-8859-1 to
 * the encoding tmpcode and back, then compare the result with the original.
 *
 *   mandatory - when non-zero, failing to open a converter for tmpcode is
 *               reported as a test failure; when zero it is tolerated
 *   tmpcode   - name of the intermediate encoding
 *
 * Buffer declarations and the cleanup calls are outside this excerpt.
 */
279 static void dconvert(int mandatory, const char *tmpcode)
/* The string list ends at the first entry that evaluates false. */
284 for (i = 0; iso_8859_1_a[i]; i++)
287 char *inbuf = (char*) iso_8859_1_a[i];
288 size_t inbytesleft = strlen(inbuf);
291 char *outbuf = outbuf0;
292 size_t outbytesleft = sizeof(outbuf0);
/* First leg: ISO-8859-1 to tmpcode, written into outbuf0. */
294 cd = yaz_iconv_open(tmpcode, "ISO-8859-1");
295 YAZ_CHECK(cd || !mandatory);
298 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
299 YAZ_CHECK(r != (size_t) (-1));
/* Call without input; presumably flushes pending output - TODO confirm. */
301 r = yaz_iconv(cd, 0, 0, &outbuf, &outbytesleft);
302 YAZ_CHECK(r != (size_t) (-1));
304 if (r == (size_t) (-1))
/* Second leg: tmpcode back to ISO-8859-1, written into outbuf1. */
307 cd = yaz_iconv_open("ISO-8859-1", tmpcode);
308 YAZ_CHECK(cd || !mandatory);
/* Input of the second leg is exactly what the first leg produced. */
312 inbytesleft = sizeof(outbuf0) - outbytesleft;
315 outbytesleft = sizeof(outbuf1);
316 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
317 YAZ_CHECK(r != (size_t) (-1));
319 r = yaz_iconv(cd, 0, 0, &outbuf, &outbytesleft);
320 if (r == (size_t)(-1))
322 fprintf(stderr, "failed\n");
324 YAZ_CHECK(r != (size_t) (-1));
/* Only compare when the second leg completed without error. */
326 if (r != (size_t)(-1))
328 ret = compare_buffers("dconvert", i,
329 strlen(iso_8859_1_a[i]), iso_8859_1_a[i],
330 sizeof(outbuf1) - outbytesleft, outbuf1);
/*
 * Round-trip a single code point c: UCS4LE to UTF-8 and back to UCS4LE, then
 * compare the four resulting bytes with the four source bytes.
 *
 * Buffer declarations and the return statements are outside this excerpt;
 * callers pass the result to YAZ_CHECK, so non-zero presumably means that
 * the round trip preserved the value - TODO confirm.
 */
337 int utf8_check(unsigned c)
/* One UCS-4 character is four bytes of input. */
346 size_t inbytesleft = 4;
347 char *outbuf = utf8buf;
348 size_t outbytesleft = sizeof(utf8buf);
350 yaz_iconv_t cd = yaz_iconv_open("UTF-8", "UCS4LE");
/*
 * Loop body not visible; presumably stores the four bytes of c, least
 * significant first, into the source buffer - TODO confirm.
 */
353 for (i = 0; i<4; i++)
356 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
359 if (r == (size_t)(-1))
/* Reverse direction for the way back. */
362 cd = yaz_iconv_open("UCS4LE", "UTF-8");
/* Feed back exactly the UTF-8 bytes produced above. */
365 inbytesleft = sizeof(utf8buf) - outbytesleft;
371 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
372 if (r == (size_t)(-1))
/* memcmp is non-zero when the round trip changed the value. */
377 if (memcmp(src, dst, 4))
/*
 * MARC-8 to UTF-8 conversion checks, run through tst_convert(). Some
 * expected strings are on lines outside this excerpt.
 */
383 static void tst_marc8_to_utf8(void)
385 yaz_iconv_t cd = yaz_iconv_open("UTF-8", "MARC8");
/* Escape sequences around a single letter; expected output is the two-byte
 * UTF-8 sequence D0 9F. ESC is a macro defined outside this excerpt. */
392 YAZ_CHECK(tst_convert(cd, ESC "(N" ESC ")Qp" ESC "(B", "\xd0\x9f"));
395 YAZ_CHECK(tst_convert(cd, "Cours de math",
397 /* COMBINING ACUTE ACCENT */
/* The mark precedes the e in the input and follows it in the output. */
398 YAZ_CHECK(tst_convert(cd, "Cours de mathâe",
399 "Cours de mathe\xcc\x81"));
/* Mark byte EA followed by the control byte 1E: the expected output places
 * the combining sequence CC 8A after the 1E. */
401 YAZ_CHECK(tst_convert(cd, "a\xea\x1e", "a\x1e\xcc\x8a"));
/* A mark byte at the very end with nothing after it yields no output. */
403 YAZ_CHECK(tst_convert(cd, "a\xea", "a"));
/*
 * Checks for the MARC8s source encoding converted to UTF-8. In contrast to
 * the MARC8 test, the accented letter is expected as the single character
 * U+00E9 (bytes C3 A9) instead of a letter plus a combining mark.
 */
408 static void tst_marc8s_to_utf8(void)
410 yaz_iconv_t cd = yaz_iconv_open("UTF-8", "MARC8s");
416 YAZ_CHECK(tst_convert(cd, "Cours de math",
418 /* E9: LATIN SMALL LETTER E WITH ACUTE */
419 YAZ_CHECK(tst_convert(cd, "Cours de mathâe",
420 "Cours de math\xc3\xa9"));
/*
 * MARC-8 to ISO-8859-1 conversion checks, run through tst_convert(). Many
 * of the expected strings are on lines outside this excerpt.
 */
426 static void tst_marc8_to_latin1(void)
428 yaz_iconv_t cd = yaz_iconv_open("ISO-8859-1", "MARC8");
/* Plain ASCII passes through unchanged. */
434 YAZ_CHECK(tst_convert(cd, "ax", "ax"));
436 /* latin capital letter o with stroke */
437 YAZ_CHECK(tst_convert(cd, "\xa2", "\xd8"));
439 /* with latin small letter ae */
440 YAZ_CHECK(tst_convert(cd, "eneb\xb5r", "eneb\346r"));
/* Mark byte EA plus the letter a is expected as the single byte E5; the
 * next three cases vary what follows the composed letter. */
442 YAZ_CHECK(tst_convert(cd, "\xea" "a\xa2", "\xe5" "\xd8"));
444 YAZ_CHECK(tst_convert(cd, "\xea" "a\xa2" "b", "\xe5" "\xd8" "b"));
446 YAZ_CHECK(tst_convert(cd, "\xea" "a" "\xea" "a", "\xe5" "\xe5"));
448 YAZ_CHECK(tst_convert(cd, "Cours de math",
450 YAZ_CHECK(tst_convert(cd, "Cours de mathâe",
/*
 * Same accented pair after 8, 9, 10 and 11 digits. NOTE(review): presumably
 * this moves the pair across the boundary of the scratch buffer used by
 * tst_convert - confirm.
 */
452 YAZ_CHECK(tst_convert(cd, "12345678âe",
454 YAZ_CHECK(tst_convert(cd, "123456789âe",
456 YAZ_CHECK(tst_convert(cd, "1234567890âe",
458 YAZ_CHECK(tst_convert(cd, "12345678901âe",
460 YAZ_CHECK(tst_convert(cd, "Cours de mathâem",
462 YAZ_CHECK(tst_convert(cd, "Cours de mathâematiques",
463 "Cours de mathématiques"));
/*
 * UTF-8 to MARC-8 conversion checks.
 *
 * The first part runs string pairs through tst_convert(). The last part
 * drives yaz_iconv directly for one character so that the output can be
 * logged both before and after the final call without input.
 */
468 static void tst_utf8_to_marc8(void)
470 yaz_iconv_t cd = yaz_iconv_open("MARC8", "UTF-8");
476 YAZ_CHECK(tst_convert(cd, "Cours ", "Cours "));
478 /** Pure ASCII. 12 characters. NOTE(review): the relation to sizeof(outbuf) in tst_convert is not visible here - confirm */
479 YAZ_CHECK(tst_convert(cd, "Cours de mat", "Cours de mat"));
481 /** Pure ASCII. 13 characters. NOTE(review): relation to sizeof(outbuf) not visible here - confirm */
482 YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math"));
484 /** Pure ASCII. 14 characters. NOTE(review): relation to sizeof(outbuf) not visible here - confirm */
485 YAZ_CHECK(tst_convert(cd, "Cours de math.", "Cours de math."));
487 /** UPPERCASE SCANDINAVIAN O */
488 YAZ_CHECK(tst_convert(cd, "S\xc3\x98", "S\xa2"));
/* Letter plus combining mark: the expected MARC-8 output puts the mark
 * byte in front of the letter. */
491 YAZ_CHECK(tst_convert(cd, "A" "\xCC\x8A", "\xEA" "A"));
493 /** A MACRON + UMLAUT, DIAERESIS */
494 YAZ_CHECK(tst_convert(cd, "A" "\xCC\x84" "\xCC\x88",
497 /* Ligature spanning two characters */
498 YAZ_CHECK(tst_convert(cd,
499 "\x74" "\xCD\xA1" "\x73", /* UTF-8 */
500 "\xEB\x74\xEC\x73")); /* MARC-8 */
502 /* Double tilde (U+0360) spanning two characters */
503 YAZ_CHECK(tst_convert(cd,
504 "\x74" "\xCD\xA0" "\x73", /* UTF-8 */
505 "\xFA\x74\xFB\x73")); /* MARC-8 */
507 /** Ideographic question mark (Unicode FF1F) */
508 YAZ_CHECK(tst_convert(cd,
509 "\xEF\xBC\x9F" "o", /* UTF-8 */
510 "\033$1" "\x21\x2B\x3B" "\033(B" "o" ));
513 /** Ideographic space per ANSI Z39.64 */
514 YAZ_CHECK(tst_convert(cd,
515 "\xe3\x80\x80" "o", /* UTF-8 */
516 "\033$1" "\x21\x23\x21" "\033(B" "o" ));
518 /** Superscript 0 . bug #642 */
519 YAZ_CHECK(tst_convert(cd,
520 "(\xe2\x81\xb0)", /* UTF-8 */
/* Three-byte characters mixed with two combining marks; the expected output
 * switches character sets by escape sequence around the mark bytes. */
525 YAZ_CHECK(tst_convert(cd,
526 /* offset 0x530 in UTF-8 rec marccol4.u8.marc */
527 "\xE3\x83\xB3" "\xE3\x82\xBF"
528 "\xCC\x84" "\xCC\x84" "\xE3\x83\xBC" /* UTF-8 */,
529 "\x1B\x24\x31" "\x69\x25\x73"
530 "\x1B\x28\x42" "\xE5\xE5" "\x1B\x24\x31"
532 "\x69\x21\x3C" "\x1B\x28\x42"));
/* Two-byte UTF-8 letters followed by a comma; the expected output opens
 * with the escape sequence 1B 28 53. Its remainder is not visible here. */
536 YAZ_CHECK(tst_convert(cd,
537 "\xCE\x94\xCE\xB5\xCF\x84"
538 "\xCE\xBF\xCF\x81\xCE\xB1"
539 "\xCE\xBA\xCE\xB7\xCF\x82\x2C",
541 "\x1B\x28\x53\x45\x66\x78\x72\x75"
/* Manual run for the superscript zero character alone. */
547 char *inbuf0 = "\xe2\x81\xb0";
548 char *inbuf = inbuf0;
549 size_t inbytesleft = strlen(inbuf);
551 char *outbuf = outbuf0;
/* One byte is held back for the terminator stored through outbuf below. */
552 size_t outbytesleft = sizeof(outbuf0)-1;
557 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
558 YAZ_CHECK(r != (size_t) (-1));
561 *outbuf = '\0'; /* so we know when to stop printing */
/* Log every byte produced so far; bytes below the space character are
 * shown as a question mark. */
562 for (i = 0; outbuf0[i]; i++)
564 int ch = outbuf0[i] & 0xff;
565 yaz_log(YLOG_LOG, "ch%d %02X %c", i, ch, ch >= ' ' ? ch : '?');
/* Call without input; presumably flushes pending output - TODO confirm. */
569 r = yaz_iconv(cd, 0, 0, &outbuf, &outbytesleft);
570 YAZ_CHECK(r != (size_t) (-1));
571 *outbuf = '\0'; /* for strcmp test below and printing */
573 for (i = 0; outbuf0[i]; i++)
575 int ch = outbuf0[i] & 0xff;
576 yaz_log(YLOG_LOG, "ch%d %02X %c", i, ch, ch >= ' ' ? ch : '?');
/* Complete expected output: ESC p, the digit 0, ESC s. */
579 YAZ_CHECK(strcmp("\033p0\x1bs", outbuf0) == 0);
/*
 * Check for the advancegreek source encoding converted to utf-8. The only
 * case visible in this excerpt is an ASCII string that must pass through
 * unchanged.
 */
584 static void tst_advance_to_utf8(void)
586 yaz_iconv_t cd = yaz_iconv_open("utf-8", "advancegreek");
592 YAZ_CHECK(tst_convert(cd, "Cours ", "Cours "));
/*
 * Check for utf-8 converted to the advancegreek encoding. The only case
 * visible in this excerpt is an ASCII string that must pass through
 * unchanged.
 */
596 static void tst_utf8_to_advance(void)
598 yaz_iconv_t cd = yaz_iconv_open("advancegreek", "utf-8");
604 YAZ_CHECK(tst_convert(cd, "Cours ", "Cours "));
/*
 * ISO-8859-1 to MARC-8 conversion checks, run through tst_convert().
 *
 * NOTE(review): several literals below contain raw non-ASCII characters
 * instead of hex escapes, so the bytes that reach the converter depend on
 * the encoding this source file is stored in (presumably ISO-8859-1) -
 * confirm before re-encoding the file.
 */
608 static void tst_latin1_to_marc8(void)
610 yaz_iconv_t cd = yaz_iconv_open("MARC8", "ISO-8859-1");
616 YAZ_CHECK(tst_convert(cd, "Cours ", "Cours "));
618 /** Pure ASCII. 12 characters. NOTE(review): the relation to sizeof(outbuf) in tst_convert is not visible here - confirm */
619 YAZ_CHECK(tst_convert(cd, "Cours de mat", "Cours de mat"));
621 /** Pure ASCII. 13 characters. NOTE(review): relation to sizeof(outbuf) not visible here - confirm */
622 YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math"));
624 /** Pure ASCII. 14 characters. NOTE(review): relation to sizeof(outbuf) not visible here - confirm */
625 YAZ_CHECK(tst_convert(cd, "Cours de math.", "Cours de math."));
627 /** D8: UPPERCASE SCANDINAVIAN O */
628 YAZ_CHECK(tst_convert(cd, "S\xd8", "S\xa2"));
630 /** E9: LATIN SMALL LETTER E WITH ACUTE */
631 YAZ_CHECK(tst_convert(cd, "Cours de math\xe9", "Cours de mathâe"));
632 YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math"
634 YAZ_CHECK(tst_convert(cd, "Cours de mathé", "Cours de mathâe" ));
/* Same accented letter after 8, 9, 10 and 11 digits, then inside and at
 * the start of a longer tail. */
635 YAZ_CHECK(tst_convert(cd, "12345678é","12345678âe"));
636 YAZ_CHECK(tst_convert(cd, "123456789é", "123456789âe"));
637 YAZ_CHECK(tst_convert(cd, "1234567890é","1234567890âe"));
638 YAZ_CHECK(tst_convert(cd, "12345678901é", "12345678901âe"));
639 YAZ_CHECK(tst_convert(cd, "Cours de mathém", "Cours de mathâem"));
640 YAZ_CHECK(tst_convert(cd, "Cours de mathématiques",
641 "Cours de mathâematiques"));
/*
 * Run utf8_check() on a spread of code point values, from small values
 * through 127/128 and 255/256 up to 100000000, and require each call to
 * succeed.
 */
645 static void tst_utf8_codes(void)
647 YAZ_CHECK(utf8_check(3));
/* Pairs on either side of 128 and of 256. */
648 YAZ_CHECK(utf8_check(127));
649 YAZ_CHECK(utf8_check(128));
650 YAZ_CHECK(utf8_check(255));
651 YAZ_CHECK(utf8_check(256));
652 YAZ_CHECK(utf8_check(900));
/* Powers of ten up to 100000000. */
653 YAZ_CHECK(utf8_check(1000));
654 YAZ_CHECK(utf8_check(10000));
655 YAZ_CHECK(utf8_check(100000));
656 YAZ_CHECK(utf8_check(1000000));
657 YAZ_CHECK(utf8_check(10000000));
658 YAZ_CHECK(utf8_check(100000000));
/*
 * Test driver: initialises the YAZ test framework with the command line and
 * runs the individual conversion tests. Some calls and the final return are
 * on lines outside this excerpt.
 */
661 int main (int argc, char **argv)
663 YAZ_CHECK_INIT(argc, argv);
669 tst_marc8s_to_utf8();
671 tst_marc8_to_latin1();
673 tst_advance_to_utf8();
674 tst_utf8_to_advance();
678 tst_latin1_to_marc8();
680 tst_marc8_to_ucs4b();
/* Round trips through an intermediate encoding. The first argument marks
 * the encoding as mandatory; CP865 is the only optional one. */
683 dconvert(1, "UTF-8");
684 dconvert(1, "ISO-8859-1");
686 dconvert(1, "UCS4LE");
687 dconvert(0, "CP865");
694 * indent-tabs-mode: nil
696 * vim: shiftwidth=4 tabstop=8 expandtab