2 * Copyright (C) 1995-2005, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: tsticonv.c,v 1.17 2006-04-19 23:15:40 adam Exp $
17 #include <yaz/yaz-util.h>
/*
 * compare_buffers: test helper that checks whether the converted output
 * (got_buf / got_len) equals the expected bytes (expect_buf / expect_len).
 * msg and no are only used to label the optional diagnostic dump.
 * NOTE(review): the braces, local declarations and return statements of
 * this function are not visible in this view.  Callers store the result in
 * `ret`, so presumably a non-zero result means "buffers match" - confirm.
 */
20 static int compare_buffers(char *msg, int no,
21 int expect_len, const char *expect_buf,
22 int got_len, const char *got_buf)
/* The buffers match only when both the lengths and all bytes agree. */
24 if (expect_len == got_len
25 && !memcmp(expect_buf, got_buf, expect_len))
/* Diagnostic dump; it is switched off by the constant condition below. */
28 if (0) /* use 1 to see how the buffers differ (for debug purposes) */
31 printf("tsticonv test=%s i=%d failed\n", msg, no);
32 printf("off got exp\n");
/* Walk as far as the longer of the two buffers. */
33 for (i = 0; i<got_len || i<expect_len; i++)
/*
 * Each side is printed either as a hex byte or as a "?" placeholder.  The
 * conditions that select between the two forms are not visible here;
 * presumably the placeholder is used once i is past that buffer's length.
 * NOTE(review): the bytes are passed as plain char.  Where char is signed,
 * a byte >= 0x80 is promoted to a negative int and "%02X" then prints it
 * as FFFFFFxx rather than as two digits.
 */
39 sprintf(got_char, "%02X", got_buf[i]);
41 sprintf(got_char, "? ");
44 sprintf(expect_char, "%02X", expect_buf[i]);
46 sprintf(expect_char, "? ");
/*
 * NOTE(review): got_buf[i] and expect_buf[i] are both read here for every
 * i of the loop, which includes indexes at or beyond the length of the
 * shorter buffer.
 */
48 printf("%02d %s %s %c\n",
49 i, got_char, expect_char, got_buf[i] == expect_buf[i] ?
57 /* some test strings in ISO-8859-1 format */
/*
 * The entries of this table are not visible in this view.  The loops that
 * walk it (`for (i = 0; iso_8859_1_a[i]; i++)`) stop at the first null
 * entry, so the table has to end with a null pointer.  marc8_a is indexed
 * with the same i, so both tables must list the same strings in the same
 * order.
 */
58 static const char *iso_8859_1_a[] = {
67 /* same test strings in MARC-8 format */
/*
 * Entry i of this table is the MARC-8 form of entry i of iso_8859_1_a.
 * Only some of the entries are visible in this view.
 */
68 static const char *marc8_a[] = {
70 "\xa2", /* latin capital letter o with stroke */
71 "eneb\xb5r", /* latin small letter ae */
/*
 * The literal is split into pieces because a hex escape consumes every hex
 * digit that follows it: written as one literal, "\xea" followed by "a"
 * would be read as the single escape \xeaa.
 */
74 "\xea" "a" "\xea" "a",
/*
 * tst_marc8_to_iso_8859_1: converts every string of marc8_a from MARC-8 to
 * ISO-8859-1 and checks the result against the entry with the same index
 * in iso_8859_1_a.
 * NOTE(review): braces, local declarations and the statements that follow
 * the error checks are not visible in this view.
 */
78 static void tst_marc8_to_iso_8859_1()
/* yaz_iconv_open takes the target encoding first, then the source. */
84 cd = yaz_iconv_open("ISO-8859-1", "MARC8");
/* The loop ends at the first null entry of iso_8859_1_a. */
88 for (i = 0; iso_8859_1_a[i]; i++)
/* The cast drops const because yaz_iconv is handed a char ** below. */
91 char *inbuf= (char*) marc8_a[i];
92 size_t inbytesleft = strlen(inbuf);
94 char *outbuf = outbuf0;
95 size_t outbytesleft = sizeof(outbuf0);
97 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
/* A return value of (size_t)(-1) marks a failed conversion. */
98 YAZ_CHECK(r != (size_t)(-1));
99 if (r == (size_t) (-1))
/* outbuf - outbuf0 is the number of bytes written into outbuf0. */
102 ret = compare_buffers("tsticonv 11", i,
103 strlen(iso_8859_1_a[i]), iso_8859_1_a[i],
104 outbuf - outbuf0, outbuf0);
/*
 * tst_marc8_to_ucs4b: converts MARC-8 test strings to UCS4 and compares
 * the output with the expected bytes.
 * The test table is accessed below as ar[i] with the members marc8_b
 * (MARC-8 input), len (expected output length in bytes) and ucs4_b
 * (expected output).  Its declaration and several of its entries are not
 * visible in this view.  The expected bytes listed here store each code
 * point in four bytes, most significant byte first.
 */
110 static void tst_marc8_to_ucs4b()
/* Input using the escape sequences "\033$1" and "\033(B" around 3-byte groups. */
118 "\033$1" "\x21\x2B\x3B" /* FF1F */ "\033(B" "o",
119 8, "\x00\x00\xFF\x1F" "\x00\x00\x00o"
121 "\033$1" "\x6F\x77\x29" /* AE0E */ "\x6F\x52\x7C" /* C0F4 */ "\033(B",
122 8, "\x00\x00\xAE\x0E" "\x00\x00\xC0\xF4",
/* Six 3-byte groups; the expected length below is 6 * 4 = 24 bytes. */
125 "\x21\x50\x6E" /* UCS 7CFB */
126 "\x21\x51\x31" /* UCS 7D71 */
127 "\x21\x3A\x67" /* UCS 5B89 */
128 "\x21\x33\x22" /* UCS 5168 */
129 "\x21\x33\x53" /* UCS 5206 */
130 "\x21\x44\x2B" /* UCS 6790 */
132 24, "\x00\x00\x7C\xFB"
139 "\xB0\xB2", /* AYN and oSLASH */
140 8, "\x00\x00\x02\xBB" "\x00\x00\x00\xF8"
/* In the input the mark byte comes first; in the expected output the base letter does. */
142 "\xF6\x61", /* a underscore */
143 8, "\x00\x00\x00\x61" "\x00\x00\x03\x32"
145 "\x61\xC2", /* a, phonorecord mark */
146 8, "\x00\x00\x00\x61" "\x00\x00\x21\x17"
/* Literal split so that the hex escapes do not swallow the following 'a'. */
149 "el" "\xe8" "am\xe8" "an", /* elaman where a is a" */
162 12, "\x00\x00\x00\x41" "\x00\x00\x03\x04" "\x00\x00\x03\x08"
166 12, "\x00\x00\x00\x74" "\x00\x00\x03\x61" "\x00\x00\x00\x73"
170 12, "\x00\x00\x00\x74" "\x00\x00\x03\x60" "\x00\x00\x00\x73"
/* yaz_iconv_open takes the target encoding first, then the source. */
180 cd = yaz_iconv_open("UCS4", "MARC8");
/* An entry whose len is 0 ends the table. */
184 for (i = 0; ar[i].len; i++)
187 size_t expect_len = ar[i].len;
188 char *inbuf= (char*) ar[i].marc8_b;
189 size_t inbytesleft = strlen(inbuf);
191 char *outbuf = outbuf0;
/*
 * Space still free in outbuf0 behind the current write position.
 * NOTE(review): the statement guarded by the test against 12 is not
 * visible; presumably the space offered per call is capped so that the
 * conversion has to proceed in several steps, which would explain why
 * YAZ_ICONV_E2BIG is the one accepted error below - confirm.
 */
195 size_t outbytesleft = outbuf0 + sizeof(outbuf0) - outbuf;
196 if (outbytesleft > 12)
198 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
199 if (r == (size_t) (-1))
/* When a call fails, the only error the test accepts is YAZ_ICONV_E2BIG. */
201 int e = yaz_iconv_error(cd);
202 YAZ_CHECK(e == YAZ_ICONV_E2BIG);
203 if (e != YAZ_ICONV_E2BIG)
/* outbuf - outbuf0 is the number of bytes written into outbuf0. */
209 ret = compare_buffers("tsticonv 22", i,
210 expect_len, ar[i].ucs4_b,
211 outbuf - outbuf0, outbuf0);
/*
 * tst_ucs4b_to_utf8: converts the UCS4 strings of ucs4_c to UTF-8 and
 * compares each result with the entry of the same index in utf8_c.
 * NOTE(review): braces, local declarations and some table entries are not
 * visible in this view.
 */
217 static void tst_ucs4b_to_utf8()
/* Each visible entry holds two 4-byte code points (8 bytes of input). */
219 static const char *ucs4_c[] = {
220 "\x00\x00\xFF\x1F\x00\x00\x00o",
221 "\x00\x00\xAE\x0E\x00\x00\xC0\xF4",
224 static const char *utf8_c[] = {
226 "\xEA\xB8\x8E\xEC\x83\xB4",
/* yaz_iconv_open takes the target encoding first, then the source. */
234 cd = yaz_iconv_open("UTF8", "UCS4");
/* The loop ends at the first null entry of ucs4_c. */
238 for (i = 0; ucs4_c[i]; i++)
241 char *inbuf= (char*) ucs4_c[i];
/*
 * The input length is given as a constant: the UCS4 data contains zero
 * bytes, so strlen cannot be used to measure it.
 */
242 size_t inbytesleft = 8;
244 char *outbuf = outbuf0;
245 size_t outbytesleft = sizeof(outbuf0);
247 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
/* A return value of (size_t)(-1) marks a failed conversion. */
248 YAZ_CHECK(r != (size_t) (-1));
249 if (r == (size_t) (-1))
/* The expected UTF-8 text is measured with strlen. */
251 ret = compare_buffers("tsticonv 32", i,
252 strlen(utf8_c[i]), utf8_c[i],
253 outbuf - outbuf0, outbuf0);
/*
 * dconvert: round-trip test.  Every string of iso_8859_1_a is converted
 * from ISO-8859-1 to the encoding named by tmpcode and back again; the
 * final result is compared with the original string.
 *
 * mandatory: when non-zero, failing to open a converter for tmpcode counts
 *            as a test failure (see the YAZ_CHECK(cd || !mandatory) lines).
 * tmpcode:   name of the intermediate encoding.
 * NOTE(review): braces, local declarations and the statements that follow
 * the failure checks are not visible in this view.
 */
259 static void dconvert(int mandatory, const char *tmpcode)
/* The loop ends at the first null entry of iso_8859_1_a. */
264 for (i = 0; iso_8859_1_a[i]; i++)
267 char *inbuf = (char*) iso_8859_1_a[i];
268 size_t inbytesleft = strlen(inbuf);
271 char *outbuf = outbuf0;
272 size_t outbytesleft = sizeof(outbuf0);
/* First pass: ISO-8859-1 to tmpcode, output placed in outbuf0. */
274 cd = yaz_iconv_open(tmpcode, "ISO-8859-1");
275 YAZ_CHECK(cd || !mandatory);
278 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
279 YAZ_CHECK(r != (size_t) (-1));
281 if (r == (size_t) (-1))
/* Second pass: tmpcode back to ISO-8859-1. */
284 cd = yaz_iconv_open("ISO-8859-1", tmpcode);
285 YAZ_CHECK(cd || !mandatory);
/* Number of bytes the first pass wrote into outbuf0. */
289 inbytesleft = sizeof(outbuf0) - outbytesleft;
292 outbytesleft = sizeof(outbuf1);
293 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
294 YAZ_CHECK(r != (size_t) (-1));
/* The buffers are compared only when the second pass succeeded. */
295 if (r != (size_t)(-1))
/* sizeof(outbuf1) - outbytesleft is the number of bytes written to outbuf1. */
297 ret = compare_buffers("dconvert", i,
298 strlen(iso_8859_1_a[i]), iso_8859_1_a[i],
299 sizeof(outbuf1) - outbytesleft, outbuf1);
/*
 * utf8_check: round-trip check between UCS4LE and UTF-8.  Four input bytes
 * are converted to UTF-8, the UTF-8 bytes are converted back to UCS4LE,
 * and the four resulting bytes are compared with the original ones.
 * NOTE(review): the declarations of src, dst and utf8buf, the body of the
 * 4-iteration loop and all return statements are not visible in this view.
 * Presumably the loop stores c into src one byte at a time and the
 * function returns non-zero on success (main wraps every call in
 * YAZ_CHECK) - confirm.
 * NOTE(review): unlike the other helpers in this file, this function is
 * not declared static.
 */
306 int utf8_check(unsigned c)
/* One UCS4 character is four bytes of input. */
315 size_t inbytesleft = 4;
316 char *outbuf = utf8buf;
317 size_t outbytesleft = sizeof(utf8buf);
/* Forward direction: UCS4LE to UTF-8. */
319 yaz_iconv_t cd = yaz_iconv_open("UTF-8", "UCS4LE");
322 for (i = 0; i<4; i++)
325 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
328 if (r == (size_t)(-1))
/* Reverse direction: UTF-8 back to UCS4LE. */
331 cd = yaz_iconv_open("UCS4LE", "UTF-8");
/* Number of UTF-8 bytes produced by the forward conversion. */
334 inbytesleft = sizeof(utf8buf) - outbytesleft;
340 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
341 if (r == (size_t)(-1))
/* Non-zero memcmp result: the round trip changed the four bytes. */
346 if (memcmp(src, dst, 4))
/*
 * tst_convert: runs buf through the already opened converter cd and checks
 * that the collected output equals cmpbuf (same length, same bytes).
 * Both buf and cmpbuf are measured with strlen, so they must be
 * NUL-terminated and cannot contain embedded zero bytes.
 * NOTE(review): braces, the declarations of outbuf and outp, any loop
 * around the conversion call and the return statements are not visible in
 * this view.  Callers wrap the result in YAZ_CHECK, so presumably non-zero
 * means success - confirm.
 */
352 static int tst_convert(yaz_iconv_t cd, const char *buf, const char *cmpbuf)
/* The converted output is collected in b through wrbuf_write. */
355 WRBUF b = wrbuf_alloc();
357 size_t inbytesleft = strlen(buf);
358 const char *inp = buf;
361 size_t outbytesleft = sizeof(outbuf);
/* The cast drops const from inp for the yaz_iconv call. */
363 size_t r = yaz_iconv(cd, (char**) &inp, &inbytesleft,
364 &outp, &outbytesleft);
365 if (r == (size_t) (-1))
/*
 * YAZ_ICONV_E2BIG is told apart from every other error here; the
 * statements that act on the distinction are not visible in this view.
 */
367 int e = yaz_iconv_error(cd);
368 if (e != YAZ_ICONV_E2BIG)
/* Append the outp - outbuf bytes currently held in outbuf to b. */
371 wrbuf_write(b, outbuf, outp - outbuf);
/* Success requires equal length and equal bytes. */
373 if (wrbuf_len(b) == strlen(cmpbuf)
374 && !memcmp(cmpbuf, wrbuf_buf(b), wrbuf_len(b)))
/* Logs the bytes that were produced (presumably only on a mismatch). */
377 yaz_log(YLOG_LOG, "GOT (%.*s)", wrbuf_len(b), wrbuf_buf(b));
/*
 * tst_conversion_marc8_to_latin1: feeds MARC-8 strings of different
 * lengths through tst_convert with a MARC8 to ISO-8859-1 converter.
 * In the last visible case the two-character input sequence shown as "âe"
 * is expected to come out as the single character "é".  The digit-prefixed
 * inputs move that sequence one position further on each call; presumably
 * this makes it land on different positions relative to the output buffer
 * of tst_convert - confirm.
 * NOTE(review): the expected strings of most calls are not visible in
 * this view.
 */
382 static void tst_conversion_marc8_to_latin1()
/* yaz_iconv_open takes the target encoding first, then the source. */
384 yaz_iconv_t cd = yaz_iconv_open("ISO-8859-1", "MARC8");
390 YAZ_CHECK(tst_convert(cd, "Cours de math",
392 YAZ_CHECK(tst_convert(cd, "Cours de mathâe",
394 YAZ_CHECK(tst_convert(cd, "12345678âe",
396 YAZ_CHECK(tst_convert(cd, "123456789âe",
398 YAZ_CHECK(tst_convert(cd, "1234567890âe",
400 YAZ_CHECK(tst_convert(cd, "12345678901âe",
402 YAZ_CHECK(tst_convert(cd, "Cours de mathâem",
404 YAZ_CHECK(tst_convert(cd, "Cours de mathâematiques",
405 "Cours de mathématiques"));
/*
 * tst_conversion_utf8_to_marc8: checks UTF-8 to MARC-8 conversion through
 * tst_convert, first with plain ASCII strings of increasing length and
 * then with combining characters and a character that needs an escape
 * sequence on the MARC-8 side.
 * NOTE(review): some lines of this function are not visible in this view.
 */
410 static void tst_conversion_utf8_to_marc8()
/* yaz_iconv_open takes the target encoding first, then the source. */
412 yaz_iconv_t cd = yaz_iconv_open("MARC8", "UTF-8");
418 YAZ_CHECK(tst_convert(cd, "Cours ", "Cours "));
420 /** Pure ASCII. 12 characters (outbuf belongs to tst_convert; its size is not visible in this view) */
421 YAZ_CHECK(tst_convert(cd, "Cours de mat", "Cours de mat"));
423 /** Pure ASCII. 13 characters (one more than the previous case) */
424 YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math"));
426 /** Pure ASCII. 14 characters (two more than the first length case) */
427 YAZ_CHECK(tst_convert(cd, "Cours de math.", "Cours de math."));
429 /** UPPERCASE SCANDINAVIAN O */
430 YAZ_CHECK(tst_convert(cd, "S\xc3\x98", "S\xa2"));
/*
 * "A" followed by the UTF-8 bytes CC 8A (U+030A).  In the expected MARC-8
 * output the mark byte EA comes before the base letter.
 */
433 YAZ_CHECK(tst_convert(cd, "A" "\xCC\x8A", "\xEA" "A"));
435 /** A MACRON + UMLAUT, DIAERESIS */
/* CC 84 is U+0304 and CC 88 is U+0308; the expected string is not visible here. */
436 YAZ_CHECK(tst_convert(cd, "A" "\xCC\x84" "\xCC\x88",
439 /* Ligature spanning two characters */
/* CD A1 is U+0361; the expected output puts EB before "t" and EC before "s". */
440 YAZ_CHECK(tst_convert(cd,
441 "\x74" "\xCD\xA1" "\x73", /* UTF-8 */
442 "\xEB\x74\xEC\x73")); /* MARC-8 */
444 /* Double tilde spanning two characters */
/* CD A0 is U+0360; the expected output puts FA before "t" and FB before "s". */
445 YAZ_CHECK(tst_convert(cd,
446 "\x74" "\xCD\xA0" "\x73", /* UTF-8 */
447 "\xFA\x74\xFB\x73")); /* MARC-8 */
449 /** Ideographic question mark (Unicode FF1F) */
/*
 * EF BC 9F is U+FF1F.  The expected output switches character set with the
 * escape "\033(1" and returns to ASCII with "\033(B" before the "o".
 */
450 YAZ_CHECK(tst_convert(cd,
451 "\xEF\xBC\x9F" "o", /* UTF-8 */
452 "\033(1" "\x21\x2B\x3B" "\033(B" "o" ));
/*
 * tst_conversion_latin1_to_marc8: checks ISO-8859-1 to MARC-8 conversion
 * through tst_convert with plain ASCII strings of increasing length and
 * one non-ASCII letter.
 * NOTE(review): some lines of this function are not visible in this view.
 */
458 static void tst_conversion_latin1_to_marc8()
/* yaz_iconv_open takes the target encoding first, then the source. */
460 yaz_iconv_t cd = yaz_iconv_open("MARC8", "ISO-8859-1");
466 YAZ_CHECK(tst_convert(cd, "Cours ", "Cours "));
468 /** Pure ASCII. 12 characters (outbuf belongs to tst_convert; its size is not visible in this view) */
469 YAZ_CHECK(tst_convert(cd, "Cours de mat", "Cours de mat"));
471 /** Pure ASCII. 13 characters (one more than the previous case) */
472 YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math"));
474 /** Pure ASCII. 14 characters (two more than the first length case) */
475 YAZ_CHECK(tst_convert(cd, "Cours de math.", "Cours de math."));
477 /** UPPERCASE SCANDINAVIAN O */
/* The expected MARC-8 byte for this letter is A2, as in marc8_a. */
478 YAZ_CHECK(tst_convert(cd, "SØ", "S\xa2"));
/*
 * Test driver: hands argc/argv to YAZ_CHECK_INIT and then runs the
 * conversion tests of this file one after the other.
 * NOTE(review): braces, the end of the function and possibly further calls
 * are not visible in this view.
 */
483 int main (int argc, char **argv)
485 YAZ_CHECK_INIT(argc, argv);
/* String conversions checked through tst_convert. */
487 tst_conversion_marc8_to_latin1();
489 tst_conversion_utf8_to_marc8();
491 tst_conversion_latin1_to_marc8();
/*
 * UCS4LE / UTF-8 round trips for single values of growing magnitude.
 * NOTE(review): 10000000 and 100000000 are larger than 0x10FFFF, the
 * highest Unicode code point, so these two checks depend on the converter
 * accepting values outside that range - confirm this is intended.
 */
493 YAZ_CHECK(utf8_check(3));
494 YAZ_CHECK(utf8_check(127));
495 YAZ_CHECK(utf8_check(128));
496 YAZ_CHECK(utf8_check(255));
497 YAZ_CHECK(utf8_check(256));
498 YAZ_CHECK(utf8_check(900));
499 YAZ_CHECK(utf8_check(1000));
500 YAZ_CHECK(utf8_check(10000));
501 YAZ_CHECK(utf8_check(100000));
502 YAZ_CHECK(utf8_check(1000000));
503 YAZ_CHECK(utf8_check(10000000));
504 YAZ_CHECK(utf8_check(100000000));
/*
 * Round trips through an intermediate encoding.  The first argument is
 * dconvert's mandatory flag: with 0 (CP865), failing to open the converter
 * is not counted as a test failure.
 */
506 dconvert(1, "UTF-8");
507 dconvert(1, "ISO-8859-1");
509 dconvert(1, "UCS4LE");
510 dconvert(0, "CP865");
/* Table-driven MARC-8 conversions. */
511 tst_marc8_to_iso_8859_1();
512 tst_marc8_to_ucs4b();
520 * indent-tabs-mode: nil
522 * vim: shiftwidth=4 tabstop=8 expandtab