2 * Copyright (C) 1995-2005, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.11 2005-02-07 11:23:18 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
32 #include <yaz/yaz-util.h>
/*
 * Prototypes for the nine MARC-8 table converters, yaz_marc8_1_conv
 * through yaz_marc8_9_conv. They are only declared here; the definitions
 * are not in this file section.
 *
 * All nine share one signature:
 *   inp          input bytes to convert
 *   inbytesleft  number of bytes available at inp
 *   no_read      out: receives a byte count (callers below add it to their
 *                own consumed-bytes total)
 *   combining    out: flag; judging by the name and by its use in
 *                yaz_read_marc8, presumably set when the result is a
 *                combining character -- TODO confirm against the tables
 * The unsigned long result is used by callers as the converted character.
 */
34 unsigned long yaz_marc8_1_conv (unsigned char *inp, size_t inbytesleft,
35 size_t *no_read, int *combining);
36 unsigned long yaz_marc8_2_conv (unsigned char *inp, size_t inbytesleft,
37 size_t *no_read, int *combining);
38 unsigned long yaz_marc8_3_conv (unsigned char *inp, size_t inbytesleft,
39 size_t *no_read, int *combining);
40 unsigned long yaz_marc8_4_conv (unsigned char *inp, size_t inbytesleft,
41 size_t *no_read, int *combining);
42 unsigned long yaz_marc8_5_conv (unsigned char *inp, size_t inbytesleft,
43 size_t *no_read, int *combining);
44 unsigned long yaz_marc8_6_conv (unsigned char *inp, size_t inbytesleft,
45 size_t *no_read, int *combining);
46 unsigned long yaz_marc8_7_conv (unsigned char *inp, size_t inbytesleft,
47 size_t *no_read, int *combining);
48 unsigned long yaz_marc8_8_conv (unsigned char *inp, size_t inbytesleft,
49 size_t *no_read, int *combining);
50 unsigned long yaz_marc8_9_conv (unsigned char *inp, size_t inbytesleft,
51 size_t *no_read, int *combining);
/*
 * Conversion descriptor behind yaz_iconv_t.
 *
 * NOTE(review): only part of the member list is visible in this listing.
 * Code further down also uses my_errno, marc8_esc_mode, comb_offset,
 * comb_size, marc8_comb_x, no_read_x and iconv_cd, whose declarations are
 * not shown here.
 */
55 struct yaz_iconv_struct {
/* Called by yaz_iconv() on the start of the input; reports consumed
   bytes through no_read. */
58 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
59 size_t inbytesleft, size_t *no_read);
/* Decodes one character from inbuf and reports consumed bytes through
   no_read. */
60 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
61 size_t inbytesleft, size_t *no_read);
/* Encodes character x at *outbuf, updating *outbuf and *outbytesleft. */
62 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
63 char **outbuf, size_t *outbytesleft);
/* Queue of up to 8 decoded MARC-8 characters and the byte count recorded
   for each; filled and drained by the queue-based yaz_read_marc8. */
68 unsigned long comb_x[8];
69 size_t comb_no_read[8];
/* Byte count saved together with marc8_comb_x by the swap-based
   yaz_read_marc8. */
72 int marc8_comb_no_read;
/*
 * Read handler for ISO-8859-1 input: the character value is the first
 * input byte taken as-is.
 * NOTE(review): the rest of the body (setting *no_read, the return) is
 * not visible in this listing.
 */
81 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
82 size_t inbytesleft, size_t *no_read)
84 unsigned long x = inp[0];
/*
 * Init handler for UTF-8 input, installed by yaz_iconv_open together with
 * yaz_read_UTF8.
 *
 * Sets cd->my_errno to YAZ_ICONV_EINVAL or YAZ_ICONV_EILSEQ on failure.
 * NOTE(review): the conditions guarding the EINVAL case and the test on
 * inp[0] are not visible in this listing. The 0xbb/0xbf check below looks
 * like the tail of a byte-order-mark test (EF BB BF) -- confirm.
 */
89 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
90 size_t inbytesleft, size_t *no_read)
99 cd->my_errno = YAZ_ICONV_EINVAL;
/* second and third bytes must be BB BF, otherwise illegal sequence */
102 if (inp[1] != 0xbb || inp[2] != 0xbf)
104 cd->my_errno = YAZ_ICONV_EILSEQ;
/*
 * Read handler for UTF-8 input: decodes one character from inp.
 *
 * The branch is selected by the lead byte inp[0] and by how many bytes are
 * available; sequences of 2 to 6 bytes are handled in the visible code.
 * A lead byte in 0x80..0xbf or 0xfe..0xff is flagged YAZ_ICONV_EILSEQ.
 * Each multi-byte branch also has a YAZ_ICONV_EILSEQ assignment whose
 * guarding condition is not visible in this listing.
 * The final YAZ_ICONV_EINVAL is reached when no branch applies --
 * presumably when the input is too short for the lead byte; confirm.
 */
111 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
112 size_t inbytesleft, size_t *no_read)
/* continuation byte or 0xfe/0xff in lead position */
121 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
124 cd->my_errno = YAZ_ICONV_EILSEQ;
/* 2-byte sequence: 5 payload bits from the lead byte, 6 from the next */
126 else if (inp[0] <= 0xdf && inbytesleft >= 2)
128 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
134 cd->my_errno = YAZ_ICONV_EILSEQ;
/* 3-byte sequence */
137 else if (inp[0] <= 0xef && inbytesleft >= 3)
139 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
146 cd->my_errno = YAZ_ICONV_EILSEQ;
/* 4-byte sequence */
149 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
151 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
152 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
158 cd->my_errno = YAZ_ICONV_EILSEQ;
/* 5-byte sequence */
161 else if (inp[0] <= 0xfb && inbytesleft >= 5)
163 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
164 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
171 cd->my_errno = YAZ_ICONV_EILSEQ;
/* 6-byte sequence */
174 else if (inp[0] <= 0xfd && inbytesleft >= 6)
176 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
177 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
178 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
184 cd->my_errno = YAZ_ICONV_EILSEQ;
190 cd->my_errno = YAZ_ICONV_EINVAL;
/*
 * Read handler for UCS-4 input with the most significant byte first:
 * assembles one character from inp[0..3].
 * Sets cd->my_errno to YAZ_ICONV_EINVAL on incomplete input (the guarding
 * length test is not visible in this listing).
 */
195 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
196 size_t inbytesleft, size_t *no_read)
202 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* NOTE(review): inp[0] is promoted to int before the shift, so a first
   byte >= 0x80 shifted left by 24 overflows a 32-bit signed int. */
207 x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
/*
 * Read handler for UCS-4 input with the least significant byte first:
 * assembles one character from inp[0..3], inp[3] being the top byte.
 * Sets cd->my_errno to YAZ_ICONV_EINVAL on incomplete input (the guarding
 * length test is not visible in this listing).
 */
213 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
214 size_t inbytesleft, size_t *no_read)
220 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* NOTE(review): inp[3] is promoted to int before the shift, so a top
   byte >= 0x80 shifted left by 24 overflows a 32-bit signed int. */
225 x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
/*
 * Read handler for input made of native wchar_t values.
 * Needs at least sizeof(wchar_t) input bytes, otherwise sets
 * cd->my_errno to YAZ_ICONV_EINVAL. On success *no_read is set to the
 * size of one wchar_t.
 */
232 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
233 size_t inbytesleft, size_t *no_read)
237 if (inbytesleft < sizeof(wchar_t))
239 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* copy the bytes into a local wchar_t rather than casting inp */
245 memcpy (&wch, inp, sizeof(wch));
247 *no_read = sizeof(wch);
/*
 * Forward declaration of yaz_read_marc8_comb, which is called by
 * yaz_read_marc8 before its definition appears.
 * NOTE(review): the end of this declaration is not visible in this
 * listing.
 */
255 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
256 size_t inbytesleft, size_t *no_read,
/*
 * Read handler for MARC-8 input, queue-based variant.
 *
 * If the queue (cd->comb_x / cd->comb_no_read) still holds entries, the
 * entry at cd->comb_offset supplies both the character and *no_read.
 * Otherwise up to 8 characters are decoded with yaz_read_marc8_comb and
 * stored in the queue together with their byte counts.
 *
 * NOTE(review): several statements of this function are not visible in
 * this listing, including the loop exit and the order in which queued
 * entries are handed out. A second function with the same name is defined
 * further down; presumably preprocessor conditionals that are not shown
 * select one of the two -- confirm.
 */
259 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
260 size_t inbytesleft, size_t *no_read)
/* serve a previously queued character first */
263 if (cd->comb_offset < cd->comb_size)
265 *no_read = cd->comb_no_read[cd->comb_offset];
266 x = cd->comb_x[cd->comb_offset];
/* queue empty: decode and store up to 8 characters */
272 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
275 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
278 cd->comb_x[cd->comb_size] = x;
279 cd->comb_no_read[cd->comb_size] = *no_read;
281 inbytesleft = inbytesleft - *no_read;
/*
 * Decodes one MARC-8 character and passes the caller's comb pointer on to
 * the table converter.
 *
 * Leading escape sequences are consumed first. Each starts with byte 27
 * (ASCII ESC), may be followed by bytes from the set "(,$!", and ends
 * with one byte that is stored in cd->marc8_esc_mode. The bytes consumed
 * are added to *no_read. Running out of input inside an escape sequence
 * sets cd->my_errno to YAZ_ICONV_EINVAL.
 *
 * The character is then converted by the yaz_marc8_N_conv function
 * selected by cd->marc8_esc_mode. YAZ_ICONV_EILSEQ is set on a path that
 * follows the listed cases, presumably the switch default -- confirm.
 * The converter's byte count is added to *no_read.
 */
286 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
287 size_t inbytesleft, size_t *no_read,
291 while(inbytesleft >= 1 && inp[0] == 27)
/* remember the count so the bytes consumed by this sequence can be
   derived afterwards */
293 size_t inbytesleft0 = inbytesleft;
/* NOTE(review): strchr also matches the string terminator, so a 0 byte
   in the input satisfies this test as well. */
296 while(inbytesleft > 0 && strchr("(,$!", *inp))
/* inbytesleft is a size_t, so "<= 0" is the same as "== 0" */
301 if (inbytesleft <= 0)
304 cd->my_errno = YAZ_ICONV_EINVAL;
307 cd->marc8_esc_mode = *inp++;
309 (*no_read) += inbytesleft0 - inbytesleft;
311 if (inbytesleft <= 0)
316 size_t no_read_sub = 0;
/* dispatch on the most recently selected character set */
319 switch(cd->marc8_esc_mode)
321 case 'B': /* Basic ASCII */
322 case 'E': /* ANSEL */
323 case 's': /* ASCII */
324 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
326 case 'g': /* Greek */
327 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
329 case 'b': /* Subscripts */
330 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
332 case 'p': /* Superscripts */
333 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
335 case '2': /* Basic Hebrew */
336 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
338 case 'N': /* Basic Cyrillic */
339 case 'Q': /* Extended Cyrillic */
340 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
342 case '3': /* Basic Arabic */
343 case '4': /* Extended Arabic */
344 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
346 case 'S': /* Greek */
347 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
349 case '1': /* Chinese, Japanese, Korean (EACC) */
350 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
354 cd->my_errno = YAZ_ICONV_EILSEQ;
/* total consumed = escape sequence bytes + converted character bytes */
357 *no_read += no_read_sub;
/*
 * Read handler for MARC-8 input, swap-based variant.
 *
 * A character held back by an earlier call (cd->marc8_comb_x) is returned
 * first, with the byte count that was saved alongside it.
 *
 * Otherwise leading escape sequences are consumed. Each starts with byte
 * 27 (ASCII ESC), may be followed by bytes from the set "(,$!", and ends
 * with one byte that is stored in cd->marc8_esc_mode. Running out of
 * input inside an escape sequence sets cd->my_errno to YAZ_ICONV_EINVAL.
 * The character is then converted by the yaz_marc8_N_conv function
 * selected by cd->marc8_esc_mode. YAZ_ICONV_EILSEQ is set on a path that
 * follows the listed cases, presumably the switch default -- confirm.
 *
 * When the converter sets comb and nothing is held back yet, the function
 * calls itself for the following character, holds the current one back in
 * cd->marc8_comb_x, and returns the following one first, so the two swap
 * places in the output.
 *
 * NOTE(review): this is the second definition of yaz_read_marc8 in the
 * file; presumably preprocessor conditionals that are not visible in this
 * listing select one of the two -- confirm.
 */
362 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
363 size_t inbytesleft, size_t *no_read)
/* a held-back character from the previous call goes out first */
365 if (cd->marc8_comb_x)
367 unsigned long x = cd->marc8_comb_x;
368 *no_read = cd->marc8_comb_no_read;
369 cd->marc8_comb_x = 0;
373 while(inbytesleft >= 1 && inp[0] == 27)
375 size_t inbytesleft0 = inbytesleft;
/* NOTE(review): strchr also matches the string terminator, so a 0 byte
   in the input satisfies this test as well. */
378 while(inbytesleft > 0 && strchr("(,$!", *inp))
/* inbytesleft is a size_t, so "<= 0" is the same as "== 0" */
383 if (inbytesleft <= 0)
386 cd->my_errno = YAZ_ICONV_EINVAL;
389 cd->marc8_esc_mode = *inp++;
391 (*no_read) += inbytesleft0 - inbytesleft;
393 if (inbytesleft <= 0)
399 size_t no_read_sub = 0;
/* dispatch on the most recently selected character set */
401 switch(cd->marc8_esc_mode)
403 case 'B': /* Basic ASCII */
404 case 'E': /* ANSEL */
405 case 's': /* ASCII */
406 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, &comb);
408 case 'g': /* Greek */
409 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, &comb);
411 case 'b': /* Subscripts */
412 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, &comb);
414 case 'p': /* Superscripts */
415 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, &comb);
417 case '2': /* Basic Hebrew */
418 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, &comb);
420 case 'N': /* Basic Cyrillic */
421 case 'Q': /* Extended Cyrillic */
422 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, &comb);
424 case '3': /* Basic Arabic */
425 case '4': /* Extended Arabic */
426 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, &comb);
428 case 'S': /* Greek */
429 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, &comb);
431 case '1': /* Chinese, Japanese, Korean (EACC) */
432 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, &comb);
436 cd->my_errno = YAZ_ICONV_EILSEQ;
/* NOTE(review): debug trace to stdout; whether it is compiled in cannot
   be determined from this listing. */
440 printf ("esc mode=%c x=%04lX comb=%d\n", cd->marc8_esc_mode, x, comb);
442 *no_read += no_read_sub;
/* the cd->marc8_comb_x == 0 test keeps the nested call below from
   holding back a second character */
444 if (comb && cd->marc8_comb_x == 0)
447 unsigned long next_x;
449 /* read next char .. */
450 next_x = yaz_read_marc8(cd, inp + *no_read,
451 inbytesleft - *no_read, &tmp_read);
452 /* save this x for later .. */
453 cd->marc8_comb_x = x;
454 /* save next read for later .. */
455 cd->marc8_comb_no_read = tmp_read;
456 /* return next x - thereby swap */
/*
 * Write handler for UTF-8 output: encodes character x at *outbuf.
 *
 * The encoding length (1 to 6 bytes) is chosen by the magnitude of x, and
 * each branch also requires that many bytes of room in *outbytesleft.
 * If no branch applies, cd->my_errno is set to YAZ_ICONV_E2BIG.
 * *outbuf is advanced past the bytes written, and *outbytesleft is
 * reduced by the same amount.
 */
464 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
465 char **outbuf, size_t *outbytesleft)
467 unsigned char *outp = (unsigned char *) *outbuf;
/* 1 byte: values up to 0x7f are written unchanged */
468 if (x <= 0x7f && *outbytesleft >= 1)
470 *outp++ = (unsigned char) x;
/* 2 bytes: lead byte 110xxxxx followed by one continuation byte */
473 else if (x <= 0x7ff && *outbytesleft >= 2)
475 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
476 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
477 (*outbytesleft) -= 2;
/* 3 bytes: lead byte 1110xxxx */
479 else if (x <= 0xffff && *outbytesleft >= 3)
481 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
482 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
483 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
484 (*outbytesleft) -= 3;
/* 4 bytes: lead byte 11110xxx */
486 else if (x <= 0x1fffff && *outbytesleft >= 4)
488 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
489 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
490 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
491 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
492 (*outbytesleft) -= 4;
/* 5 bytes: lead byte 111110xx */
494 else if (x <= 0x3ffffff && *outbytesleft >= 5)
496 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
497 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
498 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
499 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
500 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
501 (*outbytesleft) -= 5;
/* 6 bytes: lead byte 1111110x; no upper bound is checked on x here */
503 else if (*outbytesleft >= 6)
505 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
506 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
507 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
508 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
509 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
510 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
511 (*outbytesleft) -= 6;
515 cd->my_errno = YAZ_ICONV_E2BIG; /* not room for output */
/* publish the advanced write position to the caller */
518 *outbuf = (char *) outp;
/*
 * Write handler for ISO-8859-1 output: writes x as a single byte.
 *
 * Values outside 1..255 cannot be written and set cd->my_errno to
 * YAZ_ICONV_EILSEQ. Note that 0 is rejected as well. With no room left in
 * the output, cd->my_errno is set to YAZ_ICONV_E2BIG.
 */
522 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
523 char **outbuf, size_t *outbytesleft)
525 unsigned char *outp = (unsigned char *) *outbuf;
526 if (x > 255 || x < 1)
528 cd->my_errno = YAZ_ICONV_EILSEQ;
531 else if (*outbytesleft >= 1)
533 *outp++ = (unsigned char) x;
538 cd->my_errno = YAZ_ICONV_E2BIG;
/* publish the advanced write position to the caller */
541 *outbuf = (char *) outp;
/*
 * Write handler for UCS-4 output: writes x as 4 bytes, most significant
 * byte first. With fewer than 4 bytes of room, cd->my_errno is set to
 * YAZ_ICONV_E2BIG.
 */
546 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
547 char **outbuf, size_t *outbytesleft)
549 unsigned char *outp = (unsigned char *) *outbuf;
550 if (*outbytesleft >= 4)
552 *outp++ = (unsigned char) (x>>24);
553 *outp++ = (unsigned char) (x>>16);
554 *outp++ = (unsigned char) (x>>8);
555 *outp++ = (unsigned char) x;
556 (*outbytesleft) -= 4;
560 cd->my_errno = YAZ_ICONV_E2BIG;
/* publish the advanced write position to the caller */
563 *outbuf = (char *) outp;
/*
 * Write handler for UCS-4 output: writes x as 4 bytes, least significant
 * byte first. With fewer than 4 bytes of room, cd->my_errno is set to
 * YAZ_ICONV_E2BIG.
 */
567 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
568 char **outbuf, size_t *outbytesleft)
570 unsigned char *outp = (unsigned char *) *outbuf;
571 if (*outbytesleft >= 4)
573 *outp++ = (unsigned char) x;
574 *outp++ = (unsigned char) (x>>8);
575 *outp++ = (unsigned char) (x>>16);
576 *outp++ = (unsigned char) (x>>24);
577 (*outbytesleft) -= 4;
581 cd->my_errno = YAZ_ICONV_E2BIG;
/* publish the advanced write position to the caller */
584 *outbuf = (char *) outp;
/*
 * Write handler for output made of native wchar_t values: copies one
 * wchar_t into the output when at least sizeof(wchar_t) bytes of room are
 * available, otherwise sets cd->my_errno to YAZ_ICONV_E2BIG.
 * NOTE(review): the statement that assigns wch from x is not visible in
 * this listing.
 */
589 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
590 char **outbuf, size_t *outbytesleft)
592 unsigned char *outp = (unsigned char *) *outbuf;
594 if (*outbytesleft >= sizeof(wchar_t))
/* copy byte-wise rather than storing through a cast pointer */
597 memcpy(outp, &wch, sizeof(wch));
599 (*outbytesleft) -= sizeof(wch);
603 cd->my_errno = YAZ_ICONV_E2BIG;
606 *outbuf = (char *) outp;
/*
 * Tells whether the descriptor has both a read handler and a write
 * handler installed, i.e. whether the conversion is done by the handlers
 * in this file.
 * Returns non-zero if both are set, 0 otherwise.
 */
611 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
613 return cd->read_handle && cd->write_handle;
/*
 * Creates a conversion descriptor for converting from fromcode to tocode.
 *
 * The descriptor is allocated with xmalloc and its state is initialised.
 * The initial MARC-8 escape mode is 'B', the combining queue is empty and
 * no character is held back.
 *
 * fromcode is matched with yaz_matchstr against UTF8, ISO88591, UCS4,
 * UCS4LE, MARC8 and WCHAR_T to choose a read handler. tocode is matched
 * against UTF8, ISO88591, UCS4, UCS4LE and WCHAR_T to choose a write
 * handler. There is no MARC8 write handler in the visible code.
 *
 * If either handler is missing, iconv_open(tocode, fromcode) is tried.
 * NOTE(review): the failure paths and the return statements are not
 * visible in this listing.
 */
616 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
618 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
620 cd->write_handle = 0;
623 cd->my_errno = YAZ_ICONV_UNKNOWN;
624 cd->marc8_esc_mode = 'B';
626 cd->comb_offset = cd->comb_size = 0;
628 cd->marc8_comb_x = 0;
631 /* a useful hack: if fromcode has a leading @,
632 the library will not use YAZ's own conversions .. */
633 if (fromcode[0] == '@')
/* select the built-in read handler for the source encoding */
637 if (!yaz_matchstr(fromcode, "UTF8"))
639 cd->read_handle = yaz_read_UTF8;
640 cd->init_handle = yaz_init_UTF8;
642 else if (!yaz_matchstr(fromcode, "ISO88591"))
643 cd->read_handle = yaz_read_ISO8859_1;
644 else if (!yaz_matchstr(fromcode, "UCS4"))
645 cd->read_handle = yaz_read_UCS4;
646 else if (!yaz_matchstr(fromcode, "UCS4LE"))
647 cd->read_handle = yaz_read_UCS4LE;
648 else if (!yaz_matchstr(fromcode, "MARC8"))
649 cd->read_handle = yaz_read_marc8;
651 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
652 cd->read_handle = yaz_read_wchar_t;
/* select the built-in write handler for the target encoding */
655 if (!yaz_matchstr(tocode, "UTF8"))
656 cd->write_handle = yaz_write_UTF8;
657 else if (!yaz_matchstr(tocode, "ISO88591"))
658 cd->write_handle = yaz_write_ISO8859_1;
659 else if (!yaz_matchstr (tocode, "UCS4"))
660 cd->write_handle = yaz_write_UCS4;
661 else if (!yaz_matchstr(tocode, "UCS4LE"))
662 cd->write_handle = yaz_write_UCS4LE;
664 else if (!yaz_matchstr(tocode, "WCHAR_T"))
665 cd->write_handle = yaz_write_wchar_t;
/* no complete built-in pair: fall back to the system iconv */
670 if (!cd->read_handle || !cd->write_handle)
672 cd->iconv_cd = iconv_open (tocode, fromcode);
/* iconv_open signals failure with (iconv_t) -1 */
673 if (cd->iconv_cd == (iconv_t) (-1))
680 if (!cd->read_handle || !cd->write_handle)
/*
 * Converts input at *inbuf to output at *outbuf; the parameters mirror
 * those of iconv().
 *
 * Visible behaviour:
 *  - One path hands the call to the system iconv() on cd->iconv_cd. When
 *    that returns (size_t) -1, cd->my_errno is set to one of
 *    YAZ_ICONV_E2BIG, YAZ_ICONV_EINVAL, YAZ_ICONV_EILSEQ or
 *    YAZ_ICONV_UNKNOWN. The selection logic is not visible in this
 *    listing; presumably it switches on errno -- confirm.
 *  - A null inbuf or *inbuf leads to cd->my_errno being set to
 *    YAZ_ICONV_UNKNOWN.
 *  - cd->init_handle is invoked on the input and the bytes it reports are
 *    subtracted from *inbytesleft; the guard deciding when this happens
 *    is not visible.
 *  - Characters are then decoded with cd->read_handle and encoded with
 *    cd->write_handle. When a character cannot be written, its byte count
 *    is kept in cd->no_read_x and picked up again on a later pass.
 *
 * NOTE(review): loop structure, return values and several conditions are
 * not visible in this listing.
 */
690 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
691 char **outbuf, size_t *outbytesleft)
699 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
700 if (r == (size_t)(-1))
705 cd->my_errno = YAZ_ICONV_E2BIG;
708 cd->my_errno = YAZ_ICONV_EINVAL;
711 cd->my_errno = YAZ_ICONV_EILSEQ;
714 cd->my_errno = YAZ_ICONV_UNKNOWN;
720 if (inbuf == 0 || *inbuf == 0)
723 cd->my_errno = YAZ_ICONV_UNKNOWN;
733 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
734 *inbytesleft, &no_read);
737 if (cd->my_errno == YAZ_ICONV_EINVAL)
742 *inbytesleft -= no_read;
754 if (*inbytesleft == 0)
761 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
772 no_read = cd->no_read_x;
776 r = (cd->write_handle)(cd, x, outbuf, outbytesleft);
779 /* unable to write it. save it because read_handle cannot
782 cd->no_read_x = no_read;
787 *inbytesleft -= no_read;
/*
 * Error accessor for a conversion descriptor.
 * NOTE(review): the body is not visible in this listing; presumably it
 * returns cd->my_errno (one of the YAZ_ICONV_* codes) -- confirm.
 */
793 int yaz_iconv_error (yaz_iconv_t cd)
/*
 * Closes a conversion descriptor. The visible code closes the underlying
 * system iconv descriptor with iconv_close.
 * NOTE(review): the guard around that call, the release of cd itself and
 * the return value are not visible in this listing.
 */
798 int yaz_iconv_close (yaz_iconv_t cd)
802 iconv_close (cd->iconv_cd);