2 * Copyright (C) 1995-2007, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.34 2007-03-09 08:39:38 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
17 * http://www.loc.gov/marc/specifications/speccharmarc8.html
37 #include <yaz/yaz-util.h>
/*
 * MARC-8 decoding tables, one function per character-set page.  They are
 * only declared here; the definitions live outside this file.  The note
 * inside yaz_read_marc8 says they are generated by charconv.tcl and
 * return 0 with no_read=1 when no sequence matches the input.
 *
 * yaz_read_marc8_comb picks one by the current escape mode:
 *   1: 'B' 'E' 's'   2: 'g'   3: 'b'   4: 'p'   5: '2'
 *   6: 'N' 'Q'       7: '3' '4'        8: 'S'   9: '1'
 */
39 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
40 size_t *no_read, int *combining);
41 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
42 size_t *no_read, int *combining);
43 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
44 size_t *no_read, int *combining);
45 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
46 size_t *no_read, int *combining);
47 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
48 size_t *no_read, int *combining);
49 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
50 size_t *no_read, int *combining);
51 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
52 size_t *no_read, int *combining);
53 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
54 size_t *no_read, int *combining);
55 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
56 size_t *no_read, int *combining);
/*
 * Tables for the reverse direction (the "r" variants), declared here and
 * defined outside this file.  lookup_marc8 calls them, 1 through 9, on
 * the UTF-8 encoding of a single character and records the escape
 * sequence of the page involved in *page_chr.
 */
59 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
60 size_t *no_read, int *combining);
61 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
62 size_t *no_read, int *combining);
63 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
64 size_t *no_read, int *combining);
65 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
66 size_t *no_read, int *combining);
67 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
68 size_t *no_read, int *combining);
69 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
70 size_t *no_read, int *combining);
71 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
72 size_t *no_read, int *combining);
73 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
74 size_t *no_read, int *combining);
75 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
76 size_t *no_read, int *combining);
/*
 * State of one conversion descriptor (yaz_iconv_t).
 * NOTE(review): several members used elsewhere in the file (my_errno,
 * iconv_cd, marc8_esc_mode, comb_offset, comb_size, no_read_x, ...) are
 * not visible in this listing.
 */
78 struct yaz_iconv_struct {
/* Optional handler run on the start of the input; yaz_iconv_open sets it
   only for UTF-8 sources (yaz_init_UTF8). */
81 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
82 size_t inbytesleft, size_t *no_read);
/* Decodes one character from the input; stores the number of bytes
   consumed in *no_read. */
83 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
84 size_t inbytesleft, size_t *no_read);
/* Encodes character x into *outbuf, advancing *outbuf and decreasing
   *outbytesleft. */
85 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
86 char **outbuf, size_t *outbytesleft,
/* Optional handler that emits pending output state; set for the MARC-8
   writers (yaz_flush_marc8). */
88 size_t (*flush_handle)(yaz_iconv_t cd,
89 char **outbuf, size_t *outbytesleft);
/* MARC-8 reader: buffered characters and the input byte count of each,
   filled in by yaz_read_marc8. */
94 unsigned long comb_x[8];
95 size_t comb_no_read[8];
97 unsigned long unget_x;
/* ISO-8859-1 writer: a character held back in case the next one combines
   with it (see yaz_write_ISO8859_1). */
101 unsigned long compose_char;
/* MARC-8 writer: pending combining bytes and how many are stored. */
103 unsigned long write_marc8_comb_ch[8];
104 size_t write_marc8_comb_no;
/* MARC-8 writer: byte appended by flush_combos after the combined
   character (0xEC or 0xFB), or 0 when none is pending. */
105 unsigned write_marc8_second_half_char;
/* MARC-8 writer: last character code, not yet written to the output. */
106 unsigned long write_marc8_last;
/* MARC-8 writer: escape sequence of the page currently selected in the
   output; reset to "\033(B" by yaz_iconv. */
107 const char *write_marc8_page_chr;
/*
 * latin1_comb: pairs of (base character x1, Unicode combining mark x2)
 * and the single ISO-8859-1 code y they are equivalent to.
 * Used in both directions: yaz_read_marc8s and yaz_write_ISO8859_1 fold a
 * pair into y, yaz_write_marc8 splits y back into x1 followed by x2.
 * Loops over the table stop at the first entry whose x1 is 0.
 * NOTE(review): the declaration header, the y member and the terminating
 * entry are not visible in this listing.
 */
111 unsigned long x1, x2;
114 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
115 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
116 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
117 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
118 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
119 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
120 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
121 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
122 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
123 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
124 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
125 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
126 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
127 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
128 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
129 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
130 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
131 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
132 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
133 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
134 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
135 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
136 /* omitted: 0xd7 MULTIPLICATION SIGN */
137 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
138 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
139 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
140 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
141 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
142 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
143 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
144 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
145 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
146 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
147 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
148 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
149 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
150 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
151 /* omitted: 0xe6 LATIN SMALL LETTER AE */
152 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
153 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
154 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
155 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
156 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
157 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
158 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
159 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
160 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
161 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
162 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
163 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
164 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
165 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
166 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
167 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
168 /* omitted: 0xf7 DIVISION SIGN */
169 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
170 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
171 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
172 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
173 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
174 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
175 /* omitted: 0xfe LATIN SMALL LETTER THORN */
176 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
/*
 * read_handle for ISO-8859-1 input: the character value is simply the
 * first input byte.
 * NOTE(review): the statements that set *no_read and return x are not
 * visible in this listing.
 */
181 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
182 size_t inbytesleft, size_t *no_read)
184 unsigned long x = inp[0];
/*
 * init_handle for UTF-8 input: examines the first input bytes for a
 * byte order mark.  Sets cd->my_errno to YAZ_ICONV_EINVAL on the error
 * path visible below.
 * NOTE(review): the test of inp[0], the length check guarding the EINVAL
 * assignment and the assignments to *no_read are not visible in this
 * listing; presumably a detected BOM is skipped via *no_read.
 */
190 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
191 size_t inbytesleft, size_t *no_read)
200 cd->my_errno = YAZ_ICONV_EINVAL;
/* The UTF-8 byte order mark is EF BB BF, so BOTH trailing bytes must
   match.  The former test (inp[1] != 0xbb) was inverted and could never
   be true for a real BOM. */
203 if (inp[1] == 0xbb && inp[2] == 0xbf)
/*
 * Decodes one UTF-8 sequence starting at inp[0] into a code point.
 * The lead byte selects the sequence length (1 to 6 bytes); the payload
 * bits of the lead byte and of each following byte (low 6 bits) are
 * shifted together into x.
 *
 * Errors are reported through *error:
 *   YAZ_ICONV_EILSEQ  lead byte cannot start a sequence (0x80..0xbf,
 *                     0xfe, 0xff); also set inside each multi-byte branch
 *                     (NOTE(review): the conditions guarding those
 *                     assignments are not visible in this listing).
 *   YAZ_ICONV_EINVAL  lead byte is acceptable but fewer bytes are left
 *                     than the sequence needs.
 */
210 unsigned long yaz_read_UTF8_char(unsigned char *inp,
211 size_t inbytesleft, size_t *no_read,
/* 0x80..0xbf and 0xfe/0xff are rejected as lead bytes */
221 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
224 *error = YAZ_ICONV_EILSEQ;
/* 2 bytes: 5 + 6 payload bits */
226 else if (inp[0] <= 0xdf && inbytesleft >= 2)
228 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
234 *error = YAZ_ICONV_EILSEQ;
/* 3 bytes: 4 + 6 + 6 payload bits */
237 else if (inp[0] <= 0xef && inbytesleft >= 3)
239 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
246 *error = YAZ_ICONV_EILSEQ;
/* 4 bytes: 3 + 3*6 payload bits */
249 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
251 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
252 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
258 *error = YAZ_ICONV_EILSEQ;
/* 5 bytes: 2 + 4*6 payload bits */
261 else if (inp[0] <= 0xfb && inbytesleft >= 5)
263 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
264 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
271 *error = YAZ_ICONV_EILSEQ;
/* 6 bytes: 1 + 5*6 payload bits */
274 else if (inp[0] <= 0xfd && inbytesleft >= 6)
276 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
277 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
278 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
284 *error = YAZ_ICONV_EILSEQ;
/* none of the branches above applied: the sequence is truncated */
290 *error = YAZ_ICONV_EINVAL;
/*
 * read_handle for UTF-8 input: forwards to yaz_read_UTF8_char, with
 * cd->my_errno receiving any error code.
 */
295 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
296 size_t inbytesleft, size_t *no_read)
298 return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
/*
 * read_handle for UCS-4 input, most significant byte first: combines
 * inp[0..3] into one code point.  Sets cd->my_errno to YAZ_ICONV_EINVAL
 * for incomplete input.
 * NOTE(review): the length check guarding the EINVAL assignment, the
 * *no_read update and the return are not visible in this listing.
 */
301 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
302 size_t inbytesleft, size_t *no_read)
308 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Widen each byte before shifting.  An unsigned char is promoted to int,
   so inp[0]<<24 with inp[0] >= 0x80 overflows int (undefined behaviour)
   and, where unsigned long is 64 bits, sign-extends into the upper half
   of x. */
313 x = ((unsigned long) inp[0]<<24) | ((unsigned long) inp[1]<<16) | ((unsigned long) inp[2]<<8) | inp[3];
/*
 * read_handle for UCS-4 input, least significant byte first: combines
 * inp[0..3] into one code point.  Sets cd->my_errno to YAZ_ICONV_EINVAL
 * for incomplete input.
 * NOTE(review): the length check guarding the EINVAL assignment, the
 * *no_read update and the return are not visible in this listing.
 */
319 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
320 size_t inbytesleft, size_t *no_read)
326 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Widen each byte before shifting.  An unsigned char is promoted to int,
   so inp[3]<<24 with inp[3] >= 0x80 overflows int (undefined behaviour)
   and, where unsigned long is 64 bits, sign-extends into the upper half
   of x. */
331 x = ((unsigned long) inp[3]<<24) | ((unsigned long) inp[2]<<16) | ((unsigned long) inp[1]<<8) | inp[0];
/*
 * read_handle for input made of native wchar_t values.
 * Needs sizeof(wchar_t) bytes of input; with fewer, cd->my_errno is set
 * to YAZ_ICONV_EINVAL.  On success *no_read is sizeof(wchar_t).
 */
338 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
339 size_t inbytesleft, size_t *no_read)
343 if (inbytesleft < sizeof(wchar_t))
345 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* copied byte-wise instead of dereferencing a cast pointer, so inp does
   not have to be aligned for wchar_t */
351 memcpy (&wch, inp, sizeof(wch));
353 *no_read = sizeof(wch);
/* Forward declaration: yaz_read_marc8 below calls this function, which is
   defined further down in the file. */
360 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
361 size_t inbytesleft, size_t *no_read,
/*
 * read_handle for MARC-8 input.
 *
 * Characters obtained from yaz_read_marc8_comb are buffered in
 * cd->comb_x[] together with their input byte counts in
 * cd->comb_no_read[] (at most 8 entries).  While cd->comb_offset is below
 * cd->comb_size a buffered entry is handed out instead of reading new
 * input.  Running out of input while entries are buffered sets
 * cd->my_errno to YAZ_ICONV_EINVAL.
 *
 * NOTE(review): presumably this buffers MARC-8 combining characters,
 * which come before their base character -- the statements that would
 * confirm the exact order are not visible in this listing.
 */
364 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
365 size_t inbytesleft, size_t *no_read)
368 if (cd->comb_offset < cd->comb_size)
370 *no_read = cd->comb_no_read[cd->comb_offset];
371 x = cd->comb_x[cd->comb_offset];
373 /* special case for double-diacritic combining characters,
374 INVERTED BREVE and DOUBLE TILDE.
375 We'll increment the no_read counter by 1, since we want to skip over
376 the processing of the closing ligature character
378 /* this code is no longer necessary.. our handlers code in
379 yaz_marc8_?_conv (generated by charconv.tcl) now returns
380 0 and no_read=1 when a sequence does not match the input.
381 The SECOND HALFs in codetables.xml produces a non-existant
382 entry in the conversion trie.. Hence when met, the input byte is
383 skipped as it should (in yaz_iconv)
386 if (x == 0x0361 || x == 0x0360)
394 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
397 if (inbytesleft == 0 && cd->comb_size)
399 cd->my_errno = YAZ_ICONV_EINVAL;
404 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
407 cd->comb_x[cd->comb_size] = x;
408 cd->comb_no_read[cd->comb_size] = *no_read;
410 inbytesleft = inbytesleft - *no_read;
/*
 * read_handle for "MARC8s" input: like yaz_read_marc8, but when exactly
 * one character is buffered (cd->comb_size == 1) and it pairs up with the
 * character just read according to latin1_comb, the two are replaced by
 * the single Latin-1 code and the buffered character's byte count is
 * added to *no_read.
 */
415 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
416 size_t inbytesleft, size_t *no_read)
418 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
419 if (x && cd->comb_size == 1)
421 /* For MARC8s we try to get a Latin-1 page code out of it */
423 for (i = 0; latin1_comb[i].x1; i++)
/* x is matched against the base (x1), the buffered one against the
   combining mark (x2) */
424 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
426 *no_read += cd->comb_no_read[0];
428 x = latin1_comb[i].y;
/*
 * Reads one MARC-8 character.
 *
 * Leading escape sequences (ESC = 27, any run of the intermediate
 * characters "(,$!)-", then one final byte) are consumed first; the final
 * byte is stored in cd->marc8_esc_mode and the bytes consumed are added
 * to *no_read.  The character itself is then decoded by the
 * yaz_marc8_N_conv table that belongs to the current escape mode, and the
 * bytes it consumed are added to *no_read as well.
 *
 * cd->my_errno is set to YAZ_ICONV_EINVAL when the input ends inside an
 * escape sequence and to YAZ_ICONV_EILSEQ for an escape mode without a
 * table.
 * NOTE(review): return statements and the case labels' break statements
 * are not visible in this listing.
 */
435 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
436 size_t inbytesleft, size_t *no_read,
440 while(inbytesleft >= 1 && inp[0] == 27)
442 size_t inbytesleft0 = inbytesleft;
/* strchr() also matches the string terminator, so a NUL input byte must
   be excluded explicitly or it would be taken for an intermediate
   character. */
445 while(inbytesleft > 0 && *inp != 0 && strchr("(,$!)-", *inp))
450 if (inbytesleft <= 0)
453 cd->my_errno = YAZ_ICONV_EINVAL;
/* final byte of the escape sequence selects the character set */
456 cd->marc8_esc_mode = *inp++;
458 (*no_read) += inbytesleft0 - inbytesleft;
460 if (inbytesleft <= 0)
465 size_t no_read_sub = 0;
468 switch(cd->marc8_esc_mode)
470 case 'B': /* Basic ASCII */
471 case 'E': /* ANSEL */
472 case 's': /* ASCII */
473 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
475 case 'g': /* Greek */
476 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
478 case 'b': /* Subscripts */
479 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
481 case 'p': /* Superscripts */
482 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
484 case '2': /* Basic Hebrew */
485 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
487 case 'N': /* Basic Cyrillic */
488 case 'Q': /* Extended Cyrillic */
489 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
491 case '3': /* Basic Arabic */
492 case '4': /* Extended Arabic */
493 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
495 case 'S': /* Greek */
496 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
498 case '1': /* Chinese, Japanese, Korean (EACC) */
499 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
/* escape mode without a conversion table */
503 cd->my_errno = YAZ_ICONV_EILSEQ;
506 *no_read += no_read_sub;
/*
 * write_handle for UTF-8 output: forwards to yaz_write_UTF8_char, with
 * cd->my_errno receiving any error code.
 */
511 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
512 char **outbuf, size_t *outbytesleft,
515 return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
/*
 * Encodes code point x as UTF-8 at *outbuf.
 * The shortest form that can hold x is chosen (1 to 6 bytes); *outbuf is
 * advanced past the bytes written and *outbytesleft reduced by the same
 * amount.  If the form needed does not fit in *outbytesleft, *error is
 * set to YAZ_ICONV_E2BIG.
 * NOTE(review): the return statements are not visible in this listing.
 */
518 size_t yaz_write_UTF8_char(unsigned long x,
519 char **outbuf, size_t *outbytesleft,
522 unsigned char *outp = (unsigned char *) *outbuf;
/* 1 byte: up to 7 bits */
524 if (x <= 0x7f && *outbytesleft >= 1)
526 *outp++ = (unsigned char) x;
/* 2 bytes: up to 11 bits */
529 else if (x <= 0x7ff && *outbytesleft >= 2)
531 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
532 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
533 (*outbytesleft) -= 2;
/* 3 bytes: up to 16 bits */
535 else if (x <= 0xffff && *outbytesleft >= 3)
537 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
538 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
539 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
540 (*outbytesleft) -= 3;
/* 4 bytes: up to 21 bits */
542 else if (x <= 0x1fffff && *outbytesleft >= 4)
544 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
545 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
546 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
547 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
548 (*outbytesleft) -= 4;
/* 5 bytes: up to 26 bits */
550 else if (x <= 0x3ffffff && *outbytesleft >= 5)
552 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
553 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
554 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
555 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
556 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
557 (*outbytesleft) -= 5;
/* 6 bytes: taken whenever at least 6 bytes of room remain and no branch
   above applied */
559 else if (*outbytesleft >= 6)
561 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
562 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
563 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
564 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
565 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
566 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
567 (*outbytesleft) -= 6;
571 *error = YAZ_ICONV_E2BIG; /* not room for output */
574 *outbuf = (char *) outp;
/*
 * write_handle for ISO-8859-1 output.
 *
 * A printable ASCII character (33..126) that is not the last one of the
 * input is not written at once but kept in cd->compose_char.  On the next
 * call it is looked up together with the new character x in latin1_comb:
 * a match replaces x by the combined Latin-1 code, otherwise the kept
 * character is written unchanged.  Characters outside 1..255 set
 * cd->my_errno to YAZ_ICONV_EILSEQ, lack of output room sets
 * YAZ_ICONV_E2BIG.
 * NOTE(review): return statements and some else branches are not visible
 * in this listing.
 */
579 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
580 char **outbuf, size_t *outbytesleft,
583 /* list of two char unicode sequence that, when combined, are
584 equivalent to single unicode chars that can be represented in
586 Regular iconv on Linux at least does not seem to convert these,
587 but since MARC-8 to UTF-8 generates these composed sequence
588 we get a better chance of a successful MARC-8 -> ISO-8859-1
590 unsigned char *outp = (unsigned char *) *outbuf;
592 if (cd->compose_char)
595 for (i = 0; latin1_comb[i].x1; i++)
596 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
598 x = latin1_comb[i].y;
601 if (*outbytesleft < 1)
602 { /* no room. Retain compose_char and bail out */
603 cd->my_errno = YAZ_ICONV_E2BIG;
606 if (!latin1_comb[i].x1)
607 { /* not found. Just write compose_char */
608 *outp++ = (unsigned char) cd->compose_char;
610 *outbuf = (char *) outp;
612 /* compose_char used so reset it. x now holds current char */
613 cd->compose_char = 0;
616 if (!last && x > 32 && x < 127 && cd->compose_char == 0)
618 cd->compose_char = x;
621 else if (x > 255 || x < 1)
623 cd->my_errno = YAZ_ICONV_EILSEQ;
626 else if (*outbytesleft < 1)
628 cd->my_errno = YAZ_ICONV_E2BIG;
631 *outp++ = (unsigned char) x;
633 *outbuf = (char *) outp;
/*
 * write_handle for UCS-4 output, most significant byte first.
 * Writes the low 32 bits of x as four bytes; with fewer than four bytes
 * of room cd->my_errno is set to YAZ_ICONV_E2BIG.
 */
638 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
639 char **outbuf, size_t *outbytesleft,
642 unsigned char *outp = (unsigned char *) *outbuf;
643 if (*outbytesleft >= 4)
645 *outp++ = (unsigned char) (x>>24);
646 *outp++ = (unsigned char) (x>>16);
647 *outp++ = (unsigned char) (x>>8);
648 *outp++ = (unsigned char) x;
649 (*outbytesleft) -= 4;
653 cd->my_errno = YAZ_ICONV_E2BIG;
656 *outbuf = (char *) outp;
/*
 * write_handle for UCS-4 output, least significant byte first.
 * Writes the low 32 bits of x as four bytes; with fewer than four bytes
 * of room cd->my_errno is set to YAZ_ICONV_E2BIG.
 */
660 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
661 char **outbuf, size_t *outbytesleft,
664 unsigned char *outp = (unsigned char *) *outbuf;
665 if (*outbytesleft >= 4)
667 *outp++ = (unsigned char) x;
668 *outp++ = (unsigned char) (x>>8);
669 *outp++ = (unsigned char) (x>>16);
670 *outp++ = (unsigned char) (x>>24);
671 (*outbytesleft) -= 4;
675 cd->my_errno = YAZ_ICONV_E2BIG;
678 *outbuf = (char *) outp;
/*
 * Looks up the MARC-8 code for Unicode character x.
 *
 * x is first encoded as UTF-8 into a local buffer, which is then handed
 * to the reverse tables yaz_marc8r_1_conv .. yaz_marc8r_9_conv.
 * *page_chr receives the escape sequence of the character set involved
 * and *comb is filled in by the table function.  cd->my_errno is set to
 * YAZ_ICONV_EILSEQ when the UTF-8 encoding fails and on the final
 * fall-through path.
 * NOTE(review): the conditions between the table calls, the *page_chr
 * assignments for tables 2-4 and the return statements are not visible in
 * this listing; presumably the first table giving a non-zero result wins.
 */
682 static unsigned long lookup_marc8(yaz_iconv_t cd,
683 unsigned long x, int *comb,
684 const char **page_chr)
687 char *utf8_outbuf = utf8_buf;
/* one byte is kept in reserve: strlen() is applied to the buffer below */
688 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
690 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
691 if (r == (size_t)(-1))
693 cd->my_errno = YAZ_ICONV_EILSEQ;
699 size_t inbytesleft, no_read_sub = 0;
703 inp = (unsigned char *) utf8_buf;
704 inbytesleft = strlen(utf8_buf);
706 x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
709 *page_chr = "\033(B";
712 x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
718 x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
724 x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
730 x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
733 *page_chr = "\033(2";
736 x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
739 *page_chr = "\033(N";
742 x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
745 *page_chr = "\033(3";
748 x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
751 *page_chr = "\033(S";
754 x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
757 *page_chr = "\033$1";
760 cd->my_errno = YAZ_ICONV_EILSEQ;
/*
 * Writes the pending MARC-8 character with its combining bytes.
 *
 * The bytes of cd->write_marc8_last (up to three, taken from bits 16-23,
 * 8-15 and 0-7) and the stored combining bytes are collected in a local
 * buffer and copied to *outbuf, followed by
 * cd->write_marc8_second_half_char if one is pending.  Afterwards the
 * pending state is cleared.  Returns (size_t)(-1) with cd->my_errno set
 * to YAZ_ICONV_E2BIG when the output does not fit.
 * NOTE(review): the conditions deciding which bytes are stored, and the
 * order in which base and combining bytes enter the buffer, are not
 * visible in this listing.
 */
765 static size_t flush_combos(yaz_iconv_t cd,
766 char **outbuf, size_t *outbytesleft)
768 unsigned long y = cd->write_marc8_last;
771 size_t i, out_no = 0;
776 byte = (unsigned char )((y>>16) & 0xff);
778 out_buf[out_no++] = byte;
779 byte = (unsigned char)((y>>8) & 0xff);
781 out_buf[out_no++] = byte;
782 byte = (unsigned char )(y & 0xff);
784 out_buf[out_no++] = byte;
/* the extra byte accounts for a possible second-half character */
786 if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
788 cd->my_errno = YAZ_ICONV_E2BIG;
789 return (size_t) (-1);
792 for (i = 0; i < cd->write_marc8_comb_no; i++)
794 /* all MARC-8 combined characters are simple bytes */
795 byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
799 memcpy(*outbuf, out_buf, out_no);
801 (*outbytesleft) -= out_no;
802 if (cd->write_marc8_second_half_char)
804 *(*outbuf)++ = cd->write_marc8_second_half_char;
/* nothing is pending any more */
808 cd->write_marc8_last = 0;
809 cd->write_marc8_comb_no = 0;
810 cd->write_marc8_second_half_char = 0;
/*
 * Writes one Unicode character x as MARC-8 (worker for yaz_write_marc8).
 *
 * x is translated with lookup_marc8.  Combining characters are stored in
 * cd->write_marc8_comb_ch[]; for some of them (0x0360 among them) a
 * second-half byte is remembered as well.  Other characters first flush
 * what is pending (flush_combos), emit an escape sequence when the
 * character set differs from cd->write_marc8_page_chr, and then become
 * the new pending character in cd->write_marc8_last.
 * Returns (size_t)(-1) on failure; lack of output room sets cd->my_errno
 * to YAZ_ICONV_E2BIG.
 * NOTE(review): several conditions, else branches and the final return
 * are not visible in this listing.
 */
814 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
815 char **outbuf, size_t *outbytesleft,
819 const char *page_chr = 0;
820 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
823 return (size_t) (-1);
828 cd->write_marc8_second_half_char = 0xEC;
829 else if (x == 0x0360)
830 cd->write_marc8_second_half_char = 0xFB;
/* NOTE(review): the array holds 8 entries but at most 6 are stored;
   further combining characters are not stored by this statement. */
832 if (cd->write_marc8_comb_no < 6)
833 cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
837 size_t r = flush_combos(cd, outbuf, outbytesleft);
838 const char *old_page_chr = cd->write_marc8_page_chr;
/* the character set changes: an escape sequence has to be written */
841 if (strcmp(page_chr, old_page_chr))
844 const char *page_out = page_chr;
846 if (*outbytesleft < 8)
848 cd->my_errno = YAZ_ICONV_E2BIG;
850 return (size_t) (-1);
852 cd->write_marc8_page_chr = page_chr;
/* the two-byte escapes ESC p, ESC g and ESC b get special treatment */
854 if (!strcmp(old_page_chr, "\033p")
855 || !strcmp(old_page_chr, "\033g")
856 || !strcmp(old_page_chr, "\033b"))
858 /* Technique 1 leave */
860 if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
862 /* Must leave script + enter new page */
863 plen = strlen(page_out);
864 memcpy(*outbuf, page_out, plen);
866 (*outbytesleft) -= plen;
870 plen = strlen(page_out);
871 memcpy(*outbuf, page_out, plen);
873 (*outbytesleft) -= plen;
/* y is written by a later flush_combos call */
875 cd->write_marc8_last = y;
879 size_t r = flush_combos(cd, outbuf, outbytesleft);
883 cd->write_marc8_comb_no--;
885 cd->write_marc8_last = 0;
/*
 * flush_handle for MARC-8 output: if the character set currently selected
 * in the output is not "\033(B", that three-byte escape sequence is
 * written.  Returns (size_t)(-1) with cd->my_errno set to
 * YAZ_ICONV_E2BIG when fewer than three bytes of room are left.
 * NOTE(review): the pointer/counter updates after the memcpy and the
 * final return are not visible in this listing.
 */
892 static size_t yaz_flush_marc8(yaz_iconv_t cd,
893 char **outbuf, size_t *outbytesleft)
895 if (strcmp(cd->write_marc8_page_chr, "\033(B"))
897 if (*outbytesleft < 3)
899 cd->my_errno = YAZ_ICONV_E2BIG;
900 return (size_t) (-1);
902 memcpy(*outbuf, "\033(B", 3);
/*
 * write_handle for MARC-8 output.
 *
 * A character that has an entry in latin1_comb (matched on y) is written
 * as its two parts, x1 and then x2, through yaz_write_marc8_2.  If that
 * fails for lack of room (YAZ_ICONV_E2BIG) the output position and the
 * pending character are put back to their values on entry.  Every other
 * character goes straight to yaz_write_marc8_2.
 * NOTE(review): the checks of r between the two writes, the restore of
 * *outbuf and the returns inside the loop are not visible in this
 * listing.
 */
909 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
910 char **outbuf, size_t *outbytesleft,
914 for (i = 0; latin1_comb[i].x1; i++)
916 if (x == latin1_comb[i].y)
919 /* save the output pointers .. */
920 char *outbuf0 = *outbuf;
921 size_t outbytesleft0 = *outbytesleft;
/* same type as the member it saves and restores; it used to be int,
   a narrowing conversion of an unsigned long */
922 unsigned long last_ch = cd->write_marc8_last;
924 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
925 outbuf, outbytesleft, 0);
928 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
929 outbuf, outbytesleft, last);
930 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
932 /* not enough room. reset output to original values */
934 *outbytesleft = outbytesleft0;
935 cd->write_marc8_last = last_ch;
940 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
/*
 * write_handle for output made of native wchar_t values.
 * Copies one wchar_t to *outbuf when at least sizeof(wchar_t) bytes of
 * room are left and reduces *outbytesleft accordingly; otherwise
 * cd->my_errno is set to YAZ_ICONV_E2BIG.
 * NOTE(review): the assignment of x to wch is not visible in this
 * listing.
 */
945 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
946 char **outbuf, size_t *outbytesleft,
949 unsigned char *outp = (unsigned char *) *outbuf;
951 if (*outbytesleft >= sizeof(wchar_t))
/* copied byte-wise, so the output position need not be aligned */
954 memcpy(outp, &wch, sizeof(wch));
956 (*outbytesleft) -= sizeof(wch);
960 cd->my_errno = YAZ_ICONV_E2BIG;
963 *outbuf = (char *) outp;
/*
 * Returns non-zero when both the read and the write handler of cd are
 * set, i.e. the conversion is done by the handlers in this file.
 */
968 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
970 return cd->read_handle && cd->write_handle;
/*
 * Creates a conversion descriptor for converting fromcode into tocode.
 *
 * The names are matched with yaz_matchstr against the encodings handled
 * in this file (UTF8, ISO88591, UCS4, UCS4LE, MARC8, MARC8s, WCHAR_T) to
 * select a read and a write handler.  If either handler is missing,
 * iconv_open is tried with the same names.
 * NOTE(review): the handling of a failed iconv_open, the conditional
 * compilation around it and the return statements are not visible in this
 * listing.
 */
973 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
975 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
977 cd->write_handle = 0;
980 cd->flush_handle = 0;
981 cd->my_errno = YAZ_ICONV_UNKNOWN;
983 /* a useful hack: if fromcode has leading @,
984 the library not use YAZ's own conversions .. */
985 if (fromcode[0] == '@')
/* source encoding */
989 if (!yaz_matchstr(fromcode, "UTF8"))
991 cd->read_handle = yaz_read_UTF8;
992 cd->init_handle = yaz_init_UTF8;
994 else if (!yaz_matchstr(fromcode, "ISO88591"))
995 cd->read_handle = yaz_read_ISO8859_1;
996 else if (!yaz_matchstr(fromcode, "UCS4"))
997 cd->read_handle = yaz_read_UCS4;
998 else if (!yaz_matchstr(fromcode, "UCS4LE"))
999 cd->read_handle = yaz_read_UCS4LE;
1000 else if (!yaz_matchstr(fromcode, "MARC8"))
1001 cd->read_handle = yaz_read_marc8;
1002 else if (!yaz_matchstr(fromcode, "MARC8s"))
1003 cd->read_handle = yaz_read_marc8s;
1005 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
1006 cd->read_handle = yaz_read_wchar_t;
/* target encoding */
1009 if (!yaz_matchstr(tocode, "UTF8"))
1010 cd->write_handle = yaz_write_UTF8;
1011 else if (!yaz_matchstr(tocode, "ISO88591"))
1012 cd->write_handle = yaz_write_ISO8859_1;
1013 else if (!yaz_matchstr (tocode, "UCS4"))
1014 cd->write_handle = yaz_write_UCS4;
1015 else if (!yaz_matchstr(tocode, "UCS4LE"))
1016 cd->write_handle = yaz_write_UCS4LE;
1017 else if (!yaz_matchstr(tocode, "MARC8"))
1019 cd->write_handle = yaz_write_marc8;
1020 cd->flush_handle = yaz_flush_marc8;
/* MARC8s output uses the same handlers as MARC8 */
1022 else if (!yaz_matchstr(tocode, "MARC8s"))
1024 cd->write_handle = yaz_write_marc8;
1025 cd->flush_handle = yaz_flush_marc8;
1028 else if (!yaz_matchstr(tocode, "WCHAR_T"))
1029 cd->write_handle = yaz_write_wchar_t;
/* not both directions covered here: try the system iconv */
1034 if (!cd->read_handle || !cd->write_handle)
1036 cd->iconv_cd = iconv_open (tocode, fromcode);
1037 if (cd->iconv_cd == (iconv_t) (-1))
1044 if (!cd->read_handle || !cd->write_handle)
/*
 * Converts characters from *inbuf to *outbuf, with an argument list like
 * that of iconv().
 *
 * Two paths are visible:
 *  - system iconv: the call is passed on to iconv() and a failure is
 *    translated, via yaz_errno(), into one of the YAZ_ICONV_* codes in
 *    cd->my_errno;
 *  - handlers of this file: conversion state in cd is reset, the optional
 *    init handler runs on the start of the input, and then characters are
 *    moved one at a time from cd->read_handle to cd->write_handle.  The
 *    last argument of the write handler is 1 when the character just read
 *    used up the input.  If the write fails with YAZ_ICONV_E2BIG, the
 *    byte count of the character is saved in cd->no_read_x.  The flush
 *    handler, if any, is called when an output buffer is supplied.
 * NOTE(review): the conditions selecting between the paths, those around
 * the state reset, the loop structure and the returns are not visible in
 * this listing.
 */
1054 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1055 char **outbuf, size_t *outbytesleft)
1064 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1065 if (r == (size_t)(-1))
1067 switch (yaz_errno())
1070 cd->my_errno = YAZ_ICONV_E2BIG;
1073 cd->my_errno = YAZ_ICONV_EINVAL;
1076 cd->my_errno = YAZ_ICONV_EILSEQ;
1079 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* reset of the conversion state kept in cd */
1091 cd->my_errno = YAZ_ICONV_UNKNOWN;
1092 cd->marc8_esc_mode = 'B';
1094 cd->comb_offset = cd->comb_size = 0;
1095 cd->compose_char = 0;
1097 cd->write_marc8_comb_no = 0;
1098 cd->write_marc8_second_half_char = 0;
1099 cd->write_marc8_last = 0;
1100 cd->write_marc8_page_chr = "\033(B";
1108 if (cd->init_handle && inbuf && *inbuf)
1111 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1112 *inbytesleft, &no_read);
1115 if (cd->my_errno == YAZ_ICONV_EINVAL)
1120 *inbytesleft -= no_read;
/* byte count saved by an earlier call whose write failed with E2BIG */
1134 no_read = cd->no_read_x;
1136 else if (inbuf && *inbuf)
1138 if (*inbytesleft == 0)
1140 r = *inbuf - inbuf0;
1143 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1154 if (cd->flush_handle && outbuf && *outbuf)
1155 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
1162 r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1163 (*inbytesleft - no_read) == 0 ? 1 : 0);
1166 /* unable to write it. save it because read_handle cannot
1168 if (cd->my_errno == YAZ_ICONV_E2BIG)
1171 cd->no_read_x = no_read;
1177 *inbytesleft -= no_read;
1178 (*inbuf) += no_read;
/* Returns the error code (one of the YAZ_ICONV_* values) currently stored
   in the descriptor. */
1183 int yaz_iconv_error (yaz_iconv_t cd)
1185 return cd->my_errno;
/*
 * Closes a conversion descriptor; the system iconv descriptor held in cd
 * is closed with iconv_close.
 * NOTE(review): the condition guarding iconv_close, the release of cd
 * itself and the return value are not visible in this listing.
 */
1188 int yaz_iconv_close (yaz_iconv_t cd)
1192 iconv_close (cd->iconv_cd);
1201 * indent-tabs-mode: nil
1203 * vim: shiftwidth=4 tabstop=8 expandtab