2 * Copyright (C) 1995-2006, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.27 2006-08-28 12:34:41 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
17 * http://www.loc.gov/marc/specifications/speccharmarc8.html
37 #include <yaz/yaz-util.h>
/* Prototypes for the table converters used when reading MARC-8.
   yaz_read_marc8_comb() calls one of them according to the current
   escape mode.  Each takes the input pointer and the number of bytes
   available and returns an unsigned long character code; the last two
   arguments are out-parameters named no_read and combining. */
39 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
40 size_t *no_read, int *combining);
41 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
42 size_t *no_read, int *combining);
43 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
44 size_t *no_read, int *combining);
45 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
46 size_t *no_read, int *combining);
47 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
48 size_t *no_read, int *combining);
49 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
50 size_t *no_read, int *combining);
51 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
52 size_t *no_read, int *combining);
53 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
54 size_t *no_read, int *combining);
55 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
56 size_t *no_read, int *combining);
/* Prototypes for the reverse converters (note the r in the name).
   lookup_marc8() calls them, one page after another, when writing
   MARC-8.  The signature is the same as for the converters above. */
59 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
60 size_t *no_read, int *combining);
61 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
62 size_t *no_read, int *combining);
63 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
64 size_t *no_read, int *combining);
65 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
66 size_t *no_read, int *combining);
67 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
68 size_t *no_read, int *combining);
69 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
70 size_t *no_read, int *combining);
71 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
72 size_t *no_read, int *combining);
73 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
74 size_t *no_read, int *combining);
75 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
76 size_t *no_read, int *combining);
/* Per-handle conversion state.  A yaz_iconv_t value is dereferenced as
   this structure by the functions in this file. */
78 struct yaz_iconv_struct {
/* Optional hook; when set, yaz_iconv() calls it on the input and then
   reduces the remaining input count by the no_read value it reports. */
81 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
82 size_t inbytesleft, size_t *no_read);
/* Decodes one character from the input and reports the bytes used. */
83 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
84 size_t inbytesleft, size_t *no_read);
/* Encodes character x into the output buffer.  yaz_iconv() passes a
   final flag that is 1 when x consumes the last of the input. */
85 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
86 char **outbuf, size_t *outbytesleft,
/* Queue filled by yaz_read_marc8(): up to 8 characters and, for each,
   the number of input bytes it occupied. */
92 unsigned long comb_x[8];
93 size_t comb_no_read[8];
/* NOTE(review): presumably the character kept back when a write fails
   for lack of room; its use is not visible next to this field -
   confirm. */
95 unsigned long unget_x;
/* Base letter held back by yaz_write_ISO8859_1() until the following
   character is known; 0 means nothing is pending. */
99 unsigned long compose_char;
/* MARC-8 writer state: pending combining characters and their count,
   the pending base character, and the escape sequence of the page
   currently selected. */
101 unsigned long write_marc8_comb_ch[8];
102 size_t write_marc8_comb_no;
103 unsigned long write_marc8_last;
104 const char *write_marc8_page_chr;
/* latin1_comb: each entry is {x1, x2, y} where x1 is an ASCII base
   letter, x2 is a Unicode combining mark (0x03xx) and y is the
   equivalent precomposed ISO-8859-1 code.  The loops that scan this
   table stop at the first entry whose x1 is 0. */
108 unsigned long x1, x2;
111 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
112 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
113 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
114 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
115 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
116 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
117 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
118 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
119 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
120 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
121 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
122 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
123 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
124 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
125 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
126 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
127 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
128 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
129 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
130 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
131 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
132 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
133 /* omitted: 0xd7 MULTIPLICATION SIGN */
134 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
135 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
136 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
137 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
138 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
139 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
140 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
141 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
142 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
143 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
144 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
145 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
146 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
147 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
148 /* omitted: 0xe6 LATIN SMALL LETTER AE */
149 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
150 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
151 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
152 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
153 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
154 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
155 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
156 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
157 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
158 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
159 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
160 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
161 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
162 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
163 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
164 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
165 /* omitted: 0xf7 DIVISION SIGN */
166 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
167 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
168 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
169 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
170 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
171 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
172 /* omitted: 0xfe LATIN SMALL LETTER THORN */
173 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
/* Read handler for ISO-8859-1 input: the character code is simply the
   first input byte, inp[0]. */
178 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
179 size_t inbytesleft, size_t *no_read)
181 unsigned long x = inp[0];
/* Init handler for MARC-8.  Resets the reader state (escape mode B,
   empty combining queue, no compose character) and the writer state
   (no pending combining characters, no pending base character, current
   page ESC ( B). */
186 static size_t yaz_init_marc8(yaz_iconv_t cd, unsigned char *inp,
187 size_t inbytesleft, size_t *no_read)
189 cd->marc8_esc_mode = 'B';
191 cd->comb_offset = cd->comb_size = 0;
192 cd->compose_char = 0;
194 cd->write_marc8_comb_no = 0;
195 cd->write_marc8_last = 0;
196 cd->write_marc8_page_chr = "\033(B";
/* Init handler for UTF-8 input.  It can fail with YAZ_ICONV_EINVAL and
   it inspects inp[1] and inp[2] against 0xbb and 0xbf, the second and
   third bytes of the UTF-8 byte order mark (EF BB BF). */
201 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
202 size_t inbytesleft, size_t *no_read)
211 cd->my_errno = YAZ_ICONV_EINVAL;
/* NOTE(review): a byte order mark has inp[1] equal to 0xbb, so the
   != below looks inverted (a real BOM would not satisfy it) - confirm
   and consider == instead. */
214 if (inp[1] != 0xbb && inp[2] == 0xbf)
/* Decodes one UTF-8 sequence starting at inp and returns its code.
   The branch is chosen from the lead byte and requires enough input
   bytes: 2 bytes up to 0xdf, 3 up to 0xef, 4 up to 0xf7, 5 up to 0xfb
   and 6 up to 0xfd.  Problems are reported through *error:
   YAZ_ICONV_EILSEQ for an invalid lead byte (and on a further path in
   each multi-byte branch), YAZ_ICONV_EINVAL when no branch applies. */
221 unsigned long yaz_read_UTF8_char(unsigned char *inp,
222 size_t inbytesleft, size_t *no_read,
/* lead bytes up to 0xbf here, or 0xfe and 0xff, cannot start a
   sequence */
232 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
235 *error = YAZ_ICONV_EILSEQ;
237 else if (inp[0] <= 0xdf && inbytesleft >= 2)
239 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
245 *error = YAZ_ICONV_EILSEQ;
248 else if (inp[0] <= 0xef && inbytesleft >= 3)
250 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
257 *error = YAZ_ICONV_EILSEQ;
260 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
262 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
263 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
269 *error = YAZ_ICONV_EILSEQ;
272 else if (inp[0] <= 0xfb && inbytesleft >= 5)
274 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
275 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
282 *error = YAZ_ICONV_EILSEQ;
285 else if (inp[0] <= 0xfd && inbytesleft >= 6)
287 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
288 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
289 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
295 *error = YAZ_ICONV_EILSEQ;
/* reached when the lead byte is valid but too few bytes are left */
301 *error = YAZ_ICONV_EINVAL;
/* Read handler for UTF-8: forwards to yaz_read_UTF8_char() and lets it
   store any error code directly in cd->my_errno. */
306 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
307 size_t inbytesleft, size_t *no_read)
309 return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
/* Read handler for big-endian UCS-4: the code is assembled from four
   bytes, most significant first.  Incomplete input is reported as
   YAZ_ICONV_EINVAL. */
312 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
313 size_t inbytesleft, size_t *no_read)
319 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* NOTE(review): inp[0] is promoted to int before the shift, so a byte
   of 0x80 or more shifts into the sign bit and may sign-extend when
   widened to unsigned long - consider casting to unsigned long first. */
324 x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
/* Read handler for little-endian UCS-4: the code is assembled from four
   bytes, least significant first.  Incomplete input is reported as
   YAZ_ICONV_EINVAL. */
330 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
331 size_t inbytesleft, size_t *no_read)
337 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* NOTE(review): inp[3] is promoted to int before the shift by 24, so a
   byte of 0x80 or more shifts into the sign bit - consider casting to
   unsigned long first. */
342 x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
/* Read handler for wchar_t input.  Fewer than sizeof(wchar_t) bytes of
   input is reported as YAZ_ICONV_EINVAL; otherwise one wchar_t is
   copied out of the input and sizeof(wch) bytes are counted as read. */
349 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
350 size_t inbytesleft, size_t *no_read)
354 if (inbytesleft < sizeof(wchar_t))
356 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* memcpy rather than a pointer cast: inp is a byte pointer with no
   alignment guarantee for wchar_t */
362 memcpy (&wch, inp, sizeof(wch));
364 *no_read = sizeof(wch);
/* Forward declaration: yaz_read_marc8() below calls this function
   before its definition appears. */
371 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
372 size_t inbytesleft, size_t *no_read,
/* Read handler for MARC-8.  While characters are queued in cd->comb_x
   (comb_offset < comb_size) the queued character and its recorded byte
   count are used.  Otherwise the queue is refilled, up to 8 entries, by
   repeated calls to yaz_read_marc8_comb(); each result is stored with
   the number of bytes it took and the remaining input is reduced by
   that amount. */
375 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
376 size_t inbytesleft, size_t *no_read)
379 if (cd->comb_offset < cd->comb_size)
381 *no_read = cd->comb_no_read[cd->comb_offset];
382 x = cd->comb_x[cd->comb_offset];
384 /* special case for double-diacritic combining characters,
385 INVERTED BREVE and DOUBLE TILDE.
386 We'll increment the no_read counter by 1, since we want to skip over
387 the processing of the closing ligature character
389 /* this code is no longer necessary.. our handlers code in
390 yaz_marc8_?_conv (generated by charconv.tcl) now returns
391 0 and no_read=1 when a sequence does not match the input.
392 The SECOND HALFs in codetables.xml produces a non-existant
393 entry in the conversion trie.. Hence when met, the input byte is
394 skipped as it should (in yaz_iconv)
397 if (x == 0x0361 || x == 0x0360)
405 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
408 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
411 cd->comb_x[cd->comb_size] = x;
412 cd->comb_no_read[cd->comb_size] = *no_read;
414 inbytesleft = inbytesleft - *no_read;
419 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
420 size_t inbytesleft, size_t *no_read)
422 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
423 if (x && cd->comb_size == 1)
425 /* For MARC8s we try to get a Latin-1 page code out of it.
   This read handler starts from the result of yaz_read_marc8().  When
   that result is non-zero and exactly one character is queued, the
   pair is looked up in latin1_comb with the result as x1 and the
   queued character as x2.  On a match the queued character is
   consumed as well (its byte count is added to *no_read) and the
   precomposed code y replaces x. */
427 for (i = 0; latin1_comb[i].x1; i++)
428 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
430 *no_read += cd->comb_no_read[0];
432 x = latin1_comb[i].y;
/* Reads one MARC-8 character.  Leading escape sequences are consumed
   first: byte 27, then any of the intermediate characters ( , $ ! and
   finally one byte that becomes cd->marc8_esc_mode; their length is
   added to *no_read.  Input that ends inside an escape sequence is
   reported as YAZ_ICONV_EINVAL.  The character itself is decoded by
   the yaz_marc8_N_conv converter selected by the escape mode, an
   unknown mode is reported as YAZ_ICONV_EILSEQ, and the bytes used by
   the converter are added to *no_read. */
439 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
440 size_t inbytesleft, size_t *no_read,
444 while(inbytesleft >= 1 && inp[0] == 27)
/* remember the count on entry so the escape length can be derived */
446 size_t inbytesleft0 = inbytesleft;
449 while(inbytesleft > 0 && strchr("(,$!", *inp))
/* inbytesleft is a size_t, so this test is effectively == 0 */
454 if (inbytesleft <= 0)
457 cd->my_errno = YAZ_ICONV_EINVAL;
460 cd->marc8_esc_mode = *inp++;
462 (*no_read) += inbytesleft0 - inbytesleft;
464 if (inbytesleft <= 0)
469 size_t no_read_sub = 0;
472 switch(cd->marc8_esc_mode)
474 case 'B': /* Basic ASCII */
475 case 'E': /* ANSEL */
476 case 's': /* ASCII */
477 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
479 case 'g': /* Greek */
480 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
482 case 'b': /* Subscripts */
483 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
485 case 'p': /* Superscripts */
486 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
488 case '2': /* Basic Hebrew */
489 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
491 case 'N': /* Basic Cyrillic */
492 case 'Q': /* Extended Cyrillic */
493 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
495 case '3': /* Basic Arabic */
496 case '4': /* Extended Arabic */
497 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
499 case 'S': /* Greek */
500 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
502 case '1': /* Chinese, Japanese, Korean (EACC) */
503 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
507 cd->my_errno = YAZ_ICONV_EILSEQ;
510 *no_read += no_read_sub;
/* Write handler for UTF-8: forwards to yaz_write_UTF8_char() and lets
   it store any error code directly in cd->my_errno. */
515 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
516 char **outbuf, size_t *outbytesleft,
519 return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
/* Encodes character x as UTF-8 at *outbuf.  The sequence length is
   chosen from the magnitude of x: 1 byte up to 0x7f, 2 up to 0x7ff,
   3 up to 0xffff, 4 up to 0x1fffff, 5 up to 0x3ffffff and 6 beyond
   that.  Each branch also requires that many bytes in *outbytesleft;
   when none fits, *error is set to YAZ_ICONV_E2BIG.  On the way out
   *outbuf is set to the write position reached. */
522 size_t yaz_write_UTF8_char(unsigned long x,
523 char **outbuf, size_t *outbytesleft,
526 unsigned char *outp = (unsigned char *) *outbuf;
528 if (x <= 0x7f && *outbytesleft >= 1)
530 *outp++ = (unsigned char) x;
533 else if (x <= 0x7ff && *outbytesleft >= 2)
535 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
536 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
537 (*outbytesleft) -= 2;
539 else if (x <= 0xffff && *outbytesleft >= 3)
541 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
542 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
543 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
544 (*outbytesleft) -= 3;
546 else if (x <= 0x1fffff && *outbytesleft >= 4)
548 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
549 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
550 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
551 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
552 (*outbytesleft) -= 4;
554 else if (x <= 0x3ffffff && *outbytesleft >= 5)
556 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
557 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
558 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
559 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
560 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
561 (*outbytesleft) -= 5;
/* no range test here: with 6 bytes free every earlier room test also
   passes, so only values above 0x3ffffff get this far */
563 else if (*outbytesleft >= 6)
565 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
566 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
567 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
568 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
569 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
570 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
571 (*outbytesleft) -= 6;
575 *error = YAZ_ICONV_E2BIG; /* not room for output */
578 *outbuf = (char *) outp;
/* Write handler for ISO-8859-1 with composition of base letter plus
   combining mark.  When cd->compose_char is pending, the pair
   (compose_char, x) is looked up in latin1_comb; a match replaces x by
   the precomposed code, no match writes compose_char on its own.  Lack
   of output room is reported as YAZ_ICONV_E2BIG with compose_char
   retained.  A printable ASCII x (33 to 126) that is not the last
   character is held back as the new compose_char instead of being
   written.  Any other x outside 1 to 255 is reported as
   YAZ_ICONV_EILSEQ; otherwise x is written as a single byte. */
583 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
584 char **outbuf, size_t *outbytesleft,
587 /* list of two char unicode sequence that, when combined, are
588 equivalent to single unicode chars that can be represented in
590 Regular iconv on Linux at least does not seem to convert these,
591 but since MARC-8 to UTF-8 generates these composed sequence
592 we get a better chance of a successful MARC-8 -> ISO-8859-1
594 unsigned char *outp = (unsigned char *) *outbuf;
596 if (cd->compose_char)
599 for (i = 0; latin1_comb[i].x1; i++)
600 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
602 x = latin1_comb[i].y;
605 if (*outbytesleft < 1)
606 { /* no room. Retain compose_char and bail out */
607 cd->my_errno = YAZ_ICONV_E2BIG;
/* the scan ran into the terminating entry, so no pair matched */
610 if (!latin1_comb[i].x1)
611 { /* not found. Just write compose_char */
612 *outp++ = (unsigned char) cd->compose_char;
614 *outbuf = (char *) outp;
616 /* compose_char used so reset it. x now holds current char */
617 cd->compose_char = 0;
/* hold back a printable ASCII character: the next character may be a
   combining mark that merges with it */
620 if (!last && x > 32 && x < 127 && cd->compose_char == 0)
622 cd->compose_char = x;
625 else if (x > 255 || x < 1)
627 cd->my_errno = YAZ_ICONV_EILSEQ;
630 else if (*outbytesleft < 1)
632 cd->my_errno = YAZ_ICONV_E2BIG;
635 *outp++ = (unsigned char) x;
637 *outbuf = (char *) outp;
/* Write handler for big-endian UCS-4: with at least 4 bytes of room, x
   is stored most significant byte first and *outbytesleft drops by 4.
   Without room, YAZ_ICONV_E2BIG is recorded in cd->my_errno. */
642 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
643 char **outbuf, size_t *outbytesleft,
646 unsigned char *outp = (unsigned char *) *outbuf;
647 if (*outbytesleft >= 4)
649 *outp++ = (unsigned char) (x>>24);
650 *outp++ = (unsigned char) (x>>16);
651 *outp++ = (unsigned char) (x>>8);
652 *outp++ = (unsigned char) x;
653 (*outbytesleft) -= 4;
657 cd->my_errno = YAZ_ICONV_E2BIG;
660 *outbuf = (char *) outp;
/* Write handler for little-endian UCS-4: with at least 4 bytes of
   room, x is stored least significant byte first and *outbytesleft
   drops by 4.  Without room, YAZ_ICONV_E2BIG is recorded in
   cd->my_errno. */
664 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
665 char **outbuf, size_t *outbytesleft,
668 unsigned char *outp = (unsigned char *) *outbuf;
669 if (*outbytesleft >= 4)
671 *outp++ = (unsigned char) x;
672 *outp++ = (unsigned char) (x>>8);
673 *outp++ = (unsigned char) (x>>16);
674 *outp++ = (unsigned char) (x>>24);
675 (*outbytesleft) -= 4;
679 cd->my_errno = YAZ_ICONV_E2BIG;
682 *outbuf = (char *) outp;
/* Maps character x to a MARC-8 code.  x is first encoded as UTF-8 into
   a local buffer with yaz_write_UTF8(); that byte string is then
   passed to the reverse converters yaz_marc8r_1_conv through
   yaz_marc8r_9_conv.  *page_chr receives the escape sequence of the
   page that goes with the converter, and *comb is filled in by the
   converter.  YAZ_ICONV_EILSEQ is recorded when the UTF-8 step fails,
   and again at the end of the converter chain. */
686 static unsigned long lookup_marc8(yaz_iconv_t cd,
687 unsigned long x, int *comb,
688 const char **page_chr)
691 char *utf8_outbuf = utf8_buf;
/* one byte of the scratch buffer is kept out of reach of the encoder */
692 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
694 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
695 if (r == (size_t)(-1))
697 cd->my_errno = YAZ_ICONV_EILSEQ;
703 size_t inbytesleft, no_read_sub = 0;
707 inp = (unsigned char *) utf8_buf;
/* NOTE(review): strlen needs utf8_buf to be NUL terminated and yields
   0 for x == 0; the terminating store is not next to this call -
   confirm it happens before this point. */
708 inbytesleft = strlen(utf8_buf);
710 x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
713 *page_chr = "\033(B";
716 x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
722 x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
728 x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
734 x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
737 *page_chr = "\033(2";
740 x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
743 *page_chr = "\033(N";
746 x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
749 *page_chr = "\033(3";
752 x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
755 *page_chr = "\033(S";
758 x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
761 *page_chr = "\033(1";
764 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Emits the pending MARC-8 output: the base character held in
   cd->write_marc8_last and the pending combining characters in
   cd->write_marc8_comb_ch.  The base character is unpacked into up to
   three bytes (bits 16-23, 8-15 and 0-7).  When the base bytes, the
   combining characters and one extra byte do not fit in *outbytesleft,
   YAZ_ICONV_E2BIG is recorded and (size_t) -1 is returned.  The loop
   over the combining characters comes before the copy of the base
   bytes, and a second_half byte can follow.  Both pending fields are
   cleared at the end. */
769 static size_t flush_combos(yaz_iconv_t cd,
770 char **outbuf, size_t *outbytesleft)
772 unsigned long y = cd->write_marc8_last;
773 unsigned char byte, second_half = 0;
775 size_t i, out_no = 0;
780 byte = (unsigned char )((y>>16) & 0xff);
782 out_buf[out_no++] = byte;
783 byte = (unsigned char)((y>>8) & 0xff);
785 out_buf[out_no++] = byte;
786 byte = (unsigned char )(y & 0xff);
788 out_buf[out_no++] = byte;
/* the + 1 leaves room for a possible second_half byte */
790 if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
792 cd->my_errno = YAZ_ICONV_E2BIG;
793 return (size_t) (-1);
796 for (i = 0; i < cd->write_marc8_comb_no; i++)
798 /* all MARC-8 combined characters are simple bytes */
799 byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
802 else if (byte == 0xFA)
808 memcpy(*outbuf, out_buf, out_no);
810 (*outbytesleft) -= out_no;
813 *(*outbuf)++ = second_half;
817 cd->write_marc8_last = 0;
818 cd->write_marc8_comb_no = 0;
/* Writes one character x as MARC-8.  x is mapped with lookup_marc8(),
   which also yields the combining flag and the page escape sequence.
   A combining character is only queued in cd->write_marc8_comb_ch.
   Otherwise the pending output is flushed with flush_combos() and, if
   the page differs from the current one, the escape sequence for the
   new page is emitted (less than 8 bytes of room is reported as
   YAZ_ICONV_E2BIG with (size_t) -1); y then becomes the pending base
   character in cd->write_marc8_last.  A further flush_combos() call
   follows near the end of the function. */
822 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
823 char **outbuf, size_t *outbytesleft,
827 const char *page_chr = 0;
828 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
831 return (size_t) (-1);
/* NOTE(review): the queue is declared with 8 slots but only 6 are ever
   filled here; what happens to a 7th mark is decided outside these
   two lines - confirm it is not silently dropped. */
835 if (cd->write_marc8_comb_no < 6)
836 cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
840 size_t r = flush_combos(cd, outbuf, outbytesleft);
841 const char *old_page_chr = cd->write_marc8_page_chr;
/* strcmp is non-zero when the page changes */
844 if (strcmp(page_chr, old_page_chr))
847 const char *page_out = page_chr;
849 if (*outbytesleft < 8)
851 cd->my_errno = YAZ_ICONV_E2BIG;
853 return (size_t) (-1);
855 cd->write_marc8_page_chr = page_chr;
/* the two-byte escapes ESC p, ESC g and ESC b get special treatment
   when they are the page being left */
857 if (!strcmp(old_page_chr, "\033p")
858 || !strcmp(old_page_chr, "\033g")
859 || !strcmp(old_page_chr, "\033b"))
861 /* Technique 1 leave */
863 if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
865 /* Must leave script + enter new page */
866 plen = strlen(page_out);
867 memcpy(*outbuf, page_out, plen);
869 (*outbytesleft) -= plen;
873 plen = strlen(page_out);
874 memcpy(*outbuf, page_out, plen);
876 (*outbytesleft) -= plen;
878 cd->write_marc8_last = y;
882 size_t r = flush_combos(cd, outbuf, outbytesleft);
886 cd->write_marc8_comb_no--;
888 cd->write_marc8_last = 0;
/* Write handler for MARC-8.  When x is one of the precomposed Latin-1
   codes in latin1_comb (the y column), it is written as the decomposed
   pair x1 then x2 through yaz_write_marc8_2().  If the second write
   fails with YAZ_ICONV_E2BIG the saved output position, byte count and
   pending base character are restored.  Any other x is passed straight
   to yaz_write_marc8_2(). */
895 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
896 char **outbuf, size_t *outbytesleft,
900 for (i = 0; latin1_comb[i].x1; i++)
902 if (x == latin1_comb[i].y)
905 /* save the output pointers .. */
906 char *outbuf0 = *outbuf;
907 size_t outbytesleft0 = *outbytesleft;
/* NOTE(review): write_marc8_last is an unsigned long; keeping it in an
   int narrows it on platforms where long is wider than int. */
908 int last_ch = cd->write_marc8_last;
910 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
911 outbuf, outbytesleft, 0);
914 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
915 outbuf, outbytesleft, last);
916 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
918 /* not enough room. reset output to original values */
920 *outbytesleft = outbytesleft0;
921 cd->write_marc8_last = last_ch;
926 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
/* Write handler for wchar_t output: with at least sizeof(wchar_t)
   bytes of room, one wchar_t is copied to the output with memcpy and
   *outbytesleft drops by sizeof(wch).  Without room, YAZ_ICONV_E2BIG is
   recorded in cd->my_errno. */
931 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
932 char **outbuf, size_t *outbytesleft,
935 unsigned char *outp = (unsigned char *) *outbuf;
937 if (*outbytesleft >= sizeof(wchar_t))
940 memcpy(outp, &wch, sizeof(wch));
942 (*outbytesleft) -= sizeof(wch);
946 cd->my_errno = YAZ_ICONV_E2BIG;
949 *outbuf = (char *) outp;
/* Returns non-zero when both the read handler and the write handler of
   cd are set, i.e. the conversion is done by the handlers in this file. */
954 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
956 return cd->read_handle && cd->write_handle;
/* Creates a conversion handle from fromcode to tocode.  The handle is
   allocated with xmalloc.  fromcode selects a read handler (UTF8,
   ISO88591, UCS4, UCS4LE, MARC8, MARC8s, WCHAR_T) and tocode selects a
   write handler from the same set of names, both matched with
   yaz_matchstr().  UTF8 and the MARC8 variants also install an init
   handler.  When either handler is still unset, iconv_open() is tried
   with the same pair of names. */
959 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
961 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
963 cd->write_handle = 0;
966 cd->my_errno = YAZ_ICONV_UNKNOWN;
968 /* a useful hack: if fromcode has a leading @,
969 the library will not use YAZ's own conversions .. */
970 if (fromcode[0] == '@')
974 if (!yaz_matchstr(fromcode, "UTF8"))
976 cd->read_handle = yaz_read_UTF8;
977 cd->init_handle = yaz_init_UTF8;
979 else if (!yaz_matchstr(fromcode, "ISO88591"))
980 cd->read_handle = yaz_read_ISO8859_1;
981 else if (!yaz_matchstr(fromcode, "UCS4"))
982 cd->read_handle = yaz_read_UCS4;
983 else if (!yaz_matchstr(fromcode, "UCS4LE"))
984 cd->read_handle = yaz_read_UCS4LE;
985 else if (!yaz_matchstr(fromcode, "MARC8"))
987 cd->read_handle = yaz_read_marc8;
988 cd->init_handle = yaz_init_marc8;
990 else if (!yaz_matchstr(fromcode, "MARC8s"))
992 cd->read_handle = yaz_read_marc8s;
993 cd->init_handle = yaz_init_marc8;
996 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
997 cd->read_handle = yaz_read_wchar_t;
1000 if (!yaz_matchstr(tocode, "UTF8"))
1001 cd->write_handle = yaz_write_UTF8;
1002 else if (!yaz_matchstr(tocode, "ISO88591"))
1003 cd->write_handle = yaz_write_ISO8859_1;
1004 else if (!yaz_matchstr (tocode, "UCS4"))
1005 cd->write_handle = yaz_write_UCS4;
1006 else if (!yaz_matchstr(tocode, "UCS4LE"))
1007 cd->write_handle = yaz_write_UCS4LE;
1008 else if (!yaz_matchstr(tocode, "MARC8"))
/* NOTE(review): this assignment of init_handle replaces any init
   handler chosen for fromcode above (for example yaz_init_UTF8 when
   converting UTF8 to MARC8) - confirm that is intended. */
1010 cd->write_handle = yaz_write_marc8;
1011 cd->init_handle = yaz_init_marc8;
1013 else if (!yaz_matchstr(tocode, "MARC8s"))
1015 cd->write_handle = yaz_write_marc8;
1016 cd->init_handle = yaz_init_marc8;
1019 else if (!yaz_matchstr(tocode, "WCHAR_T"))
1020 cd->write_handle = yaz_write_wchar_t;
/* fall back to the system iconv when a built-in handler is missing */
1025 if (!cd->read_handle || !cd->write_handle)
1027 cd->iconv_cd = iconv_open (tocode, fromcode);
1028 if (cd->iconv_cd == (iconv_t) (-1))
1035 if (!cd->read_handle || !cd->write_handle)
/* Converts characters from *inbuf to *outbuf, with the same argument
   layout as iconv().  On the path that uses the system iconv(), a
   failure is translated from yaz_errno() into YAZ_ICONV_E2BIG,
   YAZ_ICONV_EINVAL, YAZ_ICONV_EILSEQ or YAZ_ICONV_UNKNOWN.  A null
   inbuf or *inbuf sets cd->my_errno to YAZ_ICONV_UNKNOWN.  On the
   built-in path the init handler, if any, runs first and the bytes it
   reports are taken off the input.  Then characters are read with
   cd->read_handle and written with cd->write_handle, and the input is
   advanced by no_read after each one.  When the input is used up the
   result r is the number of input bytes consumed. */
1045 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1046 char **outbuf, size_t *outbytesleft)
1055 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1056 if (r == (size_t)(-1))
1058 switch (yaz_errno())
1061 cd->my_errno = YAZ_ICONV_E2BIG;
1064 cd->my_errno = YAZ_ICONV_EINVAL;
1067 cd->my_errno = YAZ_ICONV_EILSEQ;
1070 cd->my_errno = YAZ_ICONV_UNKNOWN;
1076 if (inbuf == 0 || *inbuf == 0)
1079 cd->my_errno = YAZ_ICONV_UNKNOWN;
1086 if (cd->init_handle)
1089 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1090 *inbytesleft, &no_read);
1093 if (cd->my_errno == YAZ_ICONV_EINVAL)
1098 *inbytesleft -= no_read;
1110 if (*inbytesleft == 0)
1112 r = *inbuf - inbuf0;
1117 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
/* byte count remembered from a character whose write failed earlier */
1128 no_read = cd->no_read_x;
/* the final argument is 1 when this character uses up the input */
1132 r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1133 (*inbytesleft - no_read) == 0 ? 1 : 0);
1136 /* unable to write it. save it because read_handle cannot
1138 if (cd->my_errno == YAZ_ICONV_E2BIG)
1141 cd->no_read_x = no_read;
1147 *inbytesleft -= no_read;
1148 (*inbuf) += no_read;
// Returns cd->my_errno, the YAZ_ICONV_* code stored by the conversion
// routines in this file.
1153 int yaz_iconv_error (yaz_iconv_t cd)
1155 return cd->my_errno;
// Closes a conversion handle; the step shown here hands the system
// iconv descriptor cd->iconv_cd to iconv_close().
1158 int yaz_iconv_close (yaz_iconv_t cd)
1162 iconv_close (cd->iconv_cd);
1171 * indent-tabs-mode: nil
1173 * vim: shiftwidth=4 tabstop=8 expandtab