2 * Copyright (C) 1995-2007, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.33 2007-01-18 14:45:05 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
17 * http://www.loc.gov/marc/specifications/speccharmarc8.html
37 #include <yaz/yaz-util.h>
/* Decoders for the MARC-8 code pages 1..9.  Only the declarations appear
   here (a comment in yaz_read_marc8 says the handlers are generated by
   charconv.tcl).  yaz_read_marc8_comb() picks one of them from the
   current escape mode and calls it with the input pointer and the number
   of bytes left; the number of bytes consumed comes back through no_read.
   NOTE(review): `combining` is presumably set non-zero when the decoded
   character is a combining mark -- confirm against the generated code. */
39 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
40 size_t *no_read, int *combining);
41 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
42 size_t *no_read, int *combining);
43 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
44 size_t *no_read, int *combining);
45 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
46 size_t *no_read, int *combining);
47 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
48 size_t *no_read, int *combining);
49 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
50 size_t *no_read, int *combining);
51 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
52 size_t *no_read, int *combining);
53 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
54 size_t *no_read, int *combining);
55 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
56 size_t *no_read, int *combining);
/* Reverse (to MARC-8) lookups for code pages 1..9.  lookup_marc8() passes
   them the UTF-8 encoding of one character and tries them in the order
   1, 2, ... 9.  The arguments have the same shape as for the
   yaz_marc8_N_conv decoders.
   NOTE(review): the meaning of a zero return ("no mapping on this page")
   is inferred from how lookup_marc8 falls through -- confirm. */
59 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
60 size_t *no_read, int *combining);
61 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
62 size_t *no_read, int *combining);
63 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
64 size_t *no_read, int *combining);
65 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
66 size_t *no_read, int *combining);
67 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
68 size_t *no_read, int *combining);
69 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
70 size_t *no_read, int *combining);
71 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
72 size_t *no_read, int *combining);
73 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
74 size_t *no_read, int *combining);
75 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
76 size_t *no_read, int *combining);
/* Conversion descriptor: the handlers chosen for the source and target
   encodings plus the state that is carried between yaz_iconv() calls. */
78 struct yaz_iconv_struct {
/* Optional hook that yaz_iconv() runs on the input before regular
   reading; yaz_iconv_open() installs yaz_init_UTF8 for UTF-8 input. */
81 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
82 size_t inbytesleft, size_t *no_read);
/* Decodes one character from inbuf and reports the number of bytes
   consumed through no_read. */
83 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
84 size_t inbytesleft, size_t *no_read);
/* Encodes the character x into *outbuf, updating *outbytesleft. */
85 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
86 char **outbuf, size_t *outbytesleft,
/* Optional hook that emits pending output; yaz_iconv_open() sets it for
   the MARC-8 targets. */
88 size_t (*flush_handle)(yaz_iconv_t cd,
89 char **outbuf, size_t *outbytesleft);
/* Characters decoded ahead by yaz_read_marc8() and, for each of them,
   the number of input bytes it consumed. */
94 unsigned long comb_x[8];
95 size_t comb_no_read[8];
/* NOTE(review): presumably the character that could not be written when
   the output buffer was full -- its use is not visible here; confirm. */
97 unsigned long unget_x;
/* Character held back by yaz_write_ISO8859_1() because the following
   character may combine with it; 0 means nothing is held. */
101 unsigned long compose_char;
/* MARC-8 writer state: queued combining characters and their count, the
   "second half" byte to append after the next flush (0 = none), the
   pending base character, and the escape sequence of the code page that
   is currently selected in the output. */
103 unsigned long write_marc8_comb_ch[8];
104 size_t write_marc8_comb_no;
105 unsigned write_marc8_second_half_char;
106 unsigned long write_marc8_last;
107 const char *write_marc8_page_chr;
/* Composition table for ISO-8859-1: each entry pairs a base character
   (x1) and a combining mark (x2) with the single Latin-1 code (y) that
   represents the composed character.  The code that uses the table scans
   it until it reaches an entry whose x1 is 0. */
111 unsigned long x1, x2;
114 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
115 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
116 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
117 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
118 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
119 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
120 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
121 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
122 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
123 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
124 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
125 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
126 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
127 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
128 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
129 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
130 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
131 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
132 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
133 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
134 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
135 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
136 /* omitted: 0xd7 MULTIPLICATION SIGN */
137 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
138 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
139 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
140 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
141 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
142 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
143 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
144 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
145 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
146 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
147 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
148 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
149 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
150 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
151 /* omitted: 0xe6 LATIN SMALL LETTER AE */
152 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
153 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
154 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
155 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
156 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
157 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
158 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
159 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
160 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
161 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
162 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
163 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
164 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
165 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
166 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
167 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
168 /* omitted: 0xf7 DIVISION SIGN */
169 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
170 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
171 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
172 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
173 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
174 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
175 /* omitted: 0xfe LATIN SMALL LETTER THORN */
176 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
/* read_handle for ISO-8859-1 input: the character value is simply the
   first input byte. */
181 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
182 size_t inbytesleft, size_t *no_read)
184 unsigned long x = inp[0];
/* init_handle for UTF-8 input: recognises a byte order mark at the start
   of the input.  The UTF-8 BOM is the byte sequence EF BB BF, so the
   second byte must equal 0xbb AND the third must equal 0xbf.
   cd->my_errno is set to YAZ_ICONV_EINVAL on the error path.
   NOTE(review): the test of inp[0], the length guard in front of the
   error path and the assignments to *no_read are not visible here --
   confirm that a recognised BOM results in *no_read = 3. */
190 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
191 size_t inbytesleft, size_t *no_read)
200 cd->my_errno = YAZ_ICONV_EINVAL;
/* Both trailing BOM bytes have to match. */
203 if (inp[1] == 0xbb && inp[2] == 0xbf)
/* Decodes one UTF-8 sequence starting at inp and returns its code point.
   inp, inbytesleft: the input and the number of bytes available.
   no_read: receives the number of bytes consumed (NOTE(review): the
            assignments are not visible here -- confirm).
   error:   receives YAZ_ICONV_EILSEQ for an invalid sequence and
            YAZ_ICONV_EINVAL when no branch below applies.
   Sequences of 2 to 6 bytes are handled by the visible branches. */
210 unsigned long yaz_read_UTF8_char(unsigned char *inp,
211 size_t inbytesleft, size_t *no_read,
/* A lead byte up to 0xbf that was not accepted by the preceding branch,
   or 0xfe/0xff, cannot start a sequence. */
221 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
224 *error = YAZ_ICONV_EILSEQ;
/* 2-byte form: 5 payload bits from the lead byte, 6 from the next. */
226 else if (inp[0] <= 0xdf && inbytesleft >= 2)
228 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
234 *error = YAZ_ICONV_EILSEQ;
/* 3-byte form: 4 payload bits from the lead byte. */
237 else if (inp[0] <= 0xef && inbytesleft >= 3)
239 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
246 *error = YAZ_ICONV_EILSEQ;
/* 4-byte form: 3 payload bits from the lead byte. */
249 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
251 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
252 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
258 *error = YAZ_ICONV_EILSEQ;
/* 5-byte form: 2 payload bits from the lead byte. */
261 else if (inp[0] <= 0xfb && inbytesleft >= 5)
263 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
264 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
271 *error = YAZ_ICONV_EILSEQ;
/* 6-byte form: 1 payload bit from the lead byte. */
274 else if (inp[0] <= 0xfd && inbytesleft >= 6)
276 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
277 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
278 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
284 *error = YAZ_ICONV_EILSEQ;
/* No branch applied.  Given the inbytesleft tests above this is
   presumably the "sequence is truncated" case -- confirm. */
290 *error = YAZ_ICONV_EINVAL;
/* read_handle for UTF-8 input: delegates to yaz_read_UTF8_char() and has
   it report errors in cd->my_errno. */
295 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
296 size_t inbytesleft, size_t *no_read)
298 return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
/* read_handle for big-endian UCS-4 input: assembles the character from
   four bytes, most significant first.  cd->my_errno is set to
   YAZ_ICONV_EINVAL when the input is incomplete.
   NOTE(review): the length test and the assignment to *no_read are not
   visible here -- confirm. */
301 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
302 size_t inbytesleft, size_t *no_read)
308 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Widen before shifting: inp[0] promotes to int, and shifting a value
   >= 0x80 left by 24 would overflow a 32-bit int. */
313 x = ((unsigned long) inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
/* read_handle for little-endian UCS-4 input: assembles the character
   from four bytes, least significant first.  cd->my_errno is set to
   YAZ_ICONV_EINVAL when the input is incomplete.
   NOTE(review): the length test and the assignment to *no_read are not
   visible here -- confirm. */
319 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
320 size_t inbytesleft, size_t *no_read)
326 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Widen before shifting: inp[3] promotes to int, and shifting a value
   >= 0x80 left by 24 would overflow a 32-bit int. */
331 x = ((unsigned long) inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
/* read_handle for wchar_t input: copies one wchar_t out of the input
   buffer.  Fewer than sizeof(wchar_t) bytes of input is reported as
   YAZ_ICONV_EINVAL; otherwise sizeof(wchar_t) bytes are consumed. */
338 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
339 size_t inbytesleft, size_t *no_read)
343 if (inbytesleft < sizeof(wchar_t))
345 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* memcpy rather than a pointer cast, so inp need not be aligned. */
351 memcpy (&wch, inp, sizeof(wch));
353 *no_read = sizeof(wch);
/* Forward declaration: yaz_read_marc8() calls this function, which is
   defined further down. */
360 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
361 size_t inbytesleft, size_t *no_read,
/* read_handle for MARC-8 input.  Characters are decoded with
   yaz_read_marc8_comb(), up to 8 per call, and stored in cd->comb_x
   together with their byte counts in cd->comb_no_read; entries that are
   already buffered are returned first.
   NOTE(review): the loop exit condition and the order in which buffered
   characters are returned are not visible here.  Presumably this turns
   MARC-8's "combining marks before the base character" order into
   Unicode order -- confirm. */
364 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
365 size_t inbytesleft, size_t *no_read)
/* Hand out a character that an earlier call buffered. */
368 if (cd->comb_offset < cd->comb_size)
370 *no_read = cd->comb_no_read[cd->comb_offset];
371 x = cd->comb_x[cd->comb_offset];
373 /* special case for double-diacritic combining characters,
374 INVERTED BREVE and DOUBLE TILDE.
375 We'll increment the no_read counter by 1, since we want to skip over
376 the processing of the closing ligature character
378 /* this code is no longer necessary.. our handlers code in
379 yaz_marc8_?_conv (generated by charconv.tcl) now returns
380 0 and no_read=1 when a sequence does not match the input.
381 The SECOND HALFs in codetables.xml produces a non-existant
382 entry in the conversion trie.. Hence when met, the input byte is
383 skipped as it should (in yaz_iconv)
386 if (x == 0x0361 || x == 0x0360)
394 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
397 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
400 cd->comb_x[cd->comb_size] = x;
401 cd->comb_no_read[cd->comb_size] = *no_read;
403 inbytesleft = inbytesleft - *no_read;
/* read_handle for "MARC8s" input: like yaz_read_marc8(), but when exactly
   one further character is buffered (cd->comb_size == 1) and the pair
   (x, cd->comb_x[0]) appears in latin1_comb as (x1, x2), the composed
   Latin-1 code y is returned instead and the buffered character's byte
   count is added to *no_read. */
408 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
409 size_t inbytesleft, size_t *no_read)
411 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
412 if (x && cd->comb_size == 1)
414 /* For MARC8s we try to get a Latin-1 page code out of it */
416 for (i = 0; latin1_comb[i].x1; i++)
417 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
419 *no_read += cd->comb_no_read[0];
421 x = latin1_comb[i].y;
/* Decodes one MARC-8 character.  Leading escape sequences are consumed
   first and update cd->marc8_esc_mode; the character is then decoded with
   the table that belongs to the current mode.  *no_read accumulates the
   bytes of the escape sequences plus those of the character.
   Errors: YAZ_ICONV_EINVAL when the input ends inside an escape
   sequence; YAZ_ICONV_EILSEQ on the path after the last table case
   (presumably an unknown escape mode -- the case label is not visible). */
428 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
429 size_t inbytesleft, size_t *no_read,
/* 27 is ESC, which introduces a code page switch. */
433 while(inbytesleft >= 1 && inp[0] == 27)
435 size_t inbytesleft0 = inbytesleft;
/* Skip the intermediate characters of the escape sequence.
   NOTE(review): strchr() also matches the string terminator, so a NUL
   input byte would be treated as an intermediate here. */
438 while(inbytesleft > 0 && strchr("(,$!)-", *inp))
443 if (inbytesleft <= 0)
446 cd->my_errno = YAZ_ICONV_EINVAL;
/* The byte that follows the intermediates selects the code page. */
449 cd->marc8_esc_mode = *inp++;
/* Count the escape sequence as consumed input. */
451 (*no_read) += inbytesleft0 - inbytesleft;
453 if (inbytesleft <= 0)
458 size_t no_read_sub = 0;
461 switch(cd->marc8_esc_mode)
463 case 'B': /* Basic ASCII */
464 case 'E': /* ANSEL */
465 case 's': /* ASCII */
466 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
468 case 'g': /* Greek */
469 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
471 case 'b': /* Subscripts */
472 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
474 case 'p': /* Superscripts */
475 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
477 case '2': /* Basic Hebrew */
478 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
480 case 'N': /* Basic Cyrillic */
481 case 'Q': /* Extended Cyrillic */
482 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
484 case '3': /* Basic Arabic */
485 case '4': /* Extended Arabic */
486 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
488 case 'S': /* Greek */
489 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
491 case '1': /* Chinese, Japanese, Korean (EACC) */
492 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
496 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Add the bytes consumed by the table decoder. */
499 *no_read += no_read_sub;
/* write_handle for UTF-8 output: delegates to yaz_write_UTF8_char() and
   has it report errors in cd->my_errno. */
504 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
505 char **outbuf, size_t *outbytesleft,
508 return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
/* Encodes the character x as UTF-8 at *outbuf.
   x:            the character to encode.
   outbuf:       in/out pointer to the write position; advanced past the
                 bytes written.
   outbytesleft: in/out count of free bytes; reduced by the number of
                 bytes written.
   error:        receives YAZ_ICONV_E2BIG when the sequence does not fit.
   The shortest form of 1 to 6 bytes whose range covers x is used,
   provided that many bytes are free. */
511 size_t yaz_write_UTF8_char(unsigned long x,
512 char **outbuf, size_t *outbytesleft,
515 unsigned char *outp = (unsigned char *) *outbuf;
/* 1 byte: 0 .. 0x7f */
517 if (x <= 0x7f && *outbytesleft >= 1)
519 *outp++ = (unsigned char) x;
/* 2 bytes: up to 0x7ff */
522 else if (x <= 0x7ff && *outbytesleft >= 2)
524 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
525 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
526 (*outbytesleft) -= 2;
/* 3 bytes: up to 0xffff */
528 else if (x <= 0xffff && *outbytesleft >= 3)
530 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
531 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
532 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
533 (*outbytesleft) -= 3;
/* 4 bytes: up to 0x1fffff */
535 else if (x <= 0x1fffff && *outbytesleft >= 4)
537 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
538 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
539 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
540 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
541 (*outbytesleft) -= 4;
/* 5 bytes: up to 0x3ffffff */
543 else if (x <= 0x3ffffff && *outbytesleft >= 5)
545 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
546 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
547 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
548 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
549 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
550 (*outbytesleft) -= 5;
/* 6 bytes: taken whenever six bytes are free and no earlier branch
   applied, so this also catches a smaller x for which the shorter form
   did not fit but six bytes do -- which cannot happen, as six free bytes
   imply fewer are free too. */
552 else if (*outbytesleft >= 6)
554 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
555 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
556 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
557 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
558 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
559 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
560 (*outbytesleft) -= 6;
564 *error = YAZ_ICONV_E2BIG; /* not room for output */
/* Publish the advanced write position. */
567 *outbuf = (char *) outp;
/* write_handle for ISO-8859-1 output.  A character in the range 33..126
   that is not the last one is held back in cd->compose_char; when the
   next character arrives the pair is looked up in latin1_comb and, if
   found, written as the single composed Latin-1 code.  If the pair is not
   in the table the held character is written on its own.
   Errors: YAZ_ICONV_E2BIG when there is no room in the output,
   YAZ_ICONV_EILSEQ for a character outside 1..255. */
572 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
573 char **outbuf, size_t *outbytesleft,
576 /* list of two char unicode sequence that, when combined, are
577 equivalent to single unicode chars that can be represented in
579 Regular iconv on Linux at least does not seem to convert these,
580 but since MARC-8 to UTF-8 generates these composed sequence
581 we get a better chance of a successful MARC-8 -> ISO-8859-1
583 unsigned char *outp = (unsigned char *) *outbuf;
585 if (cd->compose_char)
588 for (i = 0; latin1_comb[i].x1; i++)
589 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
591 x = latin1_comb[i].y;
594 if (*outbytesleft < 1)
595 { /* no room. Retain compose_char and bail out */
596 cd->my_errno = YAZ_ICONV_E2BIG;
/* The scan reached the terminating entry (x1 == 0). */
599 if (!latin1_comb[i].x1)
600 { /* not found. Just write compose_char */
601 *outp++ = (unsigned char) cd->compose_char;
603 *outbuf = (char *) outp;
605 /* compose_char used so reset it. x now holds current char */
606 cd->compose_char = 0;
/* Hold back a character in 33..126 that is not the last one: the next
   character may combine with it. */
609 if (!last && x > 32 && x < 127 && cd->compose_char == 0)
611 cd->compose_char = x;
/* Only 1..255 can be represented in ISO-8859-1. */
614 else if (x > 255 || x < 1)
616 cd->my_errno = YAZ_ICONV_EILSEQ;
619 else if (*outbytesleft < 1)
621 cd->my_errno = YAZ_ICONV_E2BIG;
624 *outp++ = (unsigned char) x;
626 *outbuf = (char *) outp;
/* write_handle for big-endian UCS-4 output: writes x as four bytes, most
   significant first.  Fewer than four free bytes is reported as
   YAZ_ICONV_E2BIG. */
631 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
632 char **outbuf, size_t *outbytesleft,
635 unsigned char *outp = (unsigned char *) *outbuf;
636 if (*outbytesleft >= 4)
638 *outp++ = (unsigned char) (x>>24);
639 *outp++ = (unsigned char) (x>>16);
640 *outp++ = (unsigned char) (x>>8);
641 *outp++ = (unsigned char) x;
642 (*outbytesleft) -= 4;
646 cd->my_errno = YAZ_ICONV_E2BIG;
/* Publish the advanced write position. */
649 *outbuf = (char *) outp;
/* write_handle for little-endian UCS-4 output: writes x as four bytes,
   least significant first.  Fewer than four free bytes is reported as
   YAZ_ICONV_E2BIG. */
653 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
654 char **outbuf, size_t *outbytesleft,
657 unsigned char *outp = (unsigned char *) *outbuf;
658 if (*outbytesleft >= 4)
660 *outp++ = (unsigned char) x;
661 *outp++ = (unsigned char) (x>>8);
662 *outp++ = (unsigned char) (x>>16);
663 *outp++ = (unsigned char) (x>>24);
664 (*outbytesleft) -= 4;
668 cd->my_errno = YAZ_ICONV_E2BIG;
/* Publish the advanced write position. */
671 *outbuf = (char *) outp;
/* Maps the character x to MARC-8.  x is first encoded as UTF-8 into a
   local buffer, and that encoding is offered to the reverse tables
   yaz_marc8r_1_conv .. yaz_marc8r_9_conv in turn.  *page_chr is set to
   the escape sequence of the page that provided the mapping and *comb is
   filled in by the table lookup.
   Errors: YAZ_ICONV_EILSEQ when the UTF-8 encoding fails and on the path
   after the last table.
   NOTE(review): the tests between the table calls and the page_chr
   values for tables 2, 3 and 4 are not visible here. */
675 static unsigned long lookup_marc8(yaz_iconv_t cd,
676 unsigned long x, int *comb,
677 const char **page_chr)
680 char *utf8_outbuf = utf8_buf;
/* One byte is kept in reserve, presumably for the terminator that the
   strlen() call below depends on -- confirm. */
681 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
683 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
684 if (r == (size_t)(-1))
686 cd->my_errno = YAZ_ICONV_EILSEQ;
692 size_t inbytesleft, no_read_sub = 0;
696 inp = (unsigned char *) utf8_buf;
697 inbytesleft = strlen(utf8_buf);
699 x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
702 *page_chr = "\033(B";
705 x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
711 x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
717 x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
723 x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
726 *page_chr = "\033(2";
729 x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
732 *page_chr = "\033(N";
735 x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
738 *page_chr = "\033(3";
741 x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
744 *page_chr = "\033(S";
747 x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
750 *page_chr = "\033$1";
753 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Writes the pending MARC-8 output: the queued combining characters, the
   bytes of the pending base character cd->write_marc8_last and, if set,
   the "second half" byte.  The pending state is cleared afterwards.
   Returns (size_t) -1 with YAZ_ICONV_E2BIG when the output buffer cannot
   hold everything.
   NOTE(review): the statements that store each combining byte, and the
   tests that decide which of the three bytes of y are emitted, are not
   visible here -- confirm. */
758 static size_t flush_combos(yaz_iconv_t cd,
759 char **outbuf, size_t *outbytesleft)
761 unsigned long y = cd->write_marc8_last;
764 size_t i, out_no = 0;
/* Split y into its three low-order bytes, most significant first. */
769 byte = (unsigned char )((y>>16) & 0xff);
771 out_buf[out_no++] = byte;
772 byte = (unsigned char)((y>>8) & 0xff);
774 out_buf[out_no++] = byte;
775 byte = (unsigned char )(y & 0xff);
777 out_buf[out_no++] = byte;
/* Room is needed for the base bytes, the combining characters and one
   more byte (the possible second-half character). */
779 if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
781 cd->my_errno = YAZ_ICONV_E2BIG;
782 return (size_t) (-1);
785 for (i = 0; i < cd->write_marc8_comb_no; i++)
787 /* all MARC-8 combined characters are simple bytes */
788 byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
792 memcpy(*outbuf, out_buf, out_no);
794 (*outbytesleft) -= out_no;
795 if (cd->write_marc8_second_half_char)
797 *(*outbuf)++ = cd->write_marc8_second_half_char;
/* Everything pending has been written. */
801 cd->write_marc8_last = 0;
802 cd->write_marc8_comb_no = 0;
803 cd->write_marc8_second_half_char = 0;
/* Writes one character x as MARC-8.  x is mapped with lookup_marc8().
   On one path its MARC-8 value is queued in cd->write_marc8_comb_ch; on
   another the pending output is flushed, a page switch is emitted when
   the page of x differs from the current one, and the value becomes the
   new pending base character cd->write_marc8_last.
   Returns (size_t) -1 on error; YAZ_ICONV_E2BIG is set when fewer than 8
   bytes are free at a page switch.
   NOTE(review): the conditions that select between these paths (the
   `comb` and `last` tests) are not visible here -- confirm. */
807 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
808 char **outbuf, size_t *outbytesleft,
812 const char *page_chr = 0;
813 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
816 return (size_t) (-1);
/* Byte to emit after the next flush for a double-width diacritic;
   flush_combos() appends it. */
821 cd->write_marc8_second_half_char = 0xEC;
822 else if (x == 0x0360)
823 cd->write_marc8_second_half_char = 0xFB;
/* At most six values are queued although the array holds eight. */
825 if (cd->write_marc8_comb_no < 6)
826 cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
830 size_t r = flush_combos(cd, outbuf, outbytesleft);
831 const char *old_page_chr = cd->write_marc8_page_chr;
/* The page of this character differs from the one currently selected. */
834 if (strcmp(page_chr, old_page_chr))
837 const char *page_out = page_chr;
839 if (*outbytesleft < 8)
841 cd->my_errno = YAZ_ICONV_E2BIG;
843 return (size_t) (-1);
845 cd->write_marc8_page_chr = page_chr;
847 if (!strcmp(old_page_chr, "\033p")
848 || !strcmp(old_page_chr, "\033g")
849 || !strcmp(old_page_chr, "\033b"))
851 /* Technique 1 leave */
853 if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
855 /* Must leave script + enter new page */
856 plen = strlen(page_out);
857 memcpy(*outbuf, page_out, plen);
859 (*outbytesleft) -= plen;
863 plen = strlen(page_out);
864 memcpy(*outbuf, page_out, plen);
866 (*outbytesleft) -= plen;
/* y becomes the pending base character; a later flush writes it. */
868 cd->write_marc8_last = y;
872 size_t r = flush_combos(cd, outbuf, outbytesleft);
876 cd->write_marc8_comb_no--;
878 cd->write_marc8_last = 0;
/* flush_handle for MARC-8 output: when the currently selected page is not
   the ASCII page ("\033(B"), writes the escape sequence that returns to
   it.  Returns (size_t) -1 with YAZ_ICONV_E2BIG when fewer than three
   bytes are free.
   NOTE(review): the updates of *outbuf, *outbytesleft and
   cd->write_marc8_page_chr after the copy are not visible here. */
885 static size_t yaz_flush_marc8(yaz_iconv_t cd,
886 char **outbuf, size_t *outbytesleft)
888 if (strcmp(cd->write_marc8_page_chr, "\033(B"))
890 if (*outbytesleft < 3)
892 cd->my_errno = YAZ_ICONV_E2BIG;
893 return (size_t) (-1);
895 memcpy(*outbuf, "\033(B", 3);
/* write_handle for MARC-8 output.  A character that equals the composed
   code y of a latin1_comb entry is written as its two components x1 and
   x2 through yaz_write_marc8_2(); any other character is passed to
   yaz_write_marc8_2() unchanged.  If writing the components fails with
   YAZ_ICONV_E2BIG, the saved output position and cd->write_marc8_last are
   restored so the character can be retried. */
902 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
903 char **outbuf, size_t *outbytesleft,
907 for (i = 0; latin1_comb[i].x1; i++)
909 if (x == latin1_comb[i].y)
912 /* save the output pointers .. */
913 char *outbuf0 = *outbuf;
914 size_t outbytesleft0 = *outbytesleft;
/* NOTE(review): write_marc8_last is an unsigned long that is narrowed to
   int here and widened again on restore; flush_combos() only uses its low
   24 bits, so this looks harmless -- confirm. */
915 int last_ch = cd->write_marc8_last;
917 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
918 outbuf, outbytesleft, 0);
921 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
922 outbuf, outbytesleft, last);
923 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
925 /* not enough room. reset output to original values */
927 *outbytesleft = outbytesleft0;
928 cd->write_marc8_last = last_ch;
/* Not a composed Latin-1 character: write it as it is. */
933 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
/* write_handle for wchar_t output: copies one wchar_t into the output
   buffer.  Fewer than sizeof(wchar_t) free bytes is reported as
   YAZ_ICONV_E2BIG.
   NOTE(review): the assignment of x to wch is not visible here. */
938 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
939 char **outbuf, size_t *outbytesleft,
942 unsigned char *outp = (unsigned char *) *outbuf;
944 if (*outbytesleft >= sizeof(wchar_t))
/* memcpy rather than a pointer cast, so outp need not be aligned. */
947 memcpy(outp, &wch, sizeof(wch));
949 (*outbytesleft) -= sizeof(wch);
953 cd->my_errno = YAZ_ICONV_E2BIG;
956 *outbuf = (char *) outp;
/* Returns non-zero when both the read and the write handler of cd are
   set, i.e. the conversion is carried out by the handlers in this file. */
961 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
963 return cd->read_handle && cd->write_handle;
/* Creates a conversion descriptor for converting from `fromcode` to
   `tocode`.  Encoding names are compared with yaz_matchstr().  Handlers
   from this file are installed for UTF8, ISO88591, UCS4, UCS4LE, MARC8,
   MARC8s and WCHAR_T; when either the read or the write handler is
   missing, iconv_open() is tried instead.
   NOTE(review): the return statements, the failure path and the
   preprocessor conditions around the iconv and wchar_t parts are not
   visible here. */
966 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
968 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
970 cd->write_handle = 0;
973 cd->flush_handle = 0;
974 cd->my_errno = YAZ_ICONV_UNKNOWN;
976 /* a useful hack: if fromcode has a leading @,
977 the library will not use YAZ's own conversions .. */
978 if (fromcode[0] == '@')
/* Select the reader for the source encoding. */
982 if (!yaz_matchstr(fromcode, "UTF8"))
984 cd->read_handle = yaz_read_UTF8;
985 cd->init_handle = yaz_init_UTF8;
987 else if (!yaz_matchstr(fromcode, "ISO88591"))
988 cd->read_handle = yaz_read_ISO8859_1;
989 else if (!yaz_matchstr(fromcode, "UCS4"))
990 cd->read_handle = yaz_read_UCS4;
991 else if (!yaz_matchstr(fromcode, "UCS4LE"))
992 cd->read_handle = yaz_read_UCS4LE;
993 else if (!yaz_matchstr(fromcode, "MARC8"))
994 cd->read_handle = yaz_read_marc8;
995 else if (!yaz_matchstr(fromcode, "MARC8s"))
996 cd->read_handle = yaz_read_marc8s;
998 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
999 cd->read_handle = yaz_read_wchar_t;
/* Select the writer for the target encoding.  Both MARC-8 names use the
   same writer and flush handler. */
1002 if (!yaz_matchstr(tocode, "UTF8"))
1003 cd->write_handle = yaz_write_UTF8;
1004 else if (!yaz_matchstr(tocode, "ISO88591"))
1005 cd->write_handle = yaz_write_ISO8859_1;
1006 else if (!yaz_matchstr (tocode, "UCS4"))
1007 cd->write_handle = yaz_write_UCS4;
1008 else if (!yaz_matchstr(tocode, "UCS4LE"))
1009 cd->write_handle = yaz_write_UCS4LE;
1010 else if (!yaz_matchstr(tocode, "MARC8"))
1012 cd->write_handle = yaz_write_marc8;
1013 cd->flush_handle = yaz_flush_marc8;
1015 else if (!yaz_matchstr(tocode, "MARC8s"))
1017 cd->write_handle = yaz_write_marc8;
1018 cd->flush_handle = yaz_flush_marc8;
1021 else if (!yaz_matchstr(tocode, "WCHAR_T"))
1022 cd->write_handle = yaz_write_wchar_t;
/* No complete handler pair: fall back to the system iconv. */
1027 if (!cd->read_handle || !cd->write_handle)
1029 cd->iconv_cd = iconv_open (tocode, fromcode);
1030 if (cd->iconv_cd == (iconv_t) (-1))
1037 if (!cd->read_handle || !cd->write_handle)
/* Converts input to output with an interface modelled on iconv().
   cd:                   descriptor from yaz_iconv_open().
   inbuf, inbytesleft:   in/out input position and remaining byte count.
   outbuf, outbytesleft: in/out output position and remaining byte count.
   The visible code shows two modes: a call to the system iconv(), whose
   errno is translated into cd->my_errno, and a loop that reads one
   character with cd->read_handle and writes it with cd->write_handle.
   A character that cannot be written because the output is full has its
   byte count saved in cd->no_read_x so that it can be retried.
   NOTE(review): the conditions that choose between these paths, the
   loop structure and the return statements are not visible here. */
1047 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1048 char **outbuf, size_t *outbytesleft)
1057 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1058 if (r == (size_t)(-1))
/* Translate the system error into the YAZ_ICONV_* code. */
1060 switch (yaz_errno())
1063 cd->my_errno = YAZ_ICONV_E2BIG;
1066 cd->my_errno = YAZ_ICONV_EINVAL;
1069 cd->my_errno = YAZ_ICONV_EILSEQ;
1072 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* Reset the conversion state: no error, ASCII page for reading and
   writing, nothing buffered or pending. */
1084 cd->my_errno = YAZ_ICONV_UNKNOWN;
1085 cd->marc8_esc_mode = 'B';
1087 cd->comb_offset = cd->comb_size = 0;
1088 cd->compose_char = 0;
1090 cd->write_marc8_comb_no = 0;
1091 cd->write_marc8_second_half_char = 0;
1092 cd->write_marc8_last = 0;
1093 cd->write_marc8_page_chr = "\033(B";
/* Give the source encoding a chance to inspect the start of the input. */
1101 if (cd->init_handle && inbuf && *inbuf)
1104 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1105 *inbytesleft, &no_read);
1108 if (cd->my_errno == YAZ_ICONV_EINVAL)
1113 *inbytesleft -= no_read;
/* Reuse the byte count saved for a character that could not be written
   earlier. */
1127 no_read = cd->no_read_x;
1129 else if (inbuf && *inbuf)
1131 if (*inbytesleft == 0)
1133 r = *inbuf - inbuf0;
1136 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1147 if (cd->flush_handle && outbuf && *outbuf)
1148 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
/* The final argument tells the writer whether this character uses up
   the rest of the input. */
1155 r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1156 (*inbytesleft - no_read) == 0 ? 1 : 0);
1159 /* unable to write it. save it because read_handle cannot
1161 if (cd->my_errno == YAZ_ICONV_E2BIG)
1164 cd->no_read_x = no_read;
1170 *inbytesleft -= no_read;
1171 (*inbuf) += no_read;
/* Returns the YAZ_ICONV_* error code that was last recorded in cd. */
1176 int yaz_iconv_error (yaz_iconv_t cd)
1178 return cd->my_errno;
/* Releases the descriptor cd, closing the system iconv descriptor.
   NOTE(review): the guard around iconv_close(), the release of cd itself
   and the return value are not visible here -- confirm. */
1181 int yaz_iconv_close (yaz_iconv_t cd)
1185 iconv_close (cd->iconv_cd);
1194 * indent-tabs-mode: nil
1196 * vim: shiftwidth=4 tabstop=8 expandtab