2 * Copyright (C) 1995-2007, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.32 2007-01-03 08:42:15 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
17 * http://www.loc.gov/marc/specifications/speccharmarc8.html
37 #include <yaz/yaz-util.h>
/* Forward declarations of the MARC-8 decoding routines, one per character
 * set table (the comment in yaz_read_marc8 says they are generated by
 * charconv.tcl).  yaz_read_marc8_comb() selects one from the current escape
 * mode: 1 Basic ASCII/ANSEL, 2 Greek ('g'), 3 subscripts, 4 superscripts,
 * 5 Basic Hebrew, 6 Cyrillic, 7 Arabic, 8 Greek ('S'), 9 EACC (CJK).
 * The return value is used as the decoded character and *no_read receives
 * the number of input bytes consumed; per that same comment, a sequence
 * that does not match yields 0 with *no_read set to 1.
 * NOTE(review): *combining presumably flags a combining character -
 * confirm against the generated code. */
39 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
40 size_t *no_read, int *combining);
41 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
42 size_t *no_read, int *combining);
43 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
44 size_t *no_read, int *combining);
45 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
46 size_t *no_read, int *combining);
47 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
48 size_t *no_read, int *combining);
49 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
50 size_t *no_read, int *combining);
51 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
52 size_t *no_read, int *combining);
53 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
54 size_t *no_read, int *combining);
55 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
56 size_t *no_read, int *combining);
/* Forward declarations of the reverse tables used when writing MARC-8.
 * lookup_marc8() calls them in the order 1..9 on the UTF-8 encoding of a
 * single character; the signature is the same as for the yaz_marc8_N_conv
 * routines. */
59 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
60 size_t *no_read, int *combining);
61 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
62 size_t *no_read, int *combining);
63 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
64 size_t *no_read, int *combining);
65 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
66 size_t *no_read, int *combining);
67 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
68 size_t *no_read, int *combining);
69 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
70 size_t *no_read, int *combining);
71 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
72 size_t *no_read, int *combining);
73 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
74 size_t *no_read, int *combining);
75 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
76 size_t *no_read, int *combining);
/* Conversion descriptor behind yaz_iconv_t: the per-encoding handlers plus
 * the state carried from one character to the next.
 * NOTE(review): members such as my_errno, comb_offset, comb_size,
 * marc8_esc_mode, no_read_x and iconv_cd are used elsewhere in this file
 * but their declarations are not visible in this listing - check the full
 * struct before relying on this summary. */
78 struct yaz_iconv_struct {
/* Optional hook called by yaz_iconv() on the input buffer; yaz_iconv()
 * afterwards subtracts *no_read from its remaining-input count. */
81 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
82 size_t inbytesleft, size_t *no_read);
/* Decodes one character from inbuf; bytes consumed go to *no_read. */
83 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
84 size_t inbytesleft, size_t *no_read);
/* Encodes character x into *outbuf, updating *outbytesleft. */
85 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
86 char **outbuf, size_t *outbytesleft,
/* Characters (and their input lengths) queued by yaz_read_marc8(). */
92 unsigned long comb_x[8];
93 size_t comb_no_read[8];
/* NOTE(review): presumably the character kept when the output buffer was
 * full (see yaz_iconv) - its uses are not visible here, confirm. */
95 unsigned long unget_x;
/* Character held back by yaz_write_ISO8859_1() so that it can be merged
 * with a following combining mark; 0 when nothing is pending. */
99 unsigned long compose_char;
/* MARC-8 writer state: queued combining bytes and their count ... */
101 unsigned long write_marc8_comb_ch[8];
102 size_t write_marc8_comb_no;
/* ... byte that flush_combos() appends after the queued data (set to
 * 0xEC or 0xFB in yaz_write_marc8_2), 0 when unused ... */
103 unsigned write_marc8_second_half_char;
/* ... pending character that flush_combos() emits, 0 when none ... */
104 unsigned long write_marc8_last;
/* ... and the escape sequence of the currently selected output page
 * (reset to "\033(B" by yaz_iconv). */
105 const char *write_marc8_page_chr;
/* latin1_comb table: each entry pairs a base character (x1) and a Unicode
 * combining mark (x2) with the single Latin-1 code they compose to (the
 * third value, read as .y by the functions in this file).  All lookups in
 * this file stop at the first entry whose x1 is 0. */
109 unsigned long x1, x2;
112 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
113 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
114 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
115 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
116 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
117 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
118 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
119 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
120 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
121 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
122 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
123 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
124 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
125 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
126 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
127 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
128 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
129 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
130 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
131 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
132 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
133 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
134 /* omitted: 0xd7 MULTIPLICATION SIGN */
135 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
136 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
137 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
138 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
139 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
140 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
141 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
142 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
143 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
144 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
145 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
146 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
147 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
148 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
149 /* omitted: 0xe6 LATIN SMALL LETTER AE */
150 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
151 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
152 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
153 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
154 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
155 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
156 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
157 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
158 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
159 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
160 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
161 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
162 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
163 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
164 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
165 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
166 /* omitted: 0xf7 DIVISION SIGN */
167 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
168 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
169 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
170 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
171 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
172 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
173 /* omitted: 0xfe LATIN SMALL LETTER THORN */
174 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
/* Read handler for ISO-8859-1 input: the character value is simply the
 * first input byte, inp[0]. */
179 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
180 size_t inbytesleft, size_t *no_read)
182 unsigned long x = inp[0];
/* Init handler installed by yaz_iconv_open() for UTF-8 input.  It inspects
 * the first bytes of the input, may record YAZ_ICONV_EINVAL in cd->my_errno
 * and reports through *no_read how many leading bytes the caller should
 * skip (yaz_iconv subtracts that count from its remaining input). */
188 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
189 size_t inbytesleft, size_t *no_read)
198 cd->my_errno = YAZ_ICONV_EINVAL;
/* NOTE(review): the UTF-8 byte order mark is EF BB BF, yet this test only
 * succeeds when the second byte is NOT 0xbb.  If this is the BOM check the
 * first comparison looks inverted (expected inp[1] == 0xbb) - confirm and
 * fix. */
201 if (inp[1] != 0xbb && inp[2] == 0xbf)
/* Decode one UTF-8 sequence starting at inp.
 *
 * The lead byte selects the sequence length: up to 0xdf two bytes, up to
 * 0xef three, up to 0xf7 four, up to 0xfb five and up to 0xfd six; each
 * branch also requires that many bytes in inbytesleft.  The payload bits of
 * the lead byte and the low six bits of every following byte are combined
 * into the returned value.
 *
 * Errors are reported through *error: YAZ_ICONV_EILSEQ for an invalid
 * sequence (a lead byte of 0xbf or lower on this branch, or 0xfe and above)
 * and YAZ_ICONV_EINVAL on the final fall-through.
 * NOTE(review): the further EILSEQ assignments inside each branch depend on
 * conditions that are not visible in this listing. */
208 unsigned long yaz_read_UTF8_char(unsigned char *inp,
209 size_t inbytesleft, size_t *no_read,
219 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
222 *error = YAZ_ICONV_EILSEQ;
224 else if (inp[0] <= 0xdf && inbytesleft >= 2)
226 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
232 *error = YAZ_ICONV_EILSEQ;
235 else if (inp[0] <= 0xef && inbytesleft >= 3)
237 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
244 *error = YAZ_ICONV_EILSEQ;
247 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
249 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
250 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
256 *error = YAZ_ICONV_EILSEQ;
259 else if (inp[0] <= 0xfb && inbytesleft >= 5)
261 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
262 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
269 *error = YAZ_ICONV_EILSEQ;
272 else if (inp[0] <= 0xfd && inbytesleft >= 6)
274 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
275 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
276 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
282 *error = YAZ_ICONV_EILSEQ;
288 *error = YAZ_ICONV_EINVAL;
/* Read handler for UTF-8 input: forwards to yaz_read_UTF8_char() and lets
 * it store any error code directly in cd->my_errno. */
293 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
294 size_t inbytesleft, size_t *no_read)
296 return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
/* Read handler for big-endian UCS-4: four input bytes are combined with
 * inp[0] as the most significant byte.  Incomplete input is reported as
 * YAZ_ICONV_EINVAL in cd->my_errno. */
299 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
300 size_t inbytesleft, size_t *no_read)
306 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* NOTE(review): inp[0] is promoted to int before the shift, so a byte of
 * 0x80 or above shifts into the sign bit; consider casting each operand to
 * unsigned long first. */
311 x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
/* Read handler for little-endian UCS-4: four input bytes are combined with
 * inp[3] as the most significant byte.  Incomplete input is reported as
 * YAZ_ICONV_EINVAL in cd->my_errno. */
317 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
318 size_t inbytesleft, size_t *no_read)
324 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* NOTE(review): inp[3] is promoted to int before the shift, so a byte of
 * 0x80 or above shifts into the sign bit; consider casting each operand to
 * unsigned long first. */
329 x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
/* Read handler for native wchar_t input.  When fewer than sizeof(wchar_t)
 * bytes remain, YAZ_ICONV_EINVAL is recorded in cd->my_errno; otherwise one
 * wchar_t is copied out of the byte buffer with memcpy (so inp need not be
 * aligned) and *no_read is set to its size. */
336 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
337 size_t inbytesleft, size_t *no_read)
341 if (inbytesleft < sizeof(wchar_t))
343 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
349 memcpy (&wch, inp, sizeof(wch));
351 *no_read = sizeof(wch);
/* Forward declaration: yaz_read_marc8() below calls this helper before its
 * definition appears later in the file. */
358 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
359 size_t inbytesleft, size_t *no_read,
/* Read handler for MARC-8 input.
 *
 * Characters are read in batches: while entries remain in the queue
 * (cd->comb_offset < cd->comb_size) the next queued value and its input
 * length are handed out from cd->comb_x / cd->comb_no_read.  Otherwise up
 * to 8 characters are fetched with yaz_read_marc8_comb(), each one stored
 * in the queue together with the number of bytes it consumed, and
 * inbytesleft is reduced accordingly.
 * NOTE(review): the loop's exit test and the order in which the queued
 * entries are returned are not visible in this listing. */
362 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
363 size_t inbytesleft, size_t *no_read)
366 if (cd->comb_offset < cd->comb_size)
368 *no_read = cd->comb_no_read[cd->comb_offset];
369 x = cd->comb_x[cd->comb_offset];
371 /* special case for double-diacritic combining characters,
372 INVERTED BREVE and DOUBLE TILDE.
373 We'll increment the no_read counter by 1, since we want to skip over
374 the processing of the closing ligature character
376 /* this code is no longer necessary.. our handlers code in
377 yaz_marc8_?_conv (generated by charconv.tcl) now returns
378 0 and no_read=1 when a sequence does not match the input.
379 The SECOND HALFs in codetables.xml produces a non-existant
380 entry in the conversion trie.. Hence when met, the input byte is
381 skipped as it should (in yaz_iconv)
384 if (x == 0x0361 || x == 0x0360)
392 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
395 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
398 cd->comb_x[cd->comb_size] = x;
399 cd->comb_no_read[cd->comb_size] = *no_read;
401 inbytesleft = inbytesleft - *no_read;
/* Read handler for the "MARC8s" variant: reads like yaz_read_marc8(), then
 * tries to fold a base character plus one combining mark into a single
 * Latin-1 code.  The fold is attempted only when a character was read and
 * exactly one entry is queued (cd->comb_size == 1); on a latin1_comb match
 * the queued entry's input length is added to *no_read and the composed
 * code replaces x. */
406 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
407 size_t inbytesleft, size_t *no_read)
409 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
410 if (x && cd->comb_size == 1)
412 /* For MARC8s we try to get a Latin-1 page code out of it */
414 for (i = 0; latin1_comb[i].x1; i++)
415 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
417 *no_read += cd->comb_no_read[0];
419 x = latin1_comb[i].y;
/* Decode one MARC-8 character, handling character-set escapes first.
 *
 * Every leading ESC (byte 27) sequence is consumed: intermediate bytes from
 * the set "(,$!)-" are skipped and the byte after them becomes the new
 * cd->marc8_esc_mode; the bytes consumed are added to *no_read.  If the
 * input ends inside an escape sequence YAZ_ICONV_EINVAL is recorded.
 * The remaining input is then decoded by the table that matches the escape
 * mode, and the bytes that table consumed are added to *no_read.
 * YAZ_ICONV_EILSEQ is recorded when the mode has no table (presumably the
 * switch default - the label is not visible in this listing). */
426 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
427 size_t inbytesleft, size_t *no_read,
431 while(inbytesleft >= 1 && inp[0] == 27)
433 size_t inbytesleft0 = inbytesleft;
436 while(inbytesleft > 0 && strchr("(,$!)-", *inp))
441 if (inbytesleft <= 0)
444 cd->my_errno = YAZ_ICONV_EINVAL;
447 cd->marc8_esc_mode = *inp++;
449 (*no_read) += inbytesleft0 - inbytesleft;
451 if (inbytesleft <= 0)
456 size_t no_read_sub = 0;
459 switch(cd->marc8_esc_mode)
461 case 'B': /* Basic ASCII */
462 case 'E': /* ANSEL */
463 case 's': /* ASCII */
464 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
466 case 'g': /* Greek */
467 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
469 case 'b': /* Subscripts */
470 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
472 case 'p': /* Superscripts */
473 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
475 case '2': /* Basic Hebrew */
476 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
478 case 'N': /* Basic Cyrillic */
479 case 'Q': /* Extended Cyrillic */
480 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
482 case '3': /* Basic Arabic */
483 case '4': /* Extended Arabic */
484 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
486 case 'S': /* Greek */
487 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
489 case '1': /* Chinese, Japanese, Korean (EACC) */
490 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
494 cd->my_errno = YAZ_ICONV_EILSEQ;
497 *no_read += no_read_sub;
/* Write handler for UTF-8 output: forwards to yaz_write_UTF8_char() and
 * lets it store any error code directly in cd->my_errno. */
502 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
503 char **outbuf, size_t *outbytesleft,
506 return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
/* Encode character x as UTF-8 at *outbuf.
 *
 * The length is chosen from the value: up to 0x7f one byte, 0x7ff two,
 * 0xffff three, 0x1fffff four, 0x3ffffff five, anything larger six.  Each
 * form is used only when *outbytesleft has room for it; *outbytesleft is
 * reduced by the bytes written and *outbuf is advanced past them.
 * When no form fits, *error is set to YAZ_ICONV_E2BIG.
 * Note that a value whose natural form does not fit falls through to the
 * longer forms, because the size test is part of each range test. */
509 size_t yaz_write_UTF8_char(unsigned long x,
510 char **outbuf, size_t *outbytesleft,
513 unsigned char *outp = (unsigned char *) *outbuf;
515 if (x <= 0x7f && *outbytesleft >= 1)
517 *outp++ = (unsigned char) x;
520 else if (x <= 0x7ff && *outbytesleft >= 2)
522 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
523 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
524 (*outbytesleft) -= 2;
526 else if (x <= 0xffff && *outbytesleft >= 3)
528 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
529 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
530 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
531 (*outbytesleft) -= 3;
533 else if (x <= 0x1fffff && *outbytesleft >= 4)
535 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
536 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
537 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
538 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
539 (*outbytesleft) -= 4;
541 else if (x <= 0x3ffffff && *outbytesleft >= 5)
543 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
544 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
545 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
546 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
547 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
548 (*outbytesleft) -= 5;
550 else if (*outbytesleft >= 6)
552 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
553 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
554 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
555 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
556 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
557 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
558 (*outbytesleft) -= 6;
562 *error = YAZ_ICONV_E2BIG; /* not room for output */
565 *outbuf = (char *) outp;
/* Write handler for ISO-8859-1 output, with composition of a base
 * character and a following combining mark.
 *
 * If a character is being held in cd->compose_char, latin1_comb is
 * searched for the pair (compose_char, x); on a match x is replaced by the
 * composed Latin-1 code, otherwise the held character is written on its
 * own.  Either way the held character is then cleared.  When there is no
 * room in the output, YAZ_ICONV_E2BIG is recorded and compose_char is kept.
 *
 * Afterwards, unless this is the last character, an x strictly between 32
 * and 127 is held back as the new compose_char instead of being written.
 * Any other x outside 1..255 records YAZ_ICONV_EILSEQ, a full output
 * buffer records YAZ_ICONV_E2BIG, and otherwise x is written as one byte. */
570 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
571 char **outbuf, size_t *outbytesleft,
574 /* list of two char unicode sequence that, when combined, are
575 equivalent to single unicode chars that can be represented in
577 Regular iconv on Linux at least does not seem to convert these,
578 but since MARC-8 to UTF-8 generates these composed sequence
579 we get a better chance of a successful MARC-8 -> ISO-8859-1
581 unsigned char *outp = (unsigned char *) *outbuf;
583 if (cd->compose_char)
586 for (i = 0; latin1_comb[i].x1; i++)
587 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
589 x = latin1_comb[i].y;
592 if (*outbytesleft < 1)
593 { /* no room. Retain compose_char and bail out */
594 cd->my_errno = YAZ_ICONV_E2BIG;
597 if (!latin1_comb[i].x1)
598 { /* not found. Just write compose_char */
599 *outp++ = (unsigned char) cd->compose_char;
601 *outbuf = (char *) outp;
603 /* compose_char used so reset it. x now holds current char */
604 cd->compose_char = 0;
607 if (!last && x > 32 && x < 127 && cd->compose_char == 0)
609 cd->compose_char = x;
612 else if (x > 255 || x < 1)
614 cd->my_errno = YAZ_ICONV_EILSEQ;
617 else if (*outbytesleft < 1)
619 cd->my_errno = YAZ_ICONV_E2BIG;
622 *outp++ = (unsigned char) x;
624 *outbuf = (char *) outp;
/* Write handler for big-endian UCS-4: emits x as four bytes, most
 * significant first, and reduces *outbytesleft by 4.  With fewer than four
 * bytes of room YAZ_ICONV_E2BIG is recorded in cd->my_errno.  *outbuf is
 * updated to the position after the written bytes. */
629 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
630 char **outbuf, size_t *outbytesleft,
633 unsigned char *outp = (unsigned char *) *outbuf;
634 if (*outbytesleft >= 4)
636 *outp++ = (unsigned char) (x>>24);
637 *outp++ = (unsigned char) (x>>16);
638 *outp++ = (unsigned char) (x>>8);
639 *outp++ = (unsigned char) x;
640 (*outbytesleft) -= 4;
644 cd->my_errno = YAZ_ICONV_E2BIG;
647 *outbuf = (char *) outp;
/* Write handler for little-endian UCS-4: emits x as four bytes, least
 * significant first, and reduces *outbytesleft by 4.  With fewer than four
 * bytes of room YAZ_ICONV_E2BIG is recorded in cd->my_errno.  *outbuf is
 * updated to the position after the written bytes. */
651 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
652 char **outbuf, size_t *outbytesleft,
655 unsigned char *outp = (unsigned char *) *outbuf;
656 if (*outbytesleft >= 4)
658 *outp++ = (unsigned char) x;
659 *outp++ = (unsigned char) (x>>8);
660 *outp++ = (unsigned char) (x>>16);
661 *outp++ = (unsigned char) (x>>24);
662 (*outbytesleft) -= 4;
666 cd->my_errno = YAZ_ICONV_E2BIG;
669 *outbuf = (char *) outp;
/* Map character x to its MARC-8 code.
 *
 * x is first encoded as UTF-8 into a local buffer (one byte of the buffer
 * is kept in reserve, and strlen is later used on it, so it is treated as a
 * C string).  If that encoding fails YAZ_ICONV_EILSEQ is recorded.
 * The reverse tables yaz_marc8r_1_conv .. yaz_marc8r_9_conv are then called
 * in order on the UTF-8 bytes, and *page_chr is set to the escape sequence
 * that selects a table's character set.  YAZ_ICONV_EILSEQ is also recorded
 * at the end of the chain - presumably when no table matched.
 * NOTE(review): the tests between the table calls, and the *page_chr values
 * for tables 2-4, are not visible in this listing. */
673 static unsigned long lookup_marc8(yaz_iconv_t cd,
674 unsigned long x, int *comb,
675 const char **page_chr)
678 char *utf8_outbuf = utf8_buf;
679 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
681 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
682 if (r == (size_t)(-1))
684 cd->my_errno = YAZ_ICONV_EILSEQ;
690 size_t inbytesleft, no_read_sub = 0;
694 inp = (unsigned char *) utf8_buf;
695 inbytesleft = strlen(utf8_buf);
697 x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
700 *page_chr = "\033(B";
703 x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
709 x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
715 x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
721 x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
724 *page_chr = "\033(2";
727 x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
730 *page_chr = "\033(N";
733 x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
736 *page_chr = "\033(3";
739 x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
742 *page_chr = "\033(S";
745 x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
748 *page_chr = "\033$1";
751 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Emit the pending MARC-8 output state and reset it.
 *
 * The pending character cd->write_marc8_last is split into up to three
 * bytes (bits 16-23, 8-15 and 0-7) staged in out_buf, and the queued
 * combining bytes are taken from cd->write_marc8_comb_ch.  Before anything
 * is copied the function checks that the staged bytes, the queued combining
 * bytes and one extra byte fit in *outbytesleft; if not it records
 * YAZ_ICONV_E2BIG and returns (size_t) -1.
 * The staged bytes are then copied to *outbuf, followed by
 * cd->write_marc8_second_half_char when that is non-zero.  Finally the
 * pending character, the combining count and the second-half byte are all
 * cleared.
 * NOTE(review): how the combining bytes reach the output relative to the
 * base character is not visible in this listing. */
756 static size_t flush_combos(yaz_iconv_t cd,
757 char **outbuf, size_t *outbytesleft)
759 unsigned long y = cd->write_marc8_last;
762 size_t i, out_no = 0;
767 byte = (unsigned char )((y>>16) & 0xff);
769 out_buf[out_no++] = byte;
770 byte = (unsigned char)((y>>8) & 0xff);
772 out_buf[out_no++] = byte;
773 byte = (unsigned char )(y & 0xff);
775 out_buf[out_no++] = byte;
777 if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
779 cd->my_errno = YAZ_ICONV_E2BIG;
780 return (size_t) (-1);
783 for (i = 0; i < cd->write_marc8_comb_no; i++)
785 /* all MARC-8 combined characters are simple bytes */
786 byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
790 memcpy(*outbuf, out_buf, out_no);
792 (*outbytesleft) -= out_no;
793 if (cd->write_marc8_second_half_char)
795 *(*outbuf)++ = cd->write_marc8_second_half_char;
799 cd->write_marc8_last = 0;
800 cd->write_marc8_comb_no = 0;
801 cd->write_marc8_second_half_char = 0;
/* Write one character to MARC-8 output, using the buffered writer state.
 *
 * x is translated with lookup_marc8(), which also yields the escape
 * sequence (page_chr) of the character set it belongs to; the function can
 * return (size_t) -1 right after that lookup.
 * A combining character is queued in cd->write_marc8_comb_ch, but only
 * while fewer than 6 are queued.  For the double diacritics a second-half
 * byte is remembered: 0xFB for 0x0360 and 0xEC for the other case
 * (presumably 0x0361 - that test is not visible in this listing).
 * Otherwise the pending state is flushed with flush_combos().  If the
 * character's page differs from cd->write_marc8_page_chr, at least 8 bytes
 * of output are required (else YAZ_ICONV_E2BIG and (size_t) -1), the new
 * page is recorded and its escape sequence is copied to the output.  The
 * old pages "\033p", "\033g" and "\033b" get special "Technique 1" leave
 * handling.  The translated code then becomes the pending
 * cd->write_marc8_last.
 * NOTE(review): the branch conditions and the escape bytes written when
 * leaving a Technique 1 page are not visible in this listing. */
805 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
806 char **outbuf, size_t *outbytesleft,
810 const char *page_chr = 0;
811 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
814 return (size_t) (-1);
819 cd->write_marc8_second_half_char = 0xEC;
820 else if (x == 0x0360)
821 cd->write_marc8_second_half_char = 0xFB;
823 if (cd->write_marc8_comb_no < 6)
824 cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
828 size_t r = flush_combos(cd, outbuf, outbytesleft);
829 const char *old_page_chr = cd->write_marc8_page_chr;
832 if (strcmp(page_chr, old_page_chr))
835 const char *page_out = page_chr;
837 if (*outbytesleft < 8)
839 cd->my_errno = YAZ_ICONV_E2BIG;
841 return (size_t) (-1);
843 cd->write_marc8_page_chr = page_chr;
845 if (!strcmp(old_page_chr, "\033p")
846 || !strcmp(old_page_chr, "\033g")
847 || !strcmp(old_page_chr, "\033b"))
849 /* Technique 1 leave */
851 if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
853 /* Must leave script + enter new page */
854 plen = strlen(page_out);
855 memcpy(*outbuf, page_out, plen);
857 (*outbytesleft) -= plen;
861 plen = strlen(page_out);
862 memcpy(*outbuf, page_out, plen);
864 (*outbytesleft) -= plen;
866 cd->write_marc8_last = y;
870 size_t r = flush_combos(cd, outbuf, outbytesleft);
874 cd->write_marc8_comb_no--;
876 cd->write_marc8_last = 0;
/* Write handler for MARC-8 output.
 *
 * A character that equals the composed code of a latin1_comb entry is
 * written in decomposed form: first the base character (x1), then the
 * combining mark (x2), both through yaz_write_marc8_2().  If the second
 * write fails with YAZ_ICONV_E2BIG, the saved output position and the saved
 * pending character are restored so the caller can retry with a bigger
 * buffer.  Every other character goes straight to yaz_write_marc8_2(). */
883 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
884 char **outbuf, size_t *outbytesleft,
888 for (i = 0; latin1_comb[i].x1; i++)
890 if (x == latin1_comb[i].y)
893 /* save the output pointers .. */
894 char *outbuf0 = *outbuf;
895 size_t outbytesleft0 = *outbytesleft;
/* NOTE(review): write_marc8_last is declared unsigned long but is saved in
 * an int here; values wider than int would be truncated on restore. */
896 int last_ch = cd->write_marc8_last;
898 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
899 outbuf, outbytesleft, 0);
902 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
903 outbuf, outbytesleft, last);
904 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
906 /* not enough room. reset output to original values */
908 *outbytesleft = outbytesleft0;
909 cd->write_marc8_last = last_ch;
914 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
/* Write handler for native wchar_t output.  When at least sizeof(wchar_t)
 * bytes are free, one wchar_t is copied into the output with memcpy and
 * *outbytesleft is reduced by its size; otherwise YAZ_ICONV_E2BIG is
 * recorded in cd->my_errno.  *outbuf is updated from the local write
 * pointer at the end. */
919 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
920 char **outbuf, size_t *outbytesleft,
923 unsigned char *outp = (unsigned char *) *outbuf;
925 if (*outbytesleft >= sizeof(wchar_t))
928 memcpy(outp, &wch, sizeof(wch));
930 (*outbytesleft) -= sizeof(wch);
934 cd->my_errno = YAZ_ICONV_E2BIG;
937 *outbuf = (char *) outp;
/* Returns non-zero when both a read handler and a write handler are set on
 * cd, i.e. the conversion is handled by the routines in this file. */
942 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
944 return cd->read_handle && cd->write_handle;
/* Create a conversion descriptor from encoding fromcode to encoding tocode.
 *
 * The descriptor is allocated with xmalloc and its error code starts as
 * YAZ_ICONV_UNKNOWN.  The encoding names are matched with yaz_matchstr
 * against the built-in set (UTF8, ISO88591, UCS4, UCS4LE, MARC8, MARC8s,
 * WCHAR_T) to pick a read handler and a write handler; UTF8 input also gets
 * an init handler.  Both MARC8 and MARC8s output use yaz_write_marc8.
 * If either handler is still missing, the pair is passed to the system
 * iconv_open().
 * NOTE(review): the failure paths (what is returned when iconv_open fails
 * or when no handler is found) are not visible in this listing. */
947 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
949 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
951 cd->write_handle = 0;
954 cd->my_errno = YAZ_ICONV_UNKNOWN;
956 /* a useful hack: if fromcode has leading @,
957 the library will not use YAZ's own conversions .. */
958 if (fromcode[0] == '@')
962 if (!yaz_matchstr(fromcode, "UTF8"))
964 cd->read_handle = yaz_read_UTF8;
965 cd->init_handle = yaz_init_UTF8;
967 else if (!yaz_matchstr(fromcode, "ISO88591"))
968 cd->read_handle = yaz_read_ISO8859_1;
969 else if (!yaz_matchstr(fromcode, "UCS4"))
970 cd->read_handle = yaz_read_UCS4;
971 else if (!yaz_matchstr(fromcode, "UCS4LE"))
972 cd->read_handle = yaz_read_UCS4LE;
973 else if (!yaz_matchstr(fromcode, "MARC8"))
974 cd->read_handle = yaz_read_marc8;
975 else if (!yaz_matchstr(fromcode, "MARC8s"))
976 cd->read_handle = yaz_read_marc8s;
978 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
979 cd->read_handle = yaz_read_wchar_t;
982 if (!yaz_matchstr(tocode, "UTF8"))
983 cd->write_handle = yaz_write_UTF8;
984 else if (!yaz_matchstr(tocode, "ISO88591"))
985 cd->write_handle = yaz_write_ISO8859_1;
986 else if (!yaz_matchstr (tocode, "UCS4"))
987 cd->write_handle = yaz_write_UCS4;
988 else if (!yaz_matchstr(tocode, "UCS4LE"))
989 cd->write_handle = yaz_write_UCS4LE;
990 else if (!yaz_matchstr(tocode, "MARC8"))
991 cd->write_handle = yaz_write_marc8;
992 else if (!yaz_matchstr(tocode, "MARC8s"))
993 cd->write_handle = yaz_write_marc8;
995 else if (!yaz_matchstr(tocode, "WCHAR_T"))
996 cd->write_handle = yaz_write_wchar_t;
1001 if (!cd->read_handle || !cd->write_handle)
1003 cd->iconv_cd = iconv_open (tocode, fromcode);
1004 if (cd->iconv_cd == (iconv_t) (-1))
1011 if (!cd->read_handle || !cd->write_handle)
/* Convert input to output, in the manner of iconv(3).
 *
 * One path hands the call to the system iconv() and translates a failure's
 * errno (via yaz_errno) into YAZ_ICONV_E2BIG, YAZ_ICONV_EINVAL,
 * YAZ_ICONV_EILSEQ or YAZ_ICONV_UNKNOWN in cd->my_errno.
 *
 * The built-in path tests for a null inbuf or *inbuf, and contains setup
 * code that sets the error code to YAZ_ICONV_UNKNOWN, runs the optional
 * init handler (skipping the bytes it reports in no_read) and resets the
 * MARC-8 reader and writer state to its defaults: escape mode 'B', empty
 * combining queues, no held characters, output page "\033(B".
 *
 * The main loop reads one character with cd->read_handle and writes it with
 * cd->write_handle, passing a "last" flag that is 1 when no input remains
 * after this character.  If the write fails with YAZ_ICONV_E2BIG the length
 * of the character just read is saved in cd->no_read_x, since (per the
 * comment in the body) the read handler cannot be asked again.  After a
 * successful write the input pointer and count are advanced by no_read.
 * NOTE(review): the conditions that select between these paths, and the
 * return values, are not visible in this listing. */
1021 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1022 char **outbuf, size_t *outbytesleft)
1031 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1032 if (r == (size_t)(-1))
1034 switch (yaz_errno())
1037 cd->my_errno = YAZ_ICONV_E2BIG;
1040 cd->my_errno = YAZ_ICONV_EINVAL;
1043 cd->my_errno = YAZ_ICONV_EILSEQ;
1046 cd->my_errno = YAZ_ICONV_UNKNOWN;
1052 if (inbuf == 0 || *inbuf == 0)
1055 cd->my_errno = YAZ_ICONV_UNKNOWN;
1062 if (cd->init_handle)
1065 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1066 *inbytesleft, &no_read);
1069 if (cd->my_errno == YAZ_ICONV_EINVAL)
1074 *inbytesleft -= no_read;
1077 cd->marc8_esc_mode = 'B';
1079 cd->comb_offset = cd->comb_size = 0;
1080 cd->compose_char = 0;
1082 cd->write_marc8_comb_no = 0;
1083 cd->write_marc8_second_half_char = 0;
1084 cd->write_marc8_last = 0;
1085 cd->write_marc8_page_chr = "\033(B";
1096 if (*inbytesleft == 0)
1098 r = *inbuf - inbuf0;
1103 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1114 no_read = cd->no_read_x;
1118 r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1119 (*inbytesleft - no_read) == 0 ? 1 : 0);
1122 /* unable to write it. save it because read_handle cannot
1124 if (cd->my_errno == YAZ_ICONV_E2BIG)
1127 cd->no_read_x = no_read;
1133 *inbytesleft -= no_read;
1134 (*inbuf) += no_read;
/* Returns the error code currently stored in the descriptor (one of the
 * YAZ_ICONV_* values assigned to cd->my_errno throughout this file). */
1139 int yaz_iconv_error (yaz_iconv_t cd)
1141 return cd->my_errno;
/* Close a conversion descriptor; the underlying system iconv descriptor is
 * released with iconv_close().
 * NOTE(review): the guard around iconv_close and the release of cd itself
 * are not visible in this listing - confirm against the full source. */
1144 int yaz_iconv_close (yaz_iconv_t cd)
1148 iconv_close (cd->iconv_cd);
1157 * indent-tabs-mode: nil
1159 * vim: shiftwidth=4 tabstop=8 expandtab