2 * Copyright (C) 1995-2006, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.29 2006-08-31 18:19:53 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
17 * http://www.loc.gov/marc/specifications/speccharmarc8.html
37 #include <yaz/yaz-util.h>
/* MARC-8 to Unicode conversion routines, one per code table.  They are
   defined outside this file (a comment in yaz_read_marc8 says they are
   generated by charconv.tcl).  yaz_read_marc8_comb picks one according to
   the active escape mode and passes the input pointer, the number of bytes
   left, a slot for the number of bytes consumed and a slot for the
   combining flag. */
39 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
40 size_t *no_read, int *combining);
41 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
42 size_t *no_read, int *combining);
43 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
44 size_t *no_read, int *combining);
45 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
46 size_t *no_read, int *combining);
47 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
48 size_t *no_read, int *combining);
49 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
50 size_t *no_read, int *combining);
51 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
52 size_t *no_read, int *combining);
53 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
54 size_t *no_read, int *combining);
55 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
56 size_t *no_read, int *combining);
/* Reverse direction: lookup_marc8 hands these the UTF-8 encoding of a
   Unicode character in order to obtain the corresponding MARC-8 code. */
59 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
60 size_t *no_read, int *combining);
61 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
62 size_t *no_read, int *combining);
63 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
64 size_t *no_read, int *combining);
65 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
66 size_t *no_read, int *combining);
67 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
68 size_t *no_read, int *combining);
69 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
70 size_t *no_read, int *combining);
71 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
72 size_t *no_read, int *combining);
73 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
74 size_t *no_read, int *combining);
75 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
76 size_t *no_read, int *combining);
/* Per-conversion state.  A yaz_iconv_t handle points at one of these
   (yaz_iconv_open allocates sizeof(*cd) bytes for its yaz_iconv_t cd). */
78 struct yaz_iconv_struct {
/* optional hook that yaz_iconv runs on the input before converting;
   yaz_init_UTF8 is the one installed in this file */
81 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
82 size_t inbytesleft, size_t *no_read);
/* decodes one character from the input and reports the bytes consumed */
83 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
84 size_t inbytesleft, size_t *no_read);
/* encodes one character x into the output buffer */
85 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
86 char **outbuf, size_t *outbytesleft,
/* characters decoded ahead by yaz_read_marc8 and the number of input
   bytes each of them consumed */
92 unsigned long comb_x[8];
93 size_t comb_no_read[8];
/* NOTE(review): presumably a character held back when the writer had no
   room for it -- confirm against yaz_iconv */
95 unsigned long unget_x;
/* character held by yaz_write_ISO8859_1 while it waits to see whether a
   combining mark follows; 0 when nothing is held */
99 unsigned long compose_char;
/* MARC-8 writer: combining characters not yet written, and their count */
101 unsigned long write_marc8_comb_ch[8];
102 size_t write_marc8_comb_no;
/* MARC-8 writer: base character not yet flushed to the output */
103 unsigned long write_marc8_last;
/* MARC-8 writer: escape sequence of the code page currently selected */
104 const char *write_marc8_page_chr;
/* Table of Latin-1 letters that can be composed from two Unicode
   characters: x1 is the base letter, x2 the combining mark, and the third
   member (referenced as .y by the users of this table) the precomposed
   ISO-8859-1 code.  Users scan the table until they reach an entry whose
   x1 is 0. */
108 unsigned long x1, x2;
111 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
112 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
113 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
114 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
115 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
116 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
117 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
118 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
119 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
120 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
121 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
122 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
123 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
124 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
125 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
126 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
127 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
128 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
129 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
130 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
131 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
132 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
133 /* omitted: 0xd7 MULTIPLICATION SIGN */
134 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
135 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
136 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
137 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
138 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
139 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
140 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
141 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
142 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
143 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
144 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
145 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
146 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
147 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
148 /* omitted: 0xe6 LATIN SMALL LETTER AE */
149 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
150 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
151 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
152 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
153 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
154 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
155 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
156 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
157 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
158 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
159 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
160 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
161 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
162 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
163 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
164 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
165 /* omitted: 0xf7 DIVISION SIGN */
166 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
167 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
168 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
169 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
170 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
171 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
172 /* omitted: 0xfe LATIN SMALL LETTER THORN */
173 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
/* Read handler for ISO-8859-1 input: the first input byte is taken
   unchanged as the character value. */
178 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
179 size_t inbytesleft, size_t *no_read)
181 unsigned long x = inp[0];
187 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
188 size_t inbytesleft, size_t *no_read)
197 cd->my_errno = YAZ_ICONV_EINVAL;
200 if (inp[1] != 0xbb && inp[2] == 0xbf)
207 unsigned long yaz_read_UTF8_char(unsigned char *inp,
208 size_t inbytesleft, size_t *no_read,
218 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
221 *error = YAZ_ICONV_EILSEQ;
223 else if (inp[0] <= 0xdf && inbytesleft >= 2)
225 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
231 *error = YAZ_ICONV_EILSEQ;
234 else if (inp[0] <= 0xef && inbytesleft >= 3)
236 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
243 *error = YAZ_ICONV_EILSEQ;
246 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
248 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
249 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
255 *error = YAZ_ICONV_EILSEQ;
258 else if (inp[0] <= 0xfb && inbytesleft >= 5)
260 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
261 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
268 *error = YAZ_ICONV_EILSEQ;
271 else if (inp[0] <= 0xfd && inbytesleft >= 6)
273 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
274 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
275 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
281 *error = YAZ_ICONV_EILSEQ;
287 *error = YAZ_ICONV_EINVAL;
/* Read handler for UTF-8 input: forwards to yaz_read_UTF8_char and has
   decoding errors stored in cd->my_errno. */
292 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
293 size_t inbytesleft, size_t *no_read)
295 return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
298 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
299 size_t inbytesleft, size_t *no_read)
305 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
310 x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
316 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
317 size_t inbytesleft, size_t *no_read)
323 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
328 x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
/* Read handler for input made of native wchar_t values.  Fewer than
   sizeof(wchar_t) bytes is reported as YAZ_ICONV_EINVAL; otherwise one
   wchar_t is copied out of the input and sizeof(wchar_t) bytes are
   consumed. */
335 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
336 size_t inbytesleft, size_t *no_read)
340 if (inbytesleft < sizeof(wchar_t))
342 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* copy byte-wise instead of casting inp to wchar_t* */
348 memcpy (&wch, inp, sizeof(wch));
350 *no_read = sizeof(wch);
/* Forward declaration: yaz_read_marc8_comb is defined after the readers
   that call it. */
357 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
358 size_t inbytesleft, size_t *no_read,
/* Read handler for MARC-8.  Characters decoded by yaz_read_marc8_comb are
   kept in cd->comb_x[] together with the bytes each one used
   (cd->comb_no_read[]); a fill stores at most 8 of them.  While
   cd->comb_offset is below cd->comb_size the next stored entry is handed
   out; otherwise a new fill is started.
   NOTE(review): the order in which combining characters and their base
   character are finally returned depends on statements not reproduced
   here -- confirm before relying on it. */
361 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
362 size_t inbytesleft, size_t *no_read)
365 if (cd->comb_offset < cd->comb_size)
367 *no_read = cd->comb_no_read[cd->comb_offset];
368 x = cd->comb_x[cd->comb_offset];
370 /* special case for double-diacritic combining characters,
371 INVERTED BREVE and DOUBLE TILDE.
372 We'll increment the no_read counter by 1, since we want to skip over
373 the processing of the closing ligature character
375 /* this code is no longer necessary.. our handlers code in
376 yaz_marc8_?_conv (generated by charconv.tcl) now returns
377 0 and no_read=1 when a sequence does not match the input.
378 The SECOND HALFs in codetables.xml produces a non-existant
379 entry in the conversion trie.. Hence when met, the input byte is
380 skipped as it should (in yaz_iconv)
383 if (x == 0x0361 || x == 0x0360)
391 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
394 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
397 cd->comb_x[cd->comb_size] = x;
398 cd->comb_no_read[cd->comb_size] = *no_read;
400 inbytesleft = inbytesleft - *no_read;
/* Read handler for "MARC8s": behaves like yaz_read_marc8, except that a
   base character with exactly one stored combining character
   (cd->comb_size == 1) is looked up in latin1_comb.  On a match the pair
   is replaced by the precomposed Latin-1 code and the bytes of the
   combining character are added to *no_read. */
405 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
406 size_t inbytesleft, size_t *no_read)
408 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
409 if (x && cd->comb_size == 1)
411 /* For MARC8s we try to get a Latin-1 page code out of it */
413 for (i = 0; latin1_comb[i].x1; i++)
/* comb_x[0] is the stored combining mark, x the base letter just read */
414 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
416 *no_read += cd->comb_no_read[0];
418 x = latin1_comb[i].y;
/* Decodes one MARC-8 character.  Leading escape sequences are consumed
   first: after ESC (27) any of the bytes '(' ',' '$' '!' are skipped and
   the byte that follows becomes cd->marc8_esc_mode.  The conversion
   routine belonging to that mode is then applied to the remaining input.
   Sets cd->my_errno to YAZ_ICONV_EINVAL when an escape sequence is cut
   short and to YAZ_ICONV_EILSEQ for an unknown mode.  *no_read covers the
   escape bytes as well as the character itself. */
425 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
426 size_t inbytesleft, size_t *no_read,
430 while(inbytesleft >= 1 && inp[0] == 27)
432 size_t inbytesleft0 = inbytesleft;
/* NOTE(review): strchr also matches the string terminator, so a 0 byte
   after ESC is skipped like an intermediate character -- confirm that
   this is intended */
435 while(inbytesleft > 0 && strchr("(,$!", *inp))
/* inbytesleft is a size_t, so "<= 0" can only mean "== 0" */
440 if (inbytesleft <= 0)
443 cd->my_errno = YAZ_ICONV_EINVAL;
446 cd->marc8_esc_mode = *inp++;
448 (*no_read) += inbytesleft0 - inbytesleft;
450 if (inbytesleft <= 0)
455 size_t no_read_sub = 0;
458 switch(cd->marc8_esc_mode)
460 case 'B': /* Basic ASCII */
461 case 'E': /* ANSEL */
462 case 's': /* ASCII */
463 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
465 case 'g': /* Greek */
466 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
468 case 'b': /* Subscripts */
469 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
471 case 'p': /* Superscripts */
472 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
474 case '2': /* Basic Hebrew */
475 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
477 case 'N': /* Basic Cyrillic */
478 case 'Q': /* Extended Cyrillic */
479 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
481 case '3': /* Basic Arabic */
482 case '4': /* Extended Arabic */
483 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
485 case 'S': /* Greek */
486 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
488 case '1': /* Chinese, Japanese, Korean (EACC) */
489 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
493 cd->my_errno = YAZ_ICONV_EILSEQ;
/* bytes used by the character come on top of the escape bytes */
496 *no_read += no_read_sub;
/* Write handler for UTF-8 output: forwards to yaz_write_UTF8_char and has
   errors stored in cd->my_errno. */
501 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
502 char **outbuf, size_t *outbytesleft,
505 return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
/* Encodes x as UTF-8 at *outbuf, using the original 1-6 byte scheme:
   up to 0x7f one byte, 0x7ff two, 0xffff three, 0x1fffff four,
   0x3ffffff five, anything larger six.  *outbuf is advanced past the
   bytes written and *outbytesleft reduced accordingly.  When the buffer
   cannot hold the sequence, *error is set to YAZ_ICONV_E2BIG.
   NOTE(review): values above 0x7fffffff do not fit the six-byte form;
   the lead byte is truncated by the cast -- confirm that callers never
   pass such values. */
508 size_t yaz_write_UTF8_char(unsigned long x,
509 char **outbuf, size_t *outbytesleft,
512 unsigned char *outp = (unsigned char *) *outbuf;
514 if (x <= 0x7f && *outbytesleft >= 1)
516 *outp++ = (unsigned char) x;
519 else if (x <= 0x7ff && *outbytesleft >= 2)
521 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
522 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
523 (*outbytesleft) -= 2;
525 else if (x <= 0xffff && *outbytesleft >= 3)
527 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
528 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
529 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
530 (*outbytesleft) -= 3;
532 else if (x <= 0x1fffff && *outbytesleft >= 4)
534 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
535 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
536 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
537 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
538 (*outbytesleft) -= 4;
540 else if (x <= 0x3ffffff && *outbytesleft >= 5)
542 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
543 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
544 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
545 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
546 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
547 (*outbytesleft) -= 5;
549 else if (*outbytesleft >= 6)
551 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
552 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
553 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
554 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
555 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
556 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
557 (*outbytesleft) -= 6;
561 *error = YAZ_ICONV_E2BIG; /* not room for output */
/* publish the advanced write position to the caller */
564 *outbuf = (char *) outp;
/* Write handler for ISO-8859-1 output.  A character in the range 33..126
   that is not the last one of the input is not written at once but kept
   in cd->compose_char, because the next character may be a combining mark
   that turns the pair into a single Latin-1 letter (see latin1_comb).
   On the following call the kept character is either merged with x or
   written out on its own.  Characters outside 1..255 give
   YAZ_ICONV_EILSEQ; a full output buffer gives YAZ_ICONV_E2BIG. */
569 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
570 char **outbuf, size_t *outbytesleft,
573 /* list of two char unicode sequence that, when combined, are
574 equivalent to single unicode chars that can be represented in
576 Regular iconv on Linux at least does not seem to convert these,
577 but since MARC-8 to UTF-8 generates these composed sequence
578 we get a better chance of a successful MARC-8 -> ISO-8859-1
580 unsigned char *outp = (unsigned char *) *outbuf;
582 if (cd->compose_char)
585 for (i = 0; latin1_comb[i].x1; i++)
586 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
588 x = latin1_comb[i].y;
591 if (*outbytesleft < 1)
592 { /* no room. Retain compose_char and bail out */
593 cd->my_errno = YAZ_ICONV_E2BIG;
/* the scan ended on the entry with x1 == 0, i.e. no pair matched */
596 if (!latin1_comb[i].x1)
597 { /* not found. Just write compose_char */
598 *outp++ = (unsigned char) cd->compose_char;
600 *outbuf = (char *) outp;
602 /* compose_char used so reset it. x now holds current char */
603 cd->compose_char = 0;
/* keep a possible base letter back until the next character is known */
606 if (!last && x > 32 && x < 127 && cd->compose_char == 0)
608 cd->compose_char = x;
611 else if (x > 255 || x < 1)
613 cd->my_errno = YAZ_ICONV_EILSEQ;
616 else if (*outbytesleft < 1)
618 cd->my_errno = YAZ_ICONV_E2BIG;
621 *outp++ = (unsigned char) x;
623 *outbuf = (char *) outp;
/* Write handler for big-endian UCS-4: stores x as four bytes, most
   significant byte first.  With fewer than four bytes of room
   cd->my_errno is set to YAZ_ICONV_E2BIG. */
628 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
629 char **outbuf, size_t *outbytesleft,
632 unsigned char *outp = (unsigned char *) *outbuf;
633 if (*outbytesleft >= 4)
635 *outp++ = (unsigned char) (x>>24);
636 *outp++ = (unsigned char) (x>>16);
637 *outp++ = (unsigned char) (x>>8);
638 *outp++ = (unsigned char) x;
639 (*outbytesleft) -= 4;
643 cd->my_errno = YAZ_ICONV_E2BIG;
646 *outbuf = (char *) outp;
/* Write handler for little-endian UCS-4: stores x as four bytes, least
   significant byte first.  With fewer than four bytes of room
   cd->my_errno is set to YAZ_ICONV_E2BIG. */
650 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
651 char **outbuf, size_t *outbytesleft,
654 unsigned char *outp = (unsigned char *) *outbuf;
655 if (*outbytesleft >= 4)
657 *outp++ = (unsigned char) x;
658 *outp++ = (unsigned char) (x>>8);
659 *outp++ = (unsigned char) (x>>16);
660 *outp++ = (unsigned char) (x>>24);
661 (*outbytesleft) -= 4;
665 cd->my_errno = YAZ_ICONV_E2BIG;
668 *outbuf = (char *) outp;
/* Maps the Unicode character x to MARC-8.  x is first encoded as UTF-8
   into a local buffer; that byte string is then offered to the reverse
   tables yaz_marc8r_1_conv .. yaz_marc8r_9_conv.  *comb is filled in by
   the table routine and *page_chr receives the escape sequence of the
   code page the result belongs to.  cd->my_errno is set to
   YAZ_ICONV_EILSEQ when the UTF-8 step fails.
   NOTE(review): presumably the first table that yields a non-zero code
   wins and the final EILSEQ means "no table matched" -- the tests between
   the calls are not reproduced here, confirm. */
672 static unsigned long lookup_marc8(yaz_iconv_t cd,
673 unsigned long x, int *comb,
674 const char **page_chr)
677 char *utf8_outbuf = utf8_buf;
/* one byte is kept in reserve; strlen is applied to the buffer below */
678 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
680 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
681 if (r == (size_t)(-1))
683 cd->my_errno = YAZ_ICONV_EILSEQ;
689 size_t inbytesleft, no_read_sub = 0;
693 inp = (unsigned char *) utf8_buf;
694 inbytesleft = strlen(utf8_buf);
696 x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
699 *page_chr = "\033(B";
702 x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
708 x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
714 x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
720 x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
723 *page_chr = "\033(2";
726 x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
729 *page_chr = "\033(N";
732 x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
735 *page_chr = "\033(3";
738 x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
741 *page_chr = "\033(S";
744 x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
747 *page_chr = "\033$1";
750 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Writes out what the MARC-8 writer has been holding back: the base
   character cd->write_marc8_last and the combining characters in
   cd->write_marc8_comb_ch[].  If the output buffer is too small nothing
   is written, cd->my_errno becomes YAZ_ICONV_E2BIG and (size_t) -1 is
   returned.  Afterwards both pending items are cleared. */
755 static size_t flush_combos(yaz_iconv_t cd,
756 char **outbuf, size_t *outbytesleft)
758 unsigned long y = cd->write_marc8_last;
759 unsigned char byte, second_half = 0;
761 size_t i, out_no = 0;
/* split the base character into up to three bytes, high byte first */
766 byte = (unsigned char )((y>>16) & 0xff);
768 out_buf[out_no++] = byte;
769 byte = (unsigned char)((y>>8) & 0xff);
771 out_buf[out_no++] = byte;
772 byte = (unsigned char )(y & 0xff);
774 out_buf[out_no++] = byte;
/* room needed: base bytes + combining bytes + one possible second half */
776 if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
778 cd->my_errno = YAZ_ICONV_E2BIG;
779 return (size_t) (-1);
/* the combining characters are handled before the base bytes are copied */
782 for (i = 0; i < cd->write_marc8_comb_no; i++)
784 /* all MARC-8 combined characters are simple bytes */
785 byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
788 else if (byte == 0xFA)
794 memcpy(*outbuf, out_buf, out_no);
796 (*outbytesleft) -= out_no;
/* NOTE(review): second_half presumably closes a double-width diacritic
   and is written only when set -- the guarding test is not shown */
799 *(*outbuf)++ = second_half;
803 cd->write_marc8_last = 0;
804 cd->write_marc8_comb_no = 0;
/* Core of the MARC-8 writer for one Unicode character x.  The character
   is translated with lookup_marc8.  Combining characters are only stored
   (up to 6 of them) until their base character is known.  For any other
   character the pending output is flushed first, an escape sequence is
   emitted if the code page differs from the one in effect, and the new
   character is remembered in cd->write_marc8_last for a later flush.
   Returns (size_t) -1 on failure with the reason in cd->my_errno. */
808 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
809 char **outbuf, size_t *outbytesleft,
813 const char *page_chr = 0;
814 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
817 return (size_t) (-1);
/* NOTE(review): the array has 8 slots but only 6 are used; anything
   beyond that appears to be dropped -- confirm */
821 if (cd->write_marc8_comb_no < 6)
822 cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
826 size_t r = flush_combos(cd, outbuf, outbytesleft);
827 const char *old_page_chr = cd->write_marc8_page_chr;
/* strcmp is non-zero when the page changes */
830 if (strcmp(page_chr, old_page_chr))
833 const char *page_out = page_chr;
/* demand generous room before emitting any escape sequence */
835 if (*outbytesleft < 8)
837 cd->my_errno = YAZ_ICONV_E2BIG;
839 return (size_t) (-1);
841 cd->write_marc8_page_chr = page_chr;
843 if (!strcmp(old_page_chr, "\033p")
844 || !strcmp(old_page_chr, "\033g")
845 || !strcmp(old_page_chr, "\033b"))
847 /* Technique 1 leave */
849 if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
851 /* Must leave script + enter new page */
852 plen = strlen(page_out);
853 memcpy(*outbuf, page_out, plen);
855 (*outbytesleft) -= plen;
859 plen = strlen(page_out);
860 memcpy(*outbuf, page_out, plen);
862 (*outbytesleft) -= plen;
864 cd->write_marc8_last = y;
/* NOTE(review): presumably executed when x is the last input character,
   so that nothing stays buffered -- the guarding test is not shown */
868 size_t r = flush_combos(cd, outbuf, outbytesleft);
872 cd->write_marc8_comb_no--;
874 cd->write_marc8_last = 0;
/* Write handler for MARC-8.  A precomposed Latin-1 letter listed in
   latin1_comb is written as its two parts, each passed to
   yaz_write_marc8_2; if that fails with YAZ_ICONV_E2BIG the output
   position and cd->write_marc8_last are put back to their values from
   before the attempt.  Every other character goes straight to
   yaz_write_marc8_2. */
881 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
882 char **outbuf, size_t *outbytesleft,
886 for (i = 0; latin1_comb[i].x1; i++)
888 if (x == latin1_comb[i].y)
891 /* save the output pointers .. */
892 char *outbuf0 = *outbuf;
893 size_t outbytesleft0 = *outbytesleft;
/* NOTE(review): write_marc8_last is an unsigned long held in an int here */
894 int last_ch = cd->write_marc8_last;
896 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
897 outbuf, outbytesleft, 0);
900 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
901 outbuf, outbytesleft, last);
902 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
904 /* not enough room. reset output to original values */
906 *outbytesleft = outbytesleft0;
907 cd->write_marc8_last = last_ch;
912 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
/* Write handler producing native wchar_t values: one wchar_t is copied
   into the output when at least sizeof(wchar_t) bytes are free, otherwise
   cd->my_errno is set to YAZ_ICONV_E2BIG. */
917 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
918 char **outbuf, size_t *outbytesleft,
921 unsigned char *outp = (unsigned char *) *outbuf;
923 if (*outbytesleft >= sizeof(wchar_t))
/* copy byte-wise instead of casting outp to wchar_t* */
926 memcpy(outp, &wch, sizeof(wch));
928 (*outbytesleft) -= sizeof(wch);
932 cd->my_errno = YAZ_ICONV_E2BIG;
935 *outbuf = (char *) outp;
/* Returns non-zero when the handle has both a read and a write handler
   set, i.e. the conversion is carried out by the converters in this file. */
940 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
942 return cd->read_handle && cd->write_handle;
/* Creates a conversion handle for fromcode -> tocode.  The names are
   compared with yaz_matchstr against the encodings implemented in this
   file and the matching read and write handlers are installed.  When
   either side has no built-in handler, iconv_open is tried with the same
   two names.
   NOTE(review): what is returned when neither route works, and which
   preprocessor guards surround the iconv and wchar_t parts, is not
   visible here -- confirm. */
945 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
947 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
949 cd->write_handle = 0;
952 cd->my_errno = YAZ_ICONV_UNKNOWN;
954 /* a useful hack: if fromcode has leading @,
955 the library will not use YAZ's own conversions .. */
956 if (fromcode[0] == '@')
960 if (!yaz_matchstr(fromcode, "UTF8"))
/* UTF-8 is the only source encoding that also gets an init handler */
962 cd->read_handle = yaz_read_UTF8;
963 cd->init_handle = yaz_init_UTF8;
965 else if (!yaz_matchstr(fromcode, "ISO88591"))
966 cd->read_handle = yaz_read_ISO8859_1;
967 else if (!yaz_matchstr(fromcode, "UCS4"))
968 cd->read_handle = yaz_read_UCS4;
969 else if (!yaz_matchstr(fromcode, "UCS4LE"))
970 cd->read_handle = yaz_read_UCS4LE;
971 else if (!yaz_matchstr(fromcode, "MARC8"))
972 cd->read_handle = yaz_read_marc8;
973 else if (!yaz_matchstr(fromcode, "MARC8s"))
974 cd->read_handle = yaz_read_marc8s;
976 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
977 cd->read_handle = yaz_read_wchar_t;
980 if (!yaz_matchstr(tocode, "UTF8"))
981 cd->write_handle = yaz_write_UTF8;
982 else if (!yaz_matchstr(tocode, "ISO88591"))
983 cd->write_handle = yaz_write_ISO8859_1;
984 else if (!yaz_matchstr (tocode, "UCS4"))
985 cd->write_handle = yaz_write_UCS4;
986 else if (!yaz_matchstr(tocode, "UCS4LE"))
987 cd->write_handle = yaz_write_UCS4LE;
/* on the output side MARC8 and MARC8s share one writer */
988 else if (!yaz_matchstr(tocode, "MARC8"))
989 cd->write_handle = yaz_write_marc8;
990 else if (!yaz_matchstr(tocode, "MARC8s"))
991 cd->write_handle = yaz_write_marc8;
993 else if (!yaz_matchstr(tocode, "WCHAR_T"))
994 cd->write_handle = yaz_write_wchar_t;
/* not fully covered by the built-in converters: try the system iconv */
999 if (!cd->read_handle || !cd->write_handle)
1001 cd->iconv_cd = iconv_open (tocode, fromcode);
1002 if (cd->iconv_cd == (iconv_t) (-1))
1009 if (!cd->read_handle || !cd->write_handle)
/* Converts input at *inbuf into *outbuf, following the calling convention
   of iconv(): both buffer pointers and both byte counts are updated as
   data is consumed and produced.  When the handle wraps a system iconv
   descriptor the call is passed on to iconv() and its errno is translated
   into a YAZ_ICONV_* code in cd->my_errno.  Otherwise the built-in
   handlers are used: the init handler (if any) may skip leading bytes,
   the conversion state is reset, and characters are moved one at a time
   from read_handle to write_handle.
   NOTE(review): the conditions that decide when the init handler and the
   state reset run are not reproduced here -- confirm before changing. */
1019 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1020 char **outbuf, size_t *outbytesleft)
1029 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1030 if (r == (size_t)(-1))
1032 switch (yaz_errno())
1035 cd->my_errno = YAZ_ICONV_E2BIG;
1038 cd->my_errno = YAZ_ICONV_EINVAL;
1041 cd->my_errno = YAZ_ICONV_EILSEQ;
1044 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* a null input buffer is handled separately from normal conversion */
1050 if (inbuf == 0 || *inbuf == 0)
1053 cd->my_errno = YAZ_ICONV_UNKNOWN;
1060 if (cd->init_handle)
1063 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1064 *inbytesleft, &no_read);
1067 if (cd->my_errno == YAZ_ICONV_EINVAL)
1072 *inbytesleft -= no_read;
/* start state for the MARC-8 reader, the Latin-1 writer and the MARC-8
   writer */
1075 cd->marc8_esc_mode = 'B';
1077 cd->comb_offset = cd->comb_size = 0;
1078 cd->compose_char = 0;
1080 cd->write_marc8_comb_no = 0;
1081 cd->write_marc8_last = 0;
1082 cd->write_marc8_page_chr = "\033(B";
1093 if (*inbytesleft == 0)
1095 r = *inbuf - inbuf0;
1100 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1111 no_read = cd->no_read_x;
/* the last argument tells the writer whether this character uses up the
   rest of the input */
1115 r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1116 (*inbytesleft - no_read) == 0 ? 1 : 0);
1119 /* unable to write it. save it because read_handle cannot
1121 if (cd->my_errno == YAZ_ICONV_E2BIG)
1124 cd->no_read_x = no_read;
1130 *inbytesleft -= no_read;
1131 (*inbuf) += no_read;
/* Returns the error code stored in the handle (one of the YAZ_ICONV_*
   values assigned to my_errno by the converters). */
1136 int yaz_iconv_error (yaz_iconv_t cd)
1138 return cd->my_errno;
/* Releases a conversion handle; a system iconv descriptor held by the
   handle is closed with iconv_close.
   NOTE(review): the guard around iconv_close, the freeing of cd and the
   return value are not visible here -- confirm. */
1141 int yaz_iconv_close (yaz_iconv_t cd)
1145 iconv_close (cd->iconv_cd);
1154 * indent-tabs-mode: nil
1156 * vim: shiftwidth=4 tabstop=8 expandtab