2 * Copyright (C) 1995-2007, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.36 2007-03-17 00:10:40 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
17 * http://www.loc.gov/marc/specifications/speccharmarc8.html
37 #include <yaz/yaz-util.h>
/* Forward declarations of the MARC-8 decoding routines (tables 1..9).
   yaz_read_marc8_comb() calls exactly one of them, chosen by the current
   escape mode (cd->marc8_esc_mode), passing the input pointer, the number
   of bytes left, a counter that receives the bytes consumed (*no_read) and
   an int out-parameter named "combining" (presumably flags a combining
   character - confirm against the generated code).
   Per the note in yaz_read_marc8(), these routines are generated by
   charconv.tcl and return 0 with no_read=1 when the input does not match. */
39 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
40 size_t *no_read, int *combining);
41 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
42 size_t *no_read, int *combining);
43 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
44 size_t *no_read, int *combining);
45 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
46 size_t *no_read, int *combining);
47 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
48 size_t *no_read, int *combining);
49 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
50 size_t *no_read, int *combining);
51 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
52 size_t *no_read, int *combining);
53 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
54 size_t *no_read, int *combining);
55 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
56 size_t *no_read, int *combining);
/* Forward declarations of the reverse (to MARC-8) routines (tables 1..9).
   lookup_marc8() writes a character as UTF-8 into a scratch buffer and
   calls these in turn, 1 through 9, on that buffer; each call is paired
   with the escape sequence of one MARC-8 page (stored in *page_chr).
   NOTE(review): presumably the first non-zero result wins - the lines that
   would show this are not visible here. */
59 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
60 size_t *no_read, int *combining);
61 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
62 size_t *no_read, int *combining);
63 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
64 size_t *no_read, int *combining);
65 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
66 size_t *no_read, int *combining);
67 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
68 size_t *no_read, int *combining);
69 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
70 size_t *no_read, int *combining);
71 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
72 size_t *no_read, int *combining);
73 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
74 size_t *no_read, int *combining);
75 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
76 size_t *no_read, int *combining);
78 struct yaz_iconv_struct {
81 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
82 size_t inbytesleft, size_t *no_read);
83 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
84 size_t inbytesleft, size_t *no_read);
85 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
86 char **outbuf, size_t *outbytesleft,
88 size_t (*flush_handle)(yaz_iconv_t cd,
89 char **outbuf, size_t *outbytesleft);
94 unsigned long comb_x[8];
95 size_t comb_no_read[8];
97 unsigned long unget_x;
101 unsigned long compose_char;
103 unsigned long write_marc8_comb_ch[8];
104 size_t write_marc8_comb_no;
105 unsigned write_marc8_second_half_char;
106 unsigned long write_marc8_last;
107 const char *write_marc8_page_chr;
111 unsigned long x1, x2;
114 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
115 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
116 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
117 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
118 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
119 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
120 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
121 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
122 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
123 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
124 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
125 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
126 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
127 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
128 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
129 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
130 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
131 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
132 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
133 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
134 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
135 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
136 /* omitted: 0xd7 MULTIPLICATION SIGN */
137 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
138 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
139 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
140 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
141 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
142 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
143 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
144 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
145 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
146 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
147 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
148 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
149 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
150 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
151 /* omitted: 0xe6 LATIN SMALL LETTER AE */
152 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
153 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
154 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
155 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
156 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
157 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
158 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
159 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
160 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
161 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
162 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
163 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
164 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
165 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
166 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
167 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
168 /* omitted: 0xf7 DIVISION SIGN */
169 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
170 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
171 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
172 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
173 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
174 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
175 /* omitted: 0xfe LATIN SMALL LETTER THORN */
176 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
181 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
182 size_t inbytesleft, size_t *no_read)
184 unsigned long x = inp[0];
190 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
191 size_t inbytesleft, size_t *no_read)
200 cd->my_errno = YAZ_ICONV_EINVAL;
/* The UTF-8 byte order mark is EF BB BF, so the second and third bytes must
   BOTH match; the former test (inp[1] != 0xbb) could never accept a BOM. */
203 if (inp[1] == 0xbb && inp[2] == 0xbf)
210 unsigned long yaz_read_UTF8_char(unsigned char *inp,
211 size_t inbytesleft, size_t *no_read,
221 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
224 *error = YAZ_ICONV_EILSEQ;
226 else if (inp[0] <= 0xdf && inbytesleft >= 2)
228 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
234 *error = YAZ_ICONV_EILSEQ;
237 else if (inp[0] <= 0xef && inbytesleft >= 3)
239 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
246 *error = YAZ_ICONV_EILSEQ;
249 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
251 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
252 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
258 *error = YAZ_ICONV_EILSEQ;
261 else if (inp[0] <= 0xfb && inbytesleft >= 5)
263 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
264 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
271 *error = YAZ_ICONV_EILSEQ;
274 else if (inp[0] <= 0xfd && inbytesleft >= 6)
276 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
277 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
278 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
284 *error = YAZ_ICONV_EILSEQ;
290 *error = YAZ_ICONV_EINVAL;
295 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
296 size_t inbytesleft, size_t *no_read)
298 return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
301 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
302 size_t inbytesleft, size_t *no_read)
308 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Assemble the big-endian 32-bit value. Widen each byte before shifting:
   an unsigned char promotes to int, so inp[0]<<24 overflows int when the
   top bit is set (undefined behaviour; a negative result would also
   sign-extend into a 64-bit unsigned long). */
313 x = ((unsigned long) inp[0]<<24) | ((unsigned long) inp[1]<<16) | ((unsigned long) inp[2]<<8) | inp[3];
319 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
320 size_t inbytesleft, size_t *no_read)
326 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Assemble the little-endian 32-bit value. Widen each byte before shifting:
   an unsigned char promotes to int, so inp[3]<<24 overflows int when the
   top bit is set (undefined behaviour; a negative result would also
   sign-extend into a 64-bit unsigned long). */
331 x = ((unsigned long) inp[3]<<24) | ((unsigned long) inp[2]<<16) | ((unsigned long) inp[1]<<8) | inp[0];
338 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
339 size_t inbytesleft, size_t *no_read)
343 if (inbytesleft < sizeof(wchar_t))
345 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
351 memcpy (&wch, inp, sizeof(wch));
353 *no_read = sizeof(wch);
359 static unsigned long yaz_read_advancegreek(yaz_iconv_t cd, unsigned char *inp,
360 size_t inbytesleft, size_t *no_read)
368 while (inbytesleft > 0)
374 else if (*inp == 0x9e)
378 else if (*inp == 0x9f)
388 if (inbytesleft == 0)
390 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
618 static size_t yaz_write_advancegreek(yaz_iconv_t cd, unsigned long x,
619 char **outbuf, size_t *outbytesleft,
623 unsigned char *out = (unsigned char*) *outbuf;
624 if (*outbytesleft < 3)
626 cd->my_errno = YAZ_ICONV_E2BIG; /* not room for output */
631 case 0x03ac : out[k++]=0x9d; out[k++]=0x81; break;
632 case 0x03ad : out[k++]=0x9d; out[k++]=0x85; break;
633 case 0x03ae : out[k++]=0x9d; out[k++]=0x87; break;
634 case 0x03af : out[k++]=0x9d; out[k++]=0x89; break;
635 case 0x03cc : out[k++]=0x9d; out[k++]=0x8f; break;
636 case 0x03cd : out[k++]=0x9d; out[k++]=0x95; break;
637 case 0x03ce : out[k++]=0x9d; out[k++]=0x99; break;
638 case 0x0390 : out[k++]=0x9d; out[k++]=0x9e; out[k++]=0x89; break;
639 case 0x03b0 : out[k++]=0x9d; out[k++]=0x9e; out[k++]=0x95; break;
640 case 0x0386 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x81; break;
641 case 0x0388 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x85; break;
642 case 0x0389 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x87; break;
643 case 0x038a : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x89; break;
644 case 0x038c : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x8f; break;
645 case 0x038e : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x95; break;
646 case 0x038f : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x99; break;
647 case 0x03ca : out[k++]=0x9e; out[k++]=0x89; break;
648 case 0x03cb : out[k++]=0x9e; out[k++]=0x95; break;
649 case 0x03aa : out[k++]=0x9e; out[k++]=0x9f; out[k++]=0x89; break;
650 case 0x03ab : out[k++]=0x9e; out[k++]=0x9f; out[k++]=0x95; break;
651 case 0x0391 : out[k++]=0x9f; out[k++]=0x81; break;
652 case 0x0392 : out[k++]=0x9f; out[k++]=0x82; break;
653 case 0x0393 : out[k++]=0x9f; out[k++]=0x83; break;
654 case 0x0394 : out[k++]=0x9f; out[k++]=0x84; break;
655 case 0x0395 : out[k++]=0x9f; out[k++]=0x85; break;
656 case 0x0396 : out[k++]=0x9f; out[k++]=0x86; break;
657 case 0x0397 : out[k++]=0x9f; out[k++]=0x87; break;
658 case 0x0398 : out[k++]=0x9f; out[k++]=0x88; break;
659 case 0x0399 : out[k++]=0x9f; out[k++]=0x89; break;
660 case 0x039a : out[k++]=0x9f; out[k++]=0x8a; break;
661 case 0x039b : out[k++]=0x9f; out[k++]=0x8b; break;
662 case 0x039c : out[k++]=0x9f; out[k++]=0x8c; break;
663 case 0x039d : out[k++]=0x9f; out[k++]=0x8d; break;
664 case 0x039e : out[k++]=0x9f; out[k++]=0x8e; break;
665 case 0x039f : out[k++]=0x9f; out[k++]=0x8f; break;
666 case 0x03a0 : out[k++]=0x9f; out[k++]=0x90; break;
667 case 0x03a1 : out[k++]=0x9f; out[k++]=0x91; break;
668 case 0x03a3 : out[k++]=0x9f; out[k++]=0x93; break;
669 case 0x03a4 : out[k++]=0x9f; out[k++]=0x94; break;
670 case 0x03a5 : out[k++]=0x9f; out[k++]=0x95; break;
671 case 0x03a6 : out[k++]=0x9f; out[k++]=0x96; break;
672 case 0x03a7 : out[k++]=0x9f; out[k++]=0x97; break;
673 case 0x03a8 : out[k++]=0x9f; out[k++]=0x98; break;
674 case 0x03a9 : out[k++]=0x9f; out[k++]=0x99; break;
675 case 0x03b1 : out[k++]=0x81; break;
676 case 0x03b2 : out[k++]=0x82; break;
677 case 0x03b3 : out[k++]=0x83; break;
678 case 0x03b4 : out[k++]=0x84; break;
679 case 0x03b5 : out[k++]=0x85; break;
680 case 0x03b6 : out[k++]=0x86; break;
681 case 0x03b7 : out[k++]=0x87; break;
682 case 0x03b8 : out[k++]=0x88; break;
683 case 0x03b9 : out[k++]=0x89; break;
684 case 0x03ba : out[k++]=0x8a; break;
685 case 0x03bb : out[k++]=0x8b; break;
686 case 0x03bc : out[k++]=0x8c; break;
687 case 0x03bd : out[k++]=0x8d; break;
688 case 0x03be : out[k++]=0x8e; break;
689 case 0x03bf : out[k++]=0x8f; break;
690 case 0x03c0 : out[k++]=0x90; break;
691 case 0x03c1 : out[k++]=0x91; break;
692 case 0x03c2 : out[k++]=0x92; break;
693 case 0x03c3 : out[k++]=0x93; break;
694 case 0x03c4 : out[k++]=0x94; break;
695 case 0x03c5 : out[k++]=0x95; break;
696 case 0x03c6 : out[k++]=0x96; break;
/* GREEK SMALL LETTER CHI is 0x97: the lower-case table runs sequentially
   (phi 0x96, chi 0x97, psi 0x98) and capital chi is 0x9f 0x97.  The former
   value 0x96 made chi indistinguishable from phi. */
697 case 0x03c7 : out[k++]=0x97; break;
698 case 0x03c8 : out[k++]=0x98; break;
699 case 0x03c9 : out[k++]=0x99; break;
703 cd->my_errno = YAZ_ICONV_EILSEQ;
715 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
716 size_t inbytesleft, size_t *no_read,
719 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
720 size_t inbytesleft, size_t *no_read)
723 if (cd->comb_offset < cd->comb_size)
725 *no_read = cd->comb_no_read[cd->comb_offset];
726 x = cd->comb_x[cd->comb_offset];
728 /* special case for double-diacritic combining characters,
729 INVERTED BREVE and DOUBLE TILDE.
730 We'll increment the no_read counter by 1, since we want to skip over
731 the processing of the closing ligature character
733 /* this code is no longer necessary: our handler code in
734 yaz_marc8_?_conv (generated by charconv.tcl) now returns
735 0 and no_read=1 when a sequence does not match the input.
736 The SECOND HALFs in codetables.xml produce a non-existent
737 entry in the conversion trie. Hence, when met, the input byte is
738 skipped as it should be (in yaz_iconv)
741 if (x == 0x0361 || x == 0x0360)
749 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
753 if (inbytesleft == 0 && cd->comb_size)
755 cd->my_errno = YAZ_ICONV_EINVAL;
760 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
763 cd->comb_x[cd->comb_size] = x;
764 cd->comb_no_read[cd->comb_size] = *no_read;
766 inbytesleft = inbytesleft - *no_read;
771 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
772 size_t inbytesleft, size_t *no_read)
774 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
775 if (x && cd->comb_size == 1)
777 /* For MARC8s we try to get a Latin-1 page code out of it */
779 for (i = 0; latin1_comb[i].x1; i++)
780 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
782 *no_read += cd->comb_no_read[0];
784 x = latin1_comb[i].y;
791 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
792 size_t inbytesleft, size_t *no_read,
796 while(inbytesleft >= 1 && inp[0] == 27)
798 size_t inbytesleft0 = inbytesleft;
/* Skip the intermediate characters of the escape sequence.  strchr() also
   matches the string's terminating NUL, so a NUL input byte must be
   excluded explicitly or it would be consumed as an intermediate. */
801 while(inbytesleft > 0 && *inp && strchr("(,$!)-", *inp))
806 if (inbytesleft <= 0)
809 cd->my_errno = YAZ_ICONV_EINVAL;
812 cd->marc8_esc_mode = *inp++;
814 (*no_read) += inbytesleft0 - inbytesleft;
816 if (inbytesleft <= 0)
821 size_t no_read_sub = 0;
824 switch(cd->marc8_esc_mode)
826 case 'B': /* Basic ASCII */
827 case 'E': /* ANSEL */
828 case 's': /* ASCII */
829 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
831 case 'g': /* Greek */
832 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
834 case 'b': /* Subscripts */
835 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
837 case 'p': /* Superscripts */
838 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
840 case '2': /* Basic Hebrew */
841 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
843 case 'N': /* Basic Cyrillic */
844 case 'Q': /* Extended Cyrillic */
845 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
847 case '3': /* Basic Arabic */
848 case '4': /* Extended Arabic */
849 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
851 case 'S': /* Greek */
852 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
854 case '1': /* Chinese, Japanese, Korean (EACC) */
855 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
859 cd->my_errno = YAZ_ICONV_EILSEQ;
862 *no_read += no_read_sub;
867 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
868 char **outbuf, size_t *outbytesleft,
871 return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
874 size_t yaz_write_UTF8_char(unsigned long x,
875 char **outbuf, size_t *outbytesleft,
878 unsigned char *outp = (unsigned char *) *outbuf;
880 if (x <= 0x7f && *outbytesleft >= 1)
882 *outp++ = (unsigned char) x;
885 else if (x <= 0x7ff && *outbytesleft >= 2)
887 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
888 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
889 (*outbytesleft) -= 2;
891 else if (x <= 0xffff && *outbytesleft >= 3)
893 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
894 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
895 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
896 (*outbytesleft) -= 3;
898 else if (x <= 0x1fffff && *outbytesleft >= 4)
900 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
901 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
902 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
903 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
904 (*outbytesleft) -= 4;
906 else if (x <= 0x3ffffff && *outbytesleft >= 5)
908 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
909 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
910 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
911 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
912 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
913 (*outbytesleft) -= 5;
915 else if (*outbytesleft >= 6)
917 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
918 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
919 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
920 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
921 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
922 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
923 (*outbytesleft) -= 6;
927 *error = YAZ_ICONV_E2BIG; /* not room for output */
930 *outbuf = (char *) outp;
935 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
936 char **outbuf, size_t *outbytesleft,
939 /* list of two-character Unicode sequences that, when combined, are
940 equivalent to single Unicode chars that can be represented in
942 Regular iconv on Linux at least does not seem to convert these,
943 but since MARC-8 to UTF-8 generates these composed sequences
944 we get a better chance of a successful MARC-8 -> ISO-8859-1
946 unsigned char *outp = (unsigned char *) *outbuf;
948 if (cd->compose_char)
951 for (i = 0; latin1_comb[i].x1; i++)
952 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
954 x = latin1_comb[i].y;
957 if (*outbytesleft < 1)
958 { /* no room. Retain compose_char and bail out */
959 cd->my_errno = YAZ_ICONV_E2BIG;
962 if (!latin1_comb[i].x1)
963 { /* not found. Just write compose_char */
964 *outp++ = (unsigned char) cd->compose_char;
966 *outbuf = (char *) outp;
968 /* compose_char used so reset it. x now holds current char */
969 cd->compose_char = 0;
972 if (!last && x > 32 && x < 127 && cd->compose_char == 0)
974 cd->compose_char = x;
977 else if (x > 255 || x < 1)
979 cd->my_errno = YAZ_ICONV_EILSEQ;
982 else if (*outbytesleft < 1)
984 cd->my_errno = YAZ_ICONV_E2BIG;
987 *outp++ = (unsigned char) x;
989 *outbuf = (char *) outp;
994 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
995 char **outbuf, size_t *outbytesleft,
998 unsigned char *outp = (unsigned char *) *outbuf;
999 if (*outbytesleft >= 4)
1001 *outp++ = (unsigned char) (x>>24);
1002 *outp++ = (unsigned char) (x>>16);
1003 *outp++ = (unsigned char) (x>>8);
1004 *outp++ = (unsigned char) x;
1005 (*outbytesleft) -= 4;
1009 cd->my_errno = YAZ_ICONV_E2BIG;
1010 return (size_t)(-1);
1012 *outbuf = (char *) outp;
1016 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
1017 char **outbuf, size_t *outbytesleft,
1020 unsigned char *outp = (unsigned char *) *outbuf;
1021 if (*outbytesleft >= 4)
1023 *outp++ = (unsigned char) x;
1024 *outp++ = (unsigned char) (x>>8);
1025 *outp++ = (unsigned char) (x>>16);
1026 *outp++ = (unsigned char) (x>>24);
1027 (*outbytesleft) -= 4;
1031 cd->my_errno = YAZ_ICONV_E2BIG;
1032 return (size_t)(-1);
1034 *outbuf = (char *) outp;
1038 static unsigned long lookup_marc8(yaz_iconv_t cd,
1039 unsigned long x, int *comb,
1040 const char **page_chr)
1043 char *utf8_outbuf = utf8_buf;
1044 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
1046 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
1047 if (r == (size_t)(-1))
1049 cd->my_errno = YAZ_ICONV_EILSEQ;
1055 size_t inbytesleft, no_read_sub = 0;
1058 *utf8_outbuf = '\0';
1059 inp = (unsigned char *) utf8_buf;
1060 inbytesleft = strlen(utf8_buf);
1062 x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
1065 *page_chr = "\033(B";
1068 x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
1071 *page_chr = "\033g";
1074 x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
1077 *page_chr = "\033b";
1080 x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
1083 *page_chr = "\033p";
1086 x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
1089 *page_chr = "\033(2";
1092 x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
1095 *page_chr = "\033(N";
1098 x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
1101 *page_chr = "\033(3";
1104 x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
1107 *page_chr = "\033(S";
1110 x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
1113 *page_chr = "\033$1";
1116 cd->my_errno = YAZ_ICONV_EILSEQ;
1121 static size_t flush_combos(yaz_iconv_t cd,
1122 char **outbuf, size_t *outbytesleft)
1124 unsigned long y = cd->write_marc8_last;
1127 size_t i, out_no = 0;
1132 byte = (unsigned char )((y>>16) & 0xff);
1134 out_buf[out_no++] = byte;
1135 byte = (unsigned char)((y>>8) & 0xff);
1137 out_buf[out_no++] = byte;
1138 byte = (unsigned char )(y & 0xff);
1140 out_buf[out_no++] = byte;
1142 if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
1144 cd->my_errno = YAZ_ICONV_E2BIG;
1145 return (size_t) (-1);
1148 for (i = 0; i < cd->write_marc8_comb_no; i++)
1150 /* all MARC-8 combined characters are simple bytes */
1151 byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
1152 *(*outbuf)++ = byte;
1155 memcpy(*outbuf, out_buf, out_no);
1157 (*outbytesleft) -= out_no;
1158 if (cd->write_marc8_second_half_char)
1160 *(*outbuf)++ = cd->write_marc8_second_half_char;
1164 cd->write_marc8_last = 0;
1165 cd->write_marc8_comb_no = 0;
1166 cd->write_marc8_second_half_char = 0;
1170 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
1171 char **outbuf, size_t *outbytesleft,
1172 const char *page_chr)
1174 const char *old_page_chr = cd->write_marc8_page_chr;
1175 if (strcmp(page_chr, old_page_chr))
1178 const char *page_out = page_chr;
1180 if (*outbytesleft < 8)
1182 cd->my_errno = YAZ_ICONV_E2BIG;
1184 return (size_t) (-1);
1186 cd->write_marc8_page_chr = page_chr;
1188 if (!strcmp(old_page_chr, "\033p")
1189 || !strcmp(old_page_chr, "\033g")
1190 || !strcmp(old_page_chr, "\033b"))
1192 /* Technique 1 leave */
1194 if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
1196 /* Must leave script + enter new page */
1197 plen = strlen(page_out);
1198 memcpy(*outbuf, page_out, plen);
1200 (*outbytesleft) -= plen;
1201 page_out = page_chr;
1204 plen = strlen(page_out);
1205 memcpy(*outbuf, page_out, plen);
1207 (*outbytesleft) -= plen;
1213 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
1214 char **outbuf, size_t *outbytesleft,
1218 const char *page_chr = 0;
1219 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
1222 return (size_t) (-1);
1227 cd->write_marc8_second_half_char = 0xEC;
1228 else if (x == 0x0360)
1229 cd->write_marc8_second_half_char = 0xFB;
1231 if (cd->write_marc8_comb_no < 6)
1232 cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
1236 size_t r = flush_combos(cd, outbuf, outbytesleft);
1240 r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, page_chr);
1243 cd->write_marc8_last = y;
1247 size_t r = flush_combos(cd, outbuf, outbytesleft);
1251 cd->write_marc8_comb_no--;
1253 cd->write_marc8_last = 0;
1260 static size_t yaz_flush_marc8(yaz_iconv_t cd,
1261 char **outbuf, size_t *outbytesleft)
1263 size_t r = flush_combos(cd, outbuf, outbytesleft);
1266 return yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, "\033(B");
1269 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
1270 char **outbuf, size_t *outbytesleft,
1274 for (i = 0; latin1_comb[i].x1; i++)
1276 if (x == latin1_comb[i].y)
1279 /* save the output pointers .. */
1280 char *outbuf0 = *outbuf;
1281 size_t outbytesleft0 = *outbytesleft;
/* Saved so it can be restored if the two-part write runs out of room.
   write_marc8_last is an unsigned long, so keep the copy in the same type
   rather than narrowing it through int. */
1282 unsigned long last_ch = cd->write_marc8_last;
1284 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
1285 outbuf, outbytesleft, 0);
1288 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
1289 outbuf, outbytesleft, last);
1290 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
1292 /* not enough room. reset output to original values */
1294 *outbytesleft = outbytesleft0;
1295 cd->write_marc8_last = last_ch;
1300 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
1305 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
1306 char **outbuf, size_t *outbytesleft,
1309 unsigned char *outp = (unsigned char *) *outbuf;
1311 if (*outbytesleft >= sizeof(wchar_t))
1314 memcpy(outp, &wch, sizeof(wch));
1315 outp += sizeof(wch);
1316 (*outbytesleft) -= sizeof(wch);
1320 cd->my_errno = YAZ_ICONV_E2BIG;
1321 return (size_t)(-1);
1323 *outbuf = (char *) outp;
1328 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
1330 return cd->read_handle && cd->write_handle;
1333 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
1335 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
1337 cd->write_handle = 0;
1338 cd->read_handle = 0;
1339 cd->init_handle = 0;
1340 cd->flush_handle = 0;
1341 cd->my_errno = YAZ_ICONV_UNKNOWN;
1343 /* a useful hack: if fromcode has a leading @,
1344 the library will not use YAZ's own conversions .. */
1345 if (fromcode[0] == '@')
1349 if (!yaz_matchstr(fromcode, "UTF8"))
1351 cd->read_handle = yaz_read_UTF8;
1352 cd->init_handle = yaz_init_UTF8;
1354 else if (!yaz_matchstr(fromcode, "ISO88591"))
1355 cd->read_handle = yaz_read_ISO8859_1;
1356 else if (!yaz_matchstr(fromcode, "UCS4"))
1357 cd->read_handle = yaz_read_UCS4;
1358 else if (!yaz_matchstr(fromcode, "UCS4LE"))
1359 cd->read_handle = yaz_read_UCS4LE;
1360 else if (!yaz_matchstr(fromcode, "MARC8"))
1361 cd->read_handle = yaz_read_marc8;
1362 else if (!yaz_matchstr(fromcode, "MARC8s"))
1363 cd->read_handle = yaz_read_marc8s;
1364 else if (!yaz_matchstr(fromcode, "advancegreek"))
1365 cd->read_handle = yaz_read_advancegreek;
1367 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
1368 cd->read_handle = yaz_read_wchar_t;
1371 if (!yaz_matchstr(tocode, "UTF8"))
1372 cd->write_handle = yaz_write_UTF8;
1373 else if (!yaz_matchstr(tocode, "ISO88591"))
1374 cd->write_handle = yaz_write_ISO8859_1;
1375 else if (!yaz_matchstr (tocode, "UCS4"))
1376 cd->write_handle = yaz_write_UCS4;
1377 else if (!yaz_matchstr(tocode, "UCS4LE"))
1378 cd->write_handle = yaz_write_UCS4LE;
1379 else if (!yaz_matchstr(tocode, "MARC8"))
1381 cd->write_handle = yaz_write_marc8;
1382 cd->flush_handle = yaz_flush_marc8;
1384 else if (!yaz_matchstr(tocode, "MARC8s"))
1386 cd->write_handle = yaz_write_marc8;
1387 cd->flush_handle = yaz_flush_marc8;
1389 else if (!yaz_matchstr(tocode, "advancegreek"))
1391 cd->write_handle = yaz_write_advancegreek;
1394 else if (!yaz_matchstr(tocode, "WCHAR_T"))
1395 cd->write_handle = yaz_write_wchar_t;
1400 if (!cd->read_handle || !cd->write_handle)
1402 cd->iconv_cd = iconv_open (tocode, fromcode);
1403 if (cd->iconv_cd == (iconv_t) (-1))
1410 if (!cd->read_handle || !cd->write_handle)
1420 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1421 char **outbuf, size_t *outbytesleft)
1430 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1431 if (r == (size_t)(-1))
1433 switch (yaz_errno())
1436 cd->my_errno = YAZ_ICONV_E2BIG;
1439 cd->my_errno = YAZ_ICONV_EINVAL;
1442 cd->my_errno = YAZ_ICONV_EILSEQ;
1445 cd->my_errno = YAZ_ICONV_UNKNOWN;
1457 cd->my_errno = YAZ_ICONV_UNKNOWN;
1458 cd->marc8_esc_mode = 'B';
1460 cd->comb_offset = cd->comb_size = 0;
1461 cd->compose_char = 0;
1463 cd->write_marc8_comb_no = 0;
1464 cd->write_marc8_second_half_char = 0;
1465 cd->write_marc8_last = 0;
1466 cd->write_marc8_page_chr = "\033(B";
1474 if (cd->init_handle && inbuf && *inbuf)
1477 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1478 *inbytesleft, &no_read);
1481 if (cd->my_errno == YAZ_ICONV_EINVAL)
1486 *inbytesleft -= no_read;
1500 no_read = cd->no_read_x;
1502 else if (inbuf && *inbuf)
1504 if (*inbytesleft == 0)
1506 r = *inbuf - inbuf0;
1509 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1520 if (cd->flush_handle && outbuf && *outbuf)
1521 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
1528 r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1529 (*inbytesleft - no_read) == 0 ? 1 : 0);
1532 /* unable to write it. save it because read_handle cannot
1534 if (cd->my_errno == YAZ_ICONV_E2BIG)
1537 cd->no_read_x = no_read;
1543 *inbytesleft -= no_read;
1544 (*inbuf) += no_read;
1549 int yaz_iconv_error (yaz_iconv_t cd)
1551 return cd->my_errno;
1554 int yaz_iconv_close (yaz_iconv_t cd)
1558 iconv_close (cd->iconv_cd);
1567 * indent-tabs-mode: nil
1569 * vim: shiftwidth=4 tabstop=8 expandtab