2 * Copyright (C) 1995-2007, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.37 2007-03-20 21:37:32 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
17 * http://www.loc.gov/marc/specifications/speccharmarc8.html
37 #include <yaz/yaz-util.h>
39 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
40 size_t *no_read, int *combining);
41 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
42 size_t *no_read, int *combining);
43 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
44 size_t *no_read, int *combining);
45 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
46 size_t *no_read, int *combining);
47 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
48 size_t *no_read, int *combining);
49 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
50 size_t *no_read, int *combining);
51 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
52 size_t *no_read, int *combining);
53 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
54 size_t *no_read, int *combining);
55 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
56 size_t *no_read, int *combining);
59 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
60 size_t *no_read, int *combining);
61 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
62 size_t *no_read, int *combining);
63 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
64 size_t *no_read, int *combining);
65 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
66 size_t *no_read, int *combining);
67 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
68 size_t *no_read, int *combining);
69 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
70 size_t *no_read, int *combining);
71 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
72 size_t *no_read, int *combining);
73 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
74 size_t *no_read, int *combining);
75 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
76 size_t *no_read, int *combining);
78 struct yaz_iconv_struct {
81 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
82 size_t inbytesleft, size_t *no_read);
83 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
84 size_t inbytesleft, size_t *no_read);
85 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
86 char **outbuf, size_t *outbytesleft);
87 size_t (*flush_handle)(yaz_iconv_t cd,
88 char **outbuf, size_t *outbytesleft);
93 unsigned long comb_x[8];
94 size_t comb_no_read[8];
96 unsigned long unget_x;
100 unsigned long compose_char;
102 unsigned long write_marc8_comb_ch[8];
103 size_t write_marc8_comb_no;
104 unsigned write_marc8_second_half_char;
105 unsigned long write_marc8_last;
106 const char *write_marc8_page_chr;
110 unsigned long x1, x2;
113 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
114 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
115 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
116 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
117 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
118 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
119 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
120 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
121 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
122 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
123 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
124 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
125 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
126 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
127 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
128 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
129 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
130 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
131 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
132 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
133 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
134 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
135 /* omitted: 0xd7 MULTIPLICATION SIGN */
136 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
137 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
138 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
139 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
140 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
141 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
142 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
143 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
144 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
145 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
146 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
147 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
148 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
149 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
150 /* omitted: 0xe6 LATIN SMALL LETTER AE */
151 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
152 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
153 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
154 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
155 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
156 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
157 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
158 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
159 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
160 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
161 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
162 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
163 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
164 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
165 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
166 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
167 /* omitted: 0xf7 DIVISION SIGN */
168 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
169 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
170 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
171 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
172 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
173 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
174 /* omitted: 0xfe LATIN SMALL LETTER THORN */
175 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
180 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
181 size_t inbytesleft, size_t *no_read)
183 unsigned long x = inp[0];
/* Init handler installed for UTF-8 input (see yaz_iconv_open).
   Tests input bytes 1 and 2 against the tail of the UTF-8 byte order
   mark, which is EF BB BF.
   NOTE(review): only part of this function is visible in this listing. */
189 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
190 size_t inbytesleft, size_t *no_read)
199 cd->my_errno = YAZ_ICONV_EINVAL;
202 if (inp[1] == 0xbb && inp[2] == 0xbf) /* both BOM tail bytes must match */
209 unsigned long yaz_read_UTF8_char(unsigned char *inp,
210 size_t inbytesleft, size_t *no_read,
220 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
223 *error = YAZ_ICONV_EILSEQ;
225 else if (inp[0] <= 0xdf && inbytesleft >= 2)
227 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
233 *error = YAZ_ICONV_EILSEQ;
236 else if (inp[0] <= 0xef && inbytesleft >= 3)
238 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
245 *error = YAZ_ICONV_EILSEQ;
248 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
250 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
251 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
257 *error = YAZ_ICONV_EILSEQ;
260 else if (inp[0] <= 0xfb && inbytesleft >= 5)
262 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
263 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
270 *error = YAZ_ICONV_EILSEQ;
273 else if (inp[0] <= 0xfd && inbytesleft >= 6)
275 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
276 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
277 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
283 *error = YAZ_ICONV_EILSEQ;
289 *error = YAZ_ICONV_EINVAL;
294 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
295 size_t inbytesleft, size_t *no_read)
297 return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
/* Read handler for big-endian UCS-4: assembles one code point from four
   input bytes, most significant byte first. Sets YAZ_ICONV_EINVAL on
   incomplete input.
   NOTE(review): only part of this function is visible in this listing. */
300 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
301 size_t inbytesleft, size_t *no_read)
307 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
312 x = ((unsigned long) inp[0]<<24) | ((unsigned long) inp[1]<<16) | ((unsigned long) inp[2]<<8) | inp[3]; /* widen before shifting: int << 24 overflows for bytes >= 0x80 */
/* Read handler for little-endian UCS-4: assembles one code point from four
   input bytes, least significant byte first. Sets YAZ_ICONV_EINVAL on
   incomplete input.
   NOTE(review): only part of this function is visible in this listing. */
318 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
319 size_t inbytesleft, size_t *no_read)
325 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
330 x = ((unsigned long) inp[3]<<24) | ((unsigned long) inp[2]<<16) | ((unsigned long) inp[1]<<8) | inp[0]; /* widen before shifting: int << 24 overflows for bytes >= 0x80 */
337 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
338 size_t inbytesleft, size_t *no_read)
342 if (inbytesleft < sizeof(wchar_t))
344 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
350 memcpy (&wch, inp, sizeof(wch));
352 *no_read = sizeof(wch);
358 static unsigned long yaz_read_advancegreek(yaz_iconv_t cd, unsigned char *inp,
359 size_t inbytesleft, size_t *no_read)
367 while (inbytesleft > 0)
373 else if (*inp == 0x9e)
377 else if (*inp == 0x9f)
387 if (inbytesleft == 0)
389 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Write handler for the "advancegreek" encoding: maps one Unicode Greek
   code point to a 1-3 byte sequence in the output buffer. Prefix bytes
   0x9d, 0x9e and 0x9f precede the base letter byte (0x81..0x99) for the
   accented, diaeresis and capital forms listed below. Sets YAZ_ICONV_E2BIG
   when fewer than 3 output bytes are available.
   NOTE(review): only part of this function is visible in this listing. */
617 static size_t yaz_write_advancegreek(yaz_iconv_t cd, unsigned long x,
618 char **outbuf, size_t *outbytesleft)
621 unsigned char *out = (unsigned char*) *outbuf;
622 if (*outbytesleft < 3)
624 cd->my_errno = YAZ_ICONV_E2BIG; /* not room for output */
629 case 0x03ac : out[k++]=0x9d; out[k++]=0x81; break;
630 case 0x03ad : out[k++]=0x9d; out[k++]=0x85; break;
631 case 0x03ae : out[k++]=0x9d; out[k++]=0x87; break;
632 case 0x03af : out[k++]=0x9d; out[k++]=0x89; break;
633 case 0x03cc : out[k++]=0x9d; out[k++]=0x8f; break;
634 case 0x03cd : out[k++]=0x9d; out[k++]=0x95; break;
635 case 0x03ce : out[k++]=0x9d; out[k++]=0x99; break;
636 case 0x0390 : out[k++]=0x9d; out[k++]=0x9e; out[k++]=0x89; break;
637 case 0x03b0 : out[k++]=0x9d; out[k++]=0x9e; out[k++]=0x95; break;
638 case 0x0386 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x81; break;
639 case 0x0388 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x85; break;
640 case 0x0389 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x87; break;
641 case 0x038a : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x89; break;
642 case 0x038c : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x8f; break;
643 case 0x038e : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x95; break;
644 case 0x038f : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x99; break;
645 case 0x03ca : out[k++]=0x9e; out[k++]=0x89; break;
646 case 0x03cb : out[k++]=0x9e; out[k++]=0x95; break;
647 case 0x03aa : out[k++]=0x9e; out[k++]=0x9f; out[k++]=0x89; break;
648 case 0x03ab : out[k++]=0x9e; out[k++]=0x9f; out[k++]=0x95; break;
649 case 0x0391 : out[k++]=0x9f; out[k++]=0x81; break;
650 case 0x0392 : out[k++]=0x9f; out[k++]=0x82; break;
651 case 0x0393 : out[k++]=0x9f; out[k++]=0x83; break;
652 case 0x0394 : out[k++]=0x9f; out[k++]=0x84; break;
653 case 0x0395 : out[k++]=0x9f; out[k++]=0x85; break;
654 case 0x0396 : out[k++]=0x9f; out[k++]=0x86; break;
655 case 0x0397 : out[k++]=0x9f; out[k++]=0x87; break;
656 case 0x0398 : out[k++]=0x9f; out[k++]=0x88; break;
657 case 0x0399 : out[k++]=0x9f; out[k++]=0x89; break;
658 case 0x039a : out[k++]=0x9f; out[k++]=0x8a; break;
659 case 0x039b : out[k++]=0x9f; out[k++]=0x8b; break;
660 case 0x039c : out[k++]=0x9f; out[k++]=0x8c; break;
661 case 0x039d : out[k++]=0x9f; out[k++]=0x8d; break;
662 case 0x039e : out[k++]=0x9f; out[k++]=0x8e; break;
663 case 0x039f : out[k++]=0x9f; out[k++]=0x8f; break;
664 case 0x03a0 : out[k++]=0x9f; out[k++]=0x90; break;
665 case 0x03a1 : out[k++]=0x9f; out[k++]=0x91; break;
666 case 0x03a3 : out[k++]=0x9f; out[k++]=0x93; break;
667 case 0x03a4 : out[k++]=0x9f; out[k++]=0x94; break;
668 case 0x03a5 : out[k++]=0x9f; out[k++]=0x95; break;
669 case 0x03a6 : out[k++]=0x9f; out[k++]=0x96; break;
670 case 0x03a7 : out[k++]=0x9f; out[k++]=0x97; break;
671 case 0x03a8 : out[k++]=0x9f; out[k++]=0x98; break;
672 case 0x03a9 : out[k++]=0x9f; out[k++]=0x99; break;
673 case 0x03b1 : out[k++]=0x81; break;
674 case 0x03b2 : out[k++]=0x82; break;
675 case 0x03b3 : out[k++]=0x83; break;
676 case 0x03b4 : out[k++]=0x84; break;
677 case 0x03b5 : out[k++]=0x85; break;
678 case 0x03b6 : out[k++]=0x86; break;
679 case 0x03b7 : out[k++]=0x87; break;
680 case 0x03b8 : out[k++]=0x88; break;
681 case 0x03b9 : out[k++]=0x89; break;
682 case 0x03ba : out[k++]=0x8a; break;
683 case 0x03bb : out[k++]=0x8b; break;
684 case 0x03bc : out[k++]=0x8c; break;
685 case 0x03bd : out[k++]=0x8d; break;
686 case 0x03be : out[k++]=0x8e; break;
687 case 0x03bf : out[k++]=0x8f; break;
688 case 0x03c0 : out[k++]=0x90; break;
689 case 0x03c1 : out[k++]=0x91; break;
690 case 0x03c2 : out[k++]=0x92; break;
691 case 0x03c3 : out[k++]=0x93; break;
692 case 0x03c4 : out[k++]=0x94; break;
693 case 0x03c5 : out[k++]=0x95; break;
694 case 0x03c6 : out[k++]=0x96; break;
695 case 0x03c7 : out[k++]=0x97; break; /* chi: 0x97, matching capital chi (9f 97); 0x96 is phi */
696 case 0x03c8 : out[k++]=0x98; break;
697 case 0x03c9 : out[k++]=0x99; break;
701 cd->my_errno = YAZ_ICONV_EILSEQ;
713 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
714 size_t inbytesleft, size_t *no_read,
717 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
718 size_t inbytesleft, size_t *no_read)
721 if (cd->comb_offset < cd->comb_size)
723 *no_read = cd->comb_no_read[cd->comb_offset];
724 x = cd->comb_x[cd->comb_offset];
726 /* special case for double-diacritic combining characters,
727 INVERTED BREVE and DOUBLE TILDE.
728 We'll increment the no_read counter by 1, since we want to skip over
729 the processing of the closing ligature character
731 /* this code is no longer necessary.. our handlers code in
732 yaz_marc8_?_conv (generated by charconv.tcl) now returns
733 0 and no_read=1 when a sequence does not match the input.
734 The SECOND HALFs in codetables.xml produce a non-existent
735 entry in the conversion trie.. Hence when met, the input byte is
736 skipped as it should (in yaz_iconv)
739 if (x == 0x0361 || x == 0x0360)
747 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
751 if (inbytesleft == 0 && cd->comb_size)
753 cd->my_errno = YAZ_ICONV_EINVAL;
758 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
761 cd->comb_x[cd->comb_size] = x;
762 cd->comb_no_read[cd->comb_size] = *no_read;
764 inbytesleft = inbytesleft - *no_read;
769 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
770 size_t inbytesleft, size_t *no_read)
772 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
773 if (x && cd->comb_size == 1)
775 /* For MARC8s we try to get a Latin-1 page code out of it */
777 for (i = 0; latin1_comb[i].x1; i++)
778 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
780 *no_read += cd->comb_no_read[0];
782 x = latin1_comb[i].y;
789 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
790 size_t inbytesleft, size_t *no_read,
794 while(inbytesleft >= 1 && inp[0] == 27)
796 size_t inbytesleft0 = inbytesleft;
799 while(inbytesleft > 0 && strchr("(,$!)-", *inp))
804 if (inbytesleft <= 0)
807 cd->my_errno = YAZ_ICONV_EINVAL;
810 cd->marc8_esc_mode = *inp++;
812 (*no_read) += inbytesleft0 - inbytesleft;
814 if (inbytesleft <= 0)
819 size_t no_read_sub = 0;
822 switch(cd->marc8_esc_mode)
824 case 'B': /* Basic ASCII */
825 case 'E': /* ANSEL */
826 case 's': /* ASCII */
827 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
829 case 'g': /* Greek */
830 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
832 case 'b': /* Subscripts */
833 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
835 case 'p': /* Superscripts */
836 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
838 case '2': /* Basic Hebrew */
839 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
841 case 'N': /* Basic Cyrillic */
842 case 'Q': /* Extended Cyrillic */
843 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
845 case '3': /* Basic Arabic */
846 case '4': /* Extended Arabic */
847 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
849 case 'S': /* Greek */
850 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
852 case '1': /* Chinese, Japanese, Korean (EACC) */
853 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
857 cd->my_errno = YAZ_ICONV_EILSEQ;
860 *no_read += no_read_sub;
865 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
866 char **outbuf, size_t *outbytesleft)
868 return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
871 size_t yaz_write_UTF8_char(unsigned long x,
872 char **outbuf, size_t *outbytesleft,
875 unsigned char *outp = (unsigned char *) *outbuf;
877 if (x <= 0x7f && *outbytesleft >= 1)
879 *outp++ = (unsigned char) x;
882 else if (x <= 0x7ff && *outbytesleft >= 2)
884 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
885 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
886 (*outbytesleft) -= 2;
888 else if (x <= 0xffff && *outbytesleft >= 3)
890 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
891 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
892 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
893 (*outbytesleft) -= 3;
895 else if (x <= 0x1fffff && *outbytesleft >= 4)
897 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
898 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
899 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
900 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
901 (*outbytesleft) -= 4;
903 else if (x <= 0x3ffffff && *outbytesleft >= 5)
905 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
906 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
907 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
908 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
909 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
910 (*outbytesleft) -= 5;
912 else if (*outbytesleft >= 6)
914 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
915 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
916 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
917 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
918 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
919 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
920 (*outbytesleft) -= 6;
924 *error = YAZ_ICONV_E2BIG; /* not room for output */
927 *outbuf = (char *) outp;
931 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
932 char **outbuf, size_t *outbytesleft)
934 /* list of two-char unicode sequences that, when combined, are
935 equivalent to single unicode chars that can be represented in
937 Regular iconv on Linux at least does not seem to convert these,
938 but since MARC-8 to UTF-8 generates these composed sequences
939 we get a better chance of a successful MARC-8 -> ISO-8859-1
941 unsigned char *outp = (unsigned char *) *outbuf;
943 if (cd->compose_char)
946 for (i = 0; latin1_comb[i].x1; i++)
947 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
949 x = latin1_comb[i].y;
952 if (*outbytesleft < 1)
953 { /* no room. Retain compose_char and bail out */
954 cd->my_errno = YAZ_ICONV_E2BIG;
957 if (!latin1_comb[i].x1)
958 { /* not found. Just write compose_char */
959 *outp++ = (unsigned char) cd->compose_char;
961 *outbuf = (char *) outp;
963 /* compose_char used so reset it. x now holds current char */
964 cd->compose_char = 0;
967 if (x > 32 && x < 127 && cd->compose_char == 0)
969 cd->compose_char = x;
972 else if (x > 255 || x < 1)
974 cd->my_errno = YAZ_ICONV_EILSEQ;
977 else if (*outbytesleft < 1)
979 cd->my_errno = YAZ_ICONV_E2BIG;
982 *outp++ = (unsigned char) x;
984 *outbuf = (char *) outp;
988 static size_t yaz_flush_ISO8859_1(yaz_iconv_t cd,
989 char **outbuf, size_t *outbytesleft)
991 if (cd->compose_char)
993 unsigned char *outp = (unsigned char *) *outbuf;
994 if (*outbytesleft < 1)
996 cd->my_errno = YAZ_ICONV_E2BIG;
999 *outp++ = (unsigned char) cd->compose_char;
1001 *outbuf = (char *) outp;
1002 cd->compose_char = 0;
1007 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
1008 char **outbuf, size_t *outbytesleft)
1010 unsigned char *outp = (unsigned char *) *outbuf;
1011 if (*outbytesleft >= 4)
1013 *outp++ = (unsigned char) (x>>24);
1014 *outp++ = (unsigned char) (x>>16);
1015 *outp++ = (unsigned char) (x>>8);
1016 *outp++ = (unsigned char) x;
1017 (*outbytesleft) -= 4;
1021 cd->my_errno = YAZ_ICONV_E2BIG;
1022 return (size_t)(-1);
1024 *outbuf = (char *) outp;
1028 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
1029 char **outbuf, size_t *outbytesleft)
1031 unsigned char *outp = (unsigned char *) *outbuf;
1032 if (*outbytesleft >= 4)
1034 *outp++ = (unsigned char) x;
1035 *outp++ = (unsigned char) (x>>8);
1036 *outp++ = (unsigned char) (x>>16);
1037 *outp++ = (unsigned char) (x>>24);
1038 (*outbytesleft) -= 4;
1042 cd->my_errno = YAZ_ICONV_E2BIG;
1043 return (size_t)(-1);
1045 *outbuf = (char *) outp;
1049 static unsigned long lookup_marc8(yaz_iconv_t cd,
1050 unsigned long x, int *comb,
1051 const char **page_chr)
1054 char *utf8_outbuf = utf8_buf;
1055 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
1057 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft);
1058 if (r == (size_t)(-1))
1060 cd->my_errno = YAZ_ICONV_EILSEQ;
1066 size_t inbytesleft, no_read_sub = 0;
1069 *utf8_outbuf = '\0';
1070 inp = (unsigned char *) utf8_buf;
1071 inbytesleft = strlen(utf8_buf);
1073 x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
1076 *page_chr = "\033(B";
1079 x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
1082 *page_chr = "\033g";
1085 x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
1088 *page_chr = "\033b";
1091 x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
1094 *page_chr = "\033p";
1097 x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
1100 *page_chr = "\033(2";
1103 x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
1106 *page_chr = "\033(N";
1109 x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
1112 *page_chr = "\033(3";
1115 x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
1118 *page_chr = "\033(S";
1121 x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
1124 *page_chr = "\033$1";
1127 cd->my_errno = YAZ_ICONV_EILSEQ;
1132 static size_t flush_combos(yaz_iconv_t cd,
1133 char **outbuf, size_t *outbytesleft)
1135 unsigned long y = cd->write_marc8_last;
1138 size_t i, out_no = 0;
1143 byte = (unsigned char )((y>>16) & 0xff);
1145 out_buf[out_no++] = byte;
1146 byte = (unsigned char)((y>>8) & 0xff);
1148 out_buf[out_no++] = byte;
1149 byte = (unsigned char )(y & 0xff);
1151 out_buf[out_no++] = byte;
1153 if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
1155 cd->my_errno = YAZ_ICONV_E2BIG;
1156 return (size_t) (-1);
1159 for (i = 0; i < cd->write_marc8_comb_no; i++)
1161 /* all MARC-8 combined characters are simple bytes */
1162 byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
1163 *(*outbuf)++ = byte;
1166 memcpy(*outbuf, out_buf, out_no);
1168 (*outbytesleft) -= out_no;
1169 if (cd->write_marc8_second_half_char)
1171 *(*outbuf)++ = cd->write_marc8_second_half_char;
1175 cd->write_marc8_last = 0;
1176 cd->write_marc8_comb_no = 0;
1177 cd->write_marc8_second_half_char = 0;
1181 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
1182 char **outbuf, size_t *outbytesleft,
1183 const char *page_chr)
1185 const char *old_page_chr = cd->write_marc8_page_chr;
1186 if (strcmp(page_chr, old_page_chr))
1189 const char *page_out = page_chr;
1191 if (*outbytesleft < 8)
1193 cd->my_errno = YAZ_ICONV_E2BIG;
1195 return (size_t) (-1);
1197 cd->write_marc8_page_chr = page_chr;
1199 if (!strcmp(old_page_chr, "\033p")
1200 || !strcmp(old_page_chr, "\033g")
1201 || !strcmp(old_page_chr, "\033b"))
1203 /* Technique 1 leave */
1205 if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
1207 /* Must leave script + enter new page */
1208 plen = strlen(page_out);
1209 memcpy(*outbuf, page_out, plen);
1211 (*outbytesleft) -= plen;
1212 page_out = page_chr;
1215 plen = strlen(page_out);
1216 memcpy(*outbuf, page_out, plen);
1218 (*outbytesleft) -= plen;
1224 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
1225 char **outbuf, size_t *outbytesleft)
1228 const char *page_chr = 0;
1229 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
1232 return (size_t) (-1);
1237 cd->write_marc8_second_half_char = 0xEC;
1238 else if (x == 0x0360)
1239 cd->write_marc8_second_half_char = 0xFB;
1241 if (cd->write_marc8_comb_no < 6)
1242 cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
1246 size_t r = flush_combos(cd, outbuf, outbytesleft);
1250 r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, page_chr);
1253 cd->write_marc8_last = y;
1258 static size_t yaz_flush_marc8(yaz_iconv_t cd,
1259 char **outbuf, size_t *outbytesleft)
1261 size_t r = flush_combos(cd, outbuf, outbytesleft);
1264 return yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, "\033(B");
/* Write handler for MARC-8 output. If x equals the precomposed value (.y)
   of a latin1_comb entry, the entry's x1 and x2 are written separately
   through yaz_write_marc8_2; any other x goes to yaz_write_marc8_2
   unchanged. When the second write reports YAZ_ICONV_E2BIG, the saved
   output position and write_marc8_last are restored.
   NOTE(review): only part of this function is visible in this listing. */
1267 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
1268 char **outbuf, size_t *outbytesleft)
1271 for (i = 0; latin1_comb[i].x1; i++)
1273 if (x == latin1_comb[i].y)
1276 /* save the output pointers .. */
1277 char *outbuf0 = *outbuf;
1278 size_t outbytesleft0 = *outbytesleft;
1279 unsigned long last_ch = cd->write_marc8_last; /* same type as the field; int could truncate */
1281 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
1282 outbuf, outbytesleft);
1285 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
1286 outbuf, outbytesleft);
1287 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
1289 /* not enough room. reset output to original values */
1291 *outbytesleft = outbytesleft0;
1292 cd->write_marc8_last = last_ch;
1297 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft);
1302 static size_t yaz_write_wchar_t(yaz_iconv_t cd, unsigned long x,
1303 char **outbuf, size_t *outbytesleft)
1305 unsigned char *outp = (unsigned char *) *outbuf;
1307 if (*outbytesleft >= sizeof(wchar_t))
1310 memcpy(outp, &wch, sizeof(wch));
1311 outp += sizeof(wch);
1312 (*outbytesleft) -= sizeof(wch);
1316 cd->my_errno = YAZ_ICONV_E2BIG;
1317 return (size_t)(-1);
1319 *outbuf = (char *) outp;
1324 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
1326 return cd->read_handle && cd->write_handle;
1329 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
1331 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
1333 cd->write_handle = 0;
1334 cd->read_handle = 0;
1335 cd->init_handle = 0;
1336 cd->flush_handle = 0;
1337 cd->my_errno = YAZ_ICONV_UNKNOWN;
1339 /* a useful hack: if fromcode has leading @,
1340 the library will not use YAZ's own conversions .. */
1341 if (fromcode[0] == '@')
1345 if (!yaz_matchstr(fromcode, "UTF8"))
1347 cd->read_handle = yaz_read_UTF8;
1348 cd->init_handle = yaz_init_UTF8;
1350 else if (!yaz_matchstr(fromcode, "ISO88591"))
1351 cd->read_handle = yaz_read_ISO8859_1;
1352 else if (!yaz_matchstr(fromcode, "UCS4"))
1353 cd->read_handle = yaz_read_UCS4;
1354 else if (!yaz_matchstr(fromcode, "UCS4LE"))
1355 cd->read_handle = yaz_read_UCS4LE;
1356 else if (!yaz_matchstr(fromcode, "MARC8"))
1357 cd->read_handle = yaz_read_marc8;
1358 else if (!yaz_matchstr(fromcode, "MARC8s"))
1359 cd->read_handle = yaz_read_marc8s;
1360 else if (!yaz_matchstr(fromcode, "advancegreek"))
1361 cd->read_handle = yaz_read_advancegreek;
1363 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
1364 cd->read_handle = yaz_read_wchar_t;
1367 if (!yaz_matchstr(tocode, "UTF8"))
1368 cd->write_handle = yaz_write_UTF8;
1369 else if (!yaz_matchstr(tocode, "ISO88591"))
1371 cd->write_handle = yaz_write_ISO8859_1;
1372 cd->flush_handle = yaz_flush_ISO8859_1;
1374 else if (!yaz_matchstr (tocode, "UCS4"))
1375 cd->write_handle = yaz_write_UCS4;
1376 else if (!yaz_matchstr(tocode, "UCS4LE"))
1377 cd->write_handle = yaz_write_UCS4LE;
1378 else if (!yaz_matchstr(tocode, "MARC8"))
1380 cd->write_handle = yaz_write_marc8;
1381 cd->flush_handle = yaz_flush_marc8;
1383 else if (!yaz_matchstr(tocode, "MARC8s"))
1385 cd->write_handle = yaz_write_marc8;
1386 cd->flush_handle = yaz_flush_marc8;
1388 else if (!yaz_matchstr(tocode, "advancegreek"))
1390 cd->write_handle = yaz_write_advancegreek;
1393 else if (!yaz_matchstr(tocode, "WCHAR_T"))
1394 cd->write_handle = yaz_write_wchar_t;
1399 if (!cd->read_handle || !cd->write_handle)
1401 cd->iconv_cd = iconv_open (tocode, fromcode);
1402 if (cd->iconv_cd == (iconv_t) (-1))
1409 if (!cd->read_handle || !cd->write_handle)
1419 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1420 char **outbuf, size_t *outbytesleft)
1429 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1430 if (r == (size_t)(-1))
1432 switch (yaz_errno())
1435 cd->my_errno = YAZ_ICONV_E2BIG;
1438 cd->my_errno = YAZ_ICONV_EINVAL;
1441 cd->my_errno = YAZ_ICONV_EILSEQ;
1444 cd->my_errno = YAZ_ICONV_UNKNOWN;
1456 cd->my_errno = YAZ_ICONV_UNKNOWN;
1457 cd->marc8_esc_mode = 'B';
1459 cd->comb_offset = cd->comb_size = 0;
1460 cd->compose_char = 0;
1462 cd->write_marc8_comb_no = 0;
1463 cd->write_marc8_second_half_char = 0;
1464 cd->write_marc8_last = 0;
1465 cd->write_marc8_page_chr = "\033(B";
1473 if (cd->init_handle && inbuf && *inbuf)
1476 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1477 *inbytesleft, &no_read);
1480 if (cd->my_errno == YAZ_ICONV_EINVAL)
1485 *inbytesleft -= no_read;
1491 if (!inbuf || !*inbuf)
1493 if (outbuf && *outbuf)
1496 r = (*cd->write_handle)(cd, cd->unget_x, outbuf, outbytesleft);
1497 if (cd->flush_handle)
1498 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
1513 no_read = cd->no_read_x;
1517 if (*inbytesleft == 0)
1519 r = *inbuf - inbuf0;
1522 x = (*cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1532 r = (*cd->write_handle)(cd, x, outbuf, outbytesleft);
1535 /* unable to write it. save it because read_handle cannot
1537 if (cd->my_errno == YAZ_ICONV_E2BIG)
1540 cd->no_read_x = no_read;
1546 *inbytesleft -= no_read;
1547 (*inbuf) += no_read;
1552 int yaz_iconv_error (yaz_iconv_t cd)
1554 return cd->my_errno;
1557 int yaz_iconv_close (yaz_iconv_t cd)
1561 iconv_close (cd->iconv_cd);
1570 * indent-tabs-mode: nil
1572 * vim: shiftwidth=4 tabstop=8 expandtab