2 * Copyright (C) 1995-2008, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.50 2008-03-12 08:53:28 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
17 * http://www.loc.gov/marc/specifications/speccharmarc8.html
36 #include <yaz/xmalloc.h>
/* Signature shared by the table-driven converters declared in this file
   (yaz_marc8_*_conv and yaz_marc8r_*_conv).  Callers pass the input
   buffer and the remaining byte count; the converter reports how many
   bytes it consumed through no_read and returns the converted value.
   NOTE(review): combining is presumably set when the result is a
   combining character, and the meaning of mask and boffset is defined by
   the converters themselves (not visible here); callers in this file
   pass 127 or 255 for mask and 0 or 128 for boffset. */
40 typedef unsigned long yaz_conv_func_t(unsigned char *inp, size_t inbytesleft,
41 size_t *no_read, int *combining,
42 unsigned mask, int boffset);
/* MARC-8 page converters dispatched by yaz_read_marc8_comb.  The hex
   suffix is the page's final character, e.g. 42 is 'B' (ASCII) and 45 is
   'E' (ANSEL), matching the switch in yaz_read_marc8_comb. */
45 yaz_conv_func_t yaz_marc8_42_conv;
46 yaz_conv_func_t yaz_marc8_45_conv;
47 yaz_conv_func_t yaz_marc8_67_conv;
48 yaz_conv_func_t yaz_marc8_62_conv;
49 yaz_conv_func_t yaz_marc8_70_conv;
50 yaz_conv_func_t yaz_marc8_32_conv;
51 yaz_conv_func_t yaz_marc8_4E_conv;
52 yaz_conv_func_t yaz_marc8_51_conv;
53 yaz_conv_func_t yaz_marc8_33_conv;
54 yaz_conv_func_t yaz_marc8_34_conv;
55 yaz_conv_func_t yaz_marc8_53_conv;
56 yaz_conv_func_t yaz_marc8_31_conv;
/* Reverse converters (same page suffixes) used by lookup_marc8, which
   feeds them a UTF-8 encoded character. */
58 yaz_conv_func_t yaz_marc8r_42_conv;
59 yaz_conv_func_t yaz_marc8r_45_conv;
60 yaz_conv_func_t yaz_marc8r_67_conv;
61 yaz_conv_func_t yaz_marc8r_62_conv;
62 yaz_conv_func_t yaz_marc8r_70_conv;
63 yaz_conv_func_t yaz_marc8r_32_conv;
64 yaz_conv_func_t yaz_marc8r_4E_conv;
65 yaz_conv_func_t yaz_marc8r_51_conv;
66 yaz_conv_func_t yaz_marc8r_33_conv;
67 yaz_conv_func_t yaz_marc8r_34_conv;
68 yaz_conv_func_t yaz_marc8r_53_conv;
69 yaz_conv_func_t yaz_marc8r_31_conv;
/* Conversion descriptor state behind yaz_iconv_t.
   NOTE(review): only part of the member list is visible in this excerpt;
   other members (for example my_errno, g0_mode, g1_mode, comb_offset,
   comb_size, no_read_x and iconv_cd) are referenced elsewhere in the
   file. */
71 struct yaz_iconv_struct {
/* Conversion hooks selected by yaz_iconv_open.  yaz_iconv_isbuiltin
   reports whether both read_handle and write_handle are set. */
74 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
75 size_t inbytesleft, size_t *no_read);
76 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
77 size_t inbytesleft, size_t *no_read);
78 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
79 char **outbuf, size_t *outbytesleft);
80 size_t (*flush_handle)(yaz_iconv_t cd,
81 char **outbuf, size_t *outbytesleft);
/* Characters read ahead by yaz_read_marc8 and the number of input bytes
   each one consumed; at most 8 entries. */
87 unsigned long comb_x[8];
88 size_t comb_no_read[8];
/* Character passed to write_handle by yaz_iconv when it is called without
   an input buffer (see yaz_iconv). */
90 unsigned long unget_x;
/* Character held back by yaz_write_ISO8859_1 so it can be combined with
   the next one; 0 when nothing is pending. */
94 unsigned long compose_char;
/* MARC-8 writer state: the pending second-half byte, the last looked-up
   character and its page escape sequence (both written by flush_combos),
   and the escape sequences currently selected for G0 and G1. */
96 unsigned write_marc8_second_half_char;
97 unsigned long write_marc8_last;
98 const char *write_marc8_lpage;
99 const char *write_marc8_g0;
100 const char *write_marc8_g1;
/* latin1_comb: each entry is { x1, x2, y } where x1 is a base letter, x2 a
   Unicode combining character and y the single ISO-8859-1 code the pair
   composes to.  Users of the table iterate until an entry whose x1 is 0.
   NOTE(review): the declaration of member y and the terminating entry are
   not visible in this excerpt. */
105 unsigned long x1, x2;
108 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
109 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
110 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
111 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
112 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
113 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
114 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
115 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
116 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
117 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
118 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
119 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
120 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
121 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
122 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
123 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
124 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
125 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
126 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
127 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
128 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
129 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
130 /* omitted: 0xd7 MULTIPLICATION SIGN */
131 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
132 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
133 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
134 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
135 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
136 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
137 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
138 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
139 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
140 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
141 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
142 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
143 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
144 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
145 /* omitted: 0xe6 LATIN SMALL LETTER AE */
146 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
147 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
148 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
149 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
150 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
151 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
152 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
153 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
154 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
155 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
156 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
157 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
158 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
159 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
160 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
161 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
162 /* omitted: 0xf7 DIVISION SIGN */
163 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
164 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
165 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
166 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
167 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
168 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
169 /* omitted: 0xfe LATIN SMALL LETTER THORN */
170 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
/* Forward declaration: flush_combos calls this before its definition
   appears further down in the file. */
177 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
178 char **outbuf, size_t *outbytesleft,
179 const char *page_chr);
/* read_handle for ISO-8859-1 input: the character value is taken directly
   from the first input byte.
   NOTE(review): the rest of the body (setting *no_read and returning) is
   not visible in this excerpt. */
181 static unsigned long yaz_read_ISO8859_1(yaz_iconv_t cd, unsigned char *inp,
182 size_t inbytesleft, size_t *no_read)
184 unsigned long x = inp[0];
/* read_handle for WCHAR_T input: reads one wchar_t from inp.
   When fewer than sizeof(wchar_t) bytes remain, the descriptor's error is
   set to YAZ_ICONV_EINVAL.  Otherwise the bytes are copied into a local
   wchar_t with memcpy and *no_read is set to sizeof(wchar_t). */
192 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
193 size_t inbytesleft, size_t *no_read)
197 if (inbytesleft < sizeof(wchar_t))
199 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
205 memcpy (&wch, inp, sizeof(wch));
207 *no_read = sizeof(wch);
/* Forward declaration of the single-character MARC-8 reader, which
   yaz_read_marc8 calls before its definition appears.
   NOTE(review): the final parameter line of this prototype is not
   visible in this excerpt. */
214 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
215 size_t inbytesleft, size_t *no_read,
/* read_handle for MARC-8 input.
   If characters are still buffered (comb_offset < comb_size), the next
   one is taken from comb_x together with its byte count from
   comb_no_read.  Otherwise up to 8 characters are read through
   yaz_read_marc8_comb and stored in comb_x / comb_no_read, shrinking
   inbytesleft by the bytes each read consumed.  Running out of input
   while characters are buffered sets YAZ_ICONV_EINVAL.
   NOTE(review): several statements of this function, including the loop
   exit conditions and return statements, are not visible in this
   excerpt. */
218 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
219 size_t inbytesleft, size_t *no_read)
222 if (cd->comb_offset < cd->comb_size)
224 *no_read = cd->comb_no_read[cd->comb_offset];
225 x = cd->comb_x[cd->comb_offset];
227 /* special case for double-diacritic combining characters,
228 INVERTED BREVE and DOUBLE TILDE.
229 We'll increment the no_read counter by 1, since we want to skip over
230 the processing of the closing ligature character
232 /* this code is no longer necessary.. our handlers code in
233 yaz_marc8_?_conv (generated by charconv.tcl) now returns
234 0 and no_read=1 when a sequence does not match the input.
235 The SECOND HALFs in codetables.xml produces a non-existant
236 entry in the conversion trie.. Hence when met, the input byte is
237 skipped as it should (in yaz_iconv)
240 if (x == 0x0361 || x == 0x0360)
248 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
252 if (inbytesleft == 0 && cd->comb_size)
254 cd->my_errno = YAZ_ICONV_EINVAL;
259 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
262 cd->comb_x[cd->comb_size] = x;
263 cd->comb_no_read[cd->comb_size] = *no_read;
265 inbytesleft = inbytesleft - *no_read;
/* read_handle for "MARC8s": reads via yaz_read_marc8 and, when exactly one
   character is buffered (comb_size == 1), looks for a latin1_comb entry
   whose x2 equals the buffered character and whose x1 equals the
   character just read.  On a match the composed Latin-1 code y replaces
   x and the buffered character's byte count is added to *no_read. */
270 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
271 size_t inbytesleft, size_t *no_read)
273 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
274 if (x && cd->comb_size == 1)
276 /* For MARC8s we try to get a Latin-1 page code out of it */
278 for (i = 0; latin1_comb[i].x1; i++)
279 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
281 *no_read += cd->comb_no_read[0];
283 x = latin1_comb[i].y;
/* Reads one character from MARC-8 input.
   Leading escape sequences (byte 27) are consumed first; each one stores
   its final character in g0_mode or g1_mode, and the bytes consumed are
   added to *no_read.  The character itself is then converted by the page
   converter selected by the active mode: g0_mode for bytes below 128,
   g1_mode otherwise.  A mode with no matching case sets
   YAZ_ICONV_EILSEQ.
   NOTE(review): the statements that advance inp and the return paths are
   not visible in this excerpt. */
290 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
291 size_t inbytesleft, size_t *no_read,
/* 27 is the ESC byte that introduces a character set designation. */
295 while (inbytesleft > 0 && *inp == 27)
/* G0 is assumed until a G1 designator is seen. */
297 int *modep = &cd->g0_mode;
298 size_t inbytesleft0 = inbytesleft;
302 if (inbytesleft == 0)
304 if (*inp == '$') /* set with multiple bytes */
309 if (inbytesleft == 0)
311 if (*inp == '(' || *inp == ',') /* G0 */
316 else if (*inp == ')' || *inp == '-') /* G1 */
320 modep = &cd->g1_mode;
322 if (inbytesleft == 0)
324 if (*inp == '!') /* ANSEL is a special case */
329 if (inbytesleft == 0)
331 *modep = *inp++; /* Final character */
/* Account for the bytes taken by this escape sequence. */
334 (*no_read) += inbytesleft0 - inbytesleft;
336 if (inbytesleft == 0)
338 else if (*inp == ' ')
346 size_t no_read_sub = 0;
/* Bytes below 128 belong to the G0 set, all others to G1. */
347 int mode = *inp < 128 ? cd->g0_mode : cd->g1_mode;
352 case 'B': /* Basic ASCII */
353 case 's': /* ASCII */
354 x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
356 case 'E': /* ANSEL */
357 x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128);
359 case 'g': /* Greek */
360 x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
362 case 'b': /* Subscripts */
363 x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
365 case 'p': /* Superscripts */
366 x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
368 case '2': /* Basic Hebrew */
369 x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
371 case 'N': /* Basic Cyrillic */
372 x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
374 case 'Q': /* Extended Cyrillic */
375 x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
377 case '3': /* Basic Arabic */
378 x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
380 case '4': /* Extended Arabic */
381 x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
383 case 'S': /* Greek */
384 x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
386 case '1': /* Chinese, Japanese, Korean (EACC) */
387 x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
391 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Add the bytes consumed by the page converter to the caller's count. */
394 *no_read += no_read_sub;
399 cd->my_errno = YAZ_ICONV_EINVAL;
/* write_handle for ISO-8859-1 output.
   A character in the range 33..126 is not written immediately: it is
   stored in compose_char so that a following combining character can be
   merged with it through the latin1_comb table.  When compose_char is
   pending, the table is searched for (compose_char, x); a match replaces
   x with the composed code, otherwise compose_char itself is written
   first.  Characters above 255 or below 1 set YAZ_ICONV_EILSEQ, and a
   full output buffer sets YAZ_ICONV_E2BIG.
   NOTE(review): return statements and the updates of *outbytesleft are
   not visible in this excerpt. */
403 static size_t yaz_write_ISO8859_1(yaz_iconv_t cd, unsigned long x,
404 char **outbuf, size_t *outbytesleft)
406 /* list of two char unicode sequence that, when combined, are
407 equivalent to single unicode chars that can be represented in
409 Regular iconv on Linux at least does not seem to convert these,
410 but since MARC-8 to UTF-8 generates these composed sequence
411 we get a better chance of a successful MARC-8 -> ISO-8859-1
413 unsigned char *outp = (unsigned char *) *outbuf;
415 if (cd->compose_char)
418 for (i = 0; latin1_comb[i].x1; i++)
419 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
421 x = latin1_comb[i].y;
424 if (*outbytesleft < 1)
425 { /* no room. Retain compose_char and bail out */
426 cd->my_errno = YAZ_ICONV_E2BIG;
429 if (!latin1_comb[i].x1)
430 { /* not found. Just write compose_char */
431 *outp++ = (unsigned char) cd->compose_char;
433 *outbuf = (char *) outp;
435 /* compose_char used so reset it. x now holds current char */
436 cd->compose_char = 0;
439 if (x > 32 && x < 127 && cd->compose_char == 0)
441 cd->compose_char = x;
444 else if (x > 255 || x < 1)
446 cd->my_errno = YAZ_ICONV_EILSEQ;
449 else if (*outbytesleft < 1)
451 cd->my_errno = YAZ_ICONV_E2BIG;
454 *outp++ = (unsigned char) x;
456 *outbuf = (char *) outp;
/* flush_handle for ISO-8859-1 output: writes the character still held in
   compose_char, if any, and clears it.  If the output buffer has no room
   the error is set to YAZ_ICONV_E2BIG. */
460 static size_t yaz_flush_ISO8859_1(yaz_iconv_t cd,
461 char **outbuf, size_t *outbytesleft)
463 if (cd->compose_char)
465 unsigned char *outp = (unsigned char *) *outbuf;
466 if (*outbytesleft < 1)
468 cd->my_errno = YAZ_ICONV_E2BIG;
471 *outp++ = (unsigned char) cd->compose_char;
473 *outbuf = (char *) outp;
474 cd->compose_char = 0;
/* Looks up the MARC-8 code for character x.
   x is first encoded as UTF-8 into a local buffer with yaz_write_UTF8 (a
   failure there sets YAZ_ICONV_EILSEQ).  That buffer is then passed to
   the reverse converters one after another, and *page_chr receives the
   escape sequence of the page associated with the converter.
   NOTE(review): the tests that decide whether a converter matched, the
   page_chr assignments for the 62 and 70 converters, and the return
   statements are not visible in this excerpt. */
479 static unsigned long lookup_marc8(yaz_iconv_t cd,
480 unsigned long x, int *comb,
481 const char **page_chr)
484 char *utf8_outbuf = utf8_buf;
/* One byte is kept back from the buffer size passed to the UTF-8 writer. */
485 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
487 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft);
488 if (r == (size_t)(-1))
490 cd->my_errno = YAZ_ICONV_EILSEQ;
496 size_t inbytesleft, no_read_sub = 0;
/* The UTF-8 bytes of x become the input of the reverse converters. */
500 inp = (unsigned char *) utf8_buf;
501 inbytesleft = strlen(utf8_buf);
503 x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
506 *page_chr = ESC "(B";
509 x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
512 *page_chr = ESC "(B";
515 x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
521 x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
527 x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
530 *page_chr = ESC "(2";
533 x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
536 *page_chr = ESC "(N";
539 x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
542 *page_chr = ESC "(Q";
545 x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
548 *page_chr = ESC "(3";
551 x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
554 *page_chr = ESC "(4";
557 x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
560 *page_chr = ESC "(S";
563 x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
566 *page_chr = ESC "$1";
569 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Writes the pending MARC-8 character stored in write_marc8_last.
   The page escape sequence saved in write_marc8_lpage is emitted first
   through yaz_write_marc8_page_chr.  The character's bytes are taken
   from bits 16-23, 8-15 and 0-7 of the value, in that order, collected
   in out_buf and copied to the output, followed by the pending
   second-half byte if one is set.  Finally the pending state is cleared.
   Returns (size_t)(-1) with YAZ_ICONV_E2BIG when out_no + 2 is not
   smaller than *outbytesleft.
   NOTE(review): the conditions guarding each out_buf store and the
   early-return paths are not visible in this excerpt. */
574 static size_t flush_combos(yaz_iconv_t cd,
575 char **outbuf, size_t *outbytesleft)
577 unsigned long y = cd->write_marc8_last;
/* NOTE(review): the assert and the following if test the same pointer;
   the if still guards builds where assert is compiled out. */
585 assert(cd->write_marc8_lpage);
586 if (cd->write_marc8_lpage)
588 size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
589 cd->write_marc8_lpage);
594 byte = (unsigned char )((y>>16) & 0xff);
596 out_buf[out_no++] = byte;
597 byte = (unsigned char)((y>>8) & 0xff);
599 out_buf[out_no++] = byte;
600 byte = (unsigned char )(y & 0xff);
602 out_buf[out_no++] = byte;
604 if (out_no + 2 >= *outbytesleft)
606 cd->my_errno = YAZ_ICONV_E2BIG;
607 return (size_t) (-1);
610 memcpy(*outbuf, out_buf, out_no);
612 (*outbytesleft) -= out_no;
613 if (cd->write_marc8_second_half_char)
615 *(*outbuf)++ = cd->write_marc8_second_half_char;
/* Nothing is pending any more. */
619 cd->write_marc8_last = 0;
620 cd->write_marc8_lpage = 0;
621 cd->write_marc8_second_half_char = 0;
/* Emits the escape sequence page_chr when it differs from the sequence
   currently recorded for the target set, and records it as current.
   The target set is G1 (write_marc8_g1) when the second character of
   page_chr is ')', otherwise G0 (write_marc8_g0).  Fewer than 8 bytes of
   output room yields (size_t)(-1) with YAZ_ICONV_E2BIG.
   NOTE(review): page_chr is checked for NULL when choosing the target
   set but is passed to strcmp unchecked afterwards; callers visible in
   this file pass a non-NULL value - confirm this holds for all callers.
   NOTE(review): the statements advancing *outbuf and the final return
   are not visible in this excerpt. */
625 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
626 char **outbuf, size_t *outbytesleft,
627 const char *page_chr)
629 const char **old_page_chr = &cd->write_marc8_g0;
631 /* are we going to a G1-set (such as ESC ")!E") */
632 if (page_chr && page_chr[1] == ')')
633 old_page_chr = &cd->write_marc8_g1;
635 if (!*old_page_chr || strcmp(page_chr, *old_page_chr))
638 const char *page_out = page_chr;
640 if (*outbytesleft < 8)
642 cd->my_errno = YAZ_ICONV_E2BIG;
644 return (size_t) (-1);
649 if (!strcmp(*old_page_chr, ESC "p")
650 || !strcmp(*old_page_chr, ESC "g")
651 || !strcmp(*old_page_chr, ESC "b"))
654 /* Technique 1 leave */
655 if (strcmp(page_chr, ESC "(B")) /* Not going ASCII page? */
657 /* Must leave script + enter new page */
658 plen = strlen(page_out);
659 memcpy(*outbuf, page_out, plen);
661 (*outbytesleft) -= plen;
/* Remember the new page so identical requests emit nothing next time. */
666 *old_page_chr = page_chr;
667 plen = strlen(page_out);
668 memcpy(*outbuf, page_out, plen);
670 (*outbytesleft) -= plen;
/* Writes one character to MARC-8 output.
   x is translated with lookup_marc8, which also yields the page escape
   sequence and a combining flag.  The visible code emits the page switch
   via yaz_write_marc8_page_chr, records a second-half byte for the
   double-wide diacritics (0xFB for U+0360, 0xEC in the other visible
   branch), flushes the previously pending character with flush_combos
   and then stores the new code and page as pending.  Returns
   (size_t)(-1), with YAZ_ICONV_E2BIG where visible, on failure.
   NOTE(review): the conditions selecting between these steps are not
   visible in this excerpt. */
676 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
677 char **outbuf, size_t *outbytesleft)
680 const char *page_chr = 0;
681 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
684 return (size_t) (-1);
690 size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
696 cd->write_marc8_second_half_char = 0xEC;
697 else if (x == 0x0360)
698 cd->write_marc8_second_half_char = 0xFB;
700 if (*outbytesleft <= 1)
702 cd->my_errno = YAZ_ICONV_E2BIG;
703 return (size_t) (-1);
710 size_t r = flush_combos(cd, outbuf, outbytesleft);
/* Keep the new character pending; flush_combos writes it later. */
714 cd->write_marc8_last = y;
715 cd->write_marc8_lpage = page_chr;
/* flush_handle for MARC-8 output: writes any pending character with
   flush_combos, forgets the current G1 selection and switches G0 back to
   the ASCII page (ESC "(B").  Returns the result of that page switch.
   NOTE(review): the handling of flush_combos' result is not visible in
   this excerpt. */
720 static size_t yaz_flush_marc8(yaz_iconv_t cd,
721 char **outbuf, size_t *outbytesleft)
723 size_t r = flush_combos(cd, outbuf, outbytesleft);
726 cd->write_marc8_g1 = 0;
727 return yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, ESC "(B");
/* write_handle for MARC-8 output.
   If x equals the composed value y of a latin1_comb entry, the entry's
   two components x1 and x2 are written separately with
   yaz_write_marc8_2.  When the second write fails with YAZ_ICONV_E2BIG,
   the saved output position and pending-character state are restored so
   the caller can retry.  Any other character is passed straight to
   yaz_write_marc8_2.
   NOTE(review): the restore of *outbuf and the return statements inside
   the loop are not visible in this excerpt. */
730 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
731 char **outbuf, size_t *outbytesleft)
734 for (i = 0; latin1_comb[i].x1; i++)
736 if (x == latin1_comb[i].y)
739 /* save the output pointers .. */
740 char *outbuf0 = *outbuf;
741 size_t outbytesleft0 = *outbytesleft;
/* NOTE(review): write_marc8_last is unsigned long but is saved in an
   int here; confirm the stored values always fit. */
742 int last_ch = cd->write_marc8_last;
743 const char *lpage = cd->write_marc8_lpage;
745 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
746 outbuf, outbytesleft);
749 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
750 outbuf, outbytesleft);
751 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
753 /* not enough room. reset output to original values */
755 *outbytesleft = outbytesleft0;
756 cd->write_marc8_last = last_ch;
757 cd->write_marc8_lpage = lpage;
762 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft);
/* write_handle for WCHAR_T output: when at least sizeof(wchar_t) bytes of
   room remain, a wchar_t is copied to the output with memcpy and
   *outbytesleft is reduced accordingly; otherwise the error is set to
   YAZ_ICONV_E2BIG.  *outbuf is updated to the new output position.
   NOTE(review): the assignment of x to the local wchar_t and the return
   statements are not visible in this excerpt. */
767 static size_t yaz_write_wchar_t(yaz_iconv_t cd, unsigned long x,
768 char **outbuf, size_t *outbytesleft)
770 unsigned char *outp = (unsigned char *) *outbuf;
772 if (*outbytesleft >= sizeof(wchar_t))
775 memcpy(outp, &wch, sizeof(wch));
777 (*outbytesleft) -= sizeof(wch);
781 cd->my_errno = YAZ_ICONV_E2BIG;
784 *outbuf = (char *) outp;
/* Returns non-zero when the descriptor has both a read handler and a
   write handler set, and zero otherwise. */
789 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
791 return cd->read_handle && cd->write_handle;
/* Allocates a conversion descriptor for converting from fromcode to
   tocode.
   fromcode selects the read handler and tocode the write handler (plus a
   flush handler for ISO-8859-1 and MARC-8 output); names are compared
   with yaz_matchstr.  If either handler is missing, iconv_open is called
   with the same names.
   NOTE(review): the failure paths, the remaining field initialisation
   and the return statements are not visible in this excerpt. */
794 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
796 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
798 cd->write_handle = 0;
801 cd->flush_handle = 0;
802 cd->my_errno = YAZ_ICONV_UNKNOWN;
804 /* a useful hack: if fromcode has leading @,
805 the library will not use YAZ's own conversions .. */
806 if (fromcode[0] == '@')
/* Select the reader for the source character set. */
810 if (!yaz_matchstr(fromcode, "UTF8"))
812 cd->read_handle = yaz_read_UTF8;
813 cd->init_handle = yaz_init_UTF8;
815 else if (!yaz_matchstr(fromcode, "ISO88591"))
816 cd->read_handle = yaz_read_ISO8859_1;
817 else if (!yaz_matchstr(fromcode, "UCS4"))
818 cd->read_handle = yaz_read_UCS4;
819 else if (!yaz_matchstr(fromcode, "UCS4LE"))
820 cd->read_handle = yaz_read_UCS4LE;
821 else if (!yaz_matchstr(fromcode, "MARC8"))
822 cd->read_handle = yaz_read_marc8;
823 else if (!yaz_matchstr(fromcode, "MARC8s"))
824 cd->read_handle = yaz_read_marc8s;
825 else if (!yaz_matchstr(fromcode, "advancegreek"))
826 cd->read_handle = yaz_read_advancegreek;
827 else if (!yaz_matchstr(fromcode, "iso54281984"))
828 cd->read_handle = yaz_read_iso5428_1984;
829 else if (!yaz_matchstr(fromcode, "iso5428:1984"))
830 cd->read_handle = yaz_read_iso5428_1984;
832 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
833 cd->read_handle = yaz_read_wchar_t;
/* Select the writer (and flusher, where one exists) for the target
   character set.  MARC8 and MARC8s share the same writer. */
836 if (!yaz_matchstr(tocode, "UTF8"))
837 cd->write_handle = yaz_write_UTF8;
838 else if (!yaz_matchstr(tocode, "ISO88591"))
840 cd->write_handle = yaz_write_ISO8859_1;
841 cd->flush_handle = yaz_flush_ISO8859_1;
843 else if (!yaz_matchstr (tocode, "UCS4"))
844 cd->write_handle = yaz_write_UCS4;
845 else if (!yaz_matchstr(tocode, "UCS4LE"))
846 cd->write_handle = yaz_write_UCS4LE;
847 else if (!yaz_matchstr(tocode, "MARC8"))
849 cd->write_handle = yaz_write_marc8;
850 cd->flush_handle = yaz_flush_marc8;
852 else if (!yaz_matchstr(tocode, "MARC8s"))
854 cd->write_handle = yaz_write_marc8;
855 cd->flush_handle = yaz_flush_marc8;
857 else if (!yaz_matchstr(tocode, "advancegreek"))
859 cd->write_handle = yaz_write_advancegreek;
861 else if (!yaz_matchstr(tocode, "iso54281984"))
863 cd->write_handle = yaz_write_iso5428_1984;
865 else if (!yaz_matchstr(tocode, "iso5428:1984"))
867 cd->write_handle = yaz_write_iso5428_1984;
870 else if (!yaz_matchstr(tocode, "WCHAR_T"))
871 cd->write_handle = yaz_write_wchar_t;
/* Without a complete built-in pair, fall back to the system iconv. */
876 if (!cd->read_handle || !cd->write_handle)
878 cd->iconv_cd = iconv_open (tocode, fromcode);
879 if (cd->iconv_cd == (iconv_t) (-1))
886 if (!cd->read_handle || !cd->write_handle)
/* Converts input to output using the descriptor cd, with an interface
   modelled on iconv().
   Visible behaviour: the system iconv() is invoked on cd->iconv_cd and a
   failure is mapped to one of the YAZ_ICONV_* error codes; the reader and
   writer state (combining buffer, compose_char, MARC-8 writer fields) is
   reset, with G0 set to ESC "(B"; init_handle, when present, is run on
   the input and the bytes it consumed are subtracted from *inbytesleft.
   When no input buffer is supplied but an output buffer is, cd->unget_x
   is written and flush_handle is called.  Otherwise characters are read
   with read_handle and written with write_handle, advancing *inbuf and
   reducing *inbytesleft by the bytes read; on YAZ_ICONV_E2BIG the byte
   count of the unwritten character is saved in cd->no_read_x.
   NOTE(review): the conditions that select between these paths, the
   loop structure and the return values are not visible in this excerpt. */
896 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
897 char **outbuf, size_t *outbytesleft)
906 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
907 if (r == (size_t)(-1))
912 cd->my_errno = YAZ_ICONV_E2BIG;
915 cd->my_errno = YAZ_ICONV_EINVAL;
918 cd->my_errno = YAZ_ICONV_EILSEQ;
921 cd->my_errno = YAZ_ICONV_UNKNOWN;
933 cd->my_errno = YAZ_ICONV_UNKNOWN;
937 cd->comb_offset = cd->comb_size = 0;
938 cd->compose_char = 0;
940 cd->write_marc8_second_half_char = 0;
941 cd->write_marc8_last = 0;
942 cd->write_marc8_lpage = 0;
943 cd->write_marc8_g0 = ESC "(B";
944 cd->write_marc8_g1 = 0;
952 if (cd->init_handle && inbuf && *inbuf)
955 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
956 *inbytesleft, &no_read);
959 if (cd->my_errno == YAZ_ICONV_EINVAL)
964 *inbytesleft -= no_read;
970 if (!inbuf || !*inbuf)
972 if (outbuf && *outbuf)
975 r = (*cd->write_handle)(cd, cd->unget_x, outbuf, outbytesleft);
976 if (cd->flush_handle)
977 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
992 no_read = cd->no_read_x;
996 if (*inbytesleft == 0)
1001 x = (*cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1011 r = (*cd->write_handle)(cd, x, outbuf, outbytesleft);
1014 /* unable to write it. save it because read_handle cannot
1016 if (cd->my_errno == YAZ_ICONV_E2BIG)
1019 cd->no_read_x = no_read;
1025 *inbytesleft -= no_read;
1026 (*inbuf) += no_read;
/* Returns the error code currently stored in the descriptor (one of the
   YAZ_ICONV_* values assigned elsewhere in this file). */
1031 int yaz_iconv_error (yaz_iconv_t cd)
1033 return cd->my_errno;
/* Closes the descriptor; the visible code releases the system iconv
   handle with iconv_close.
   NOTE(review): the condition guarding that call, the release of cd
   itself and the return value are not visible in this excerpt. */
1036 int yaz_iconv_close (yaz_iconv_t cd)
1040 iconv_close (cd->iconv_cd);
/* NOTE(review): only the signature is visible in this excerpt; presumably
   this stores no as the descriptor's error code - confirm against the
   full source. */
1046 void yaz_iconv_set_errno(yaz_iconv_t cd, int no)
1054 * indent-tabs-mode: nil
1056 * vim: shiftwidth=4 tabstop=8 expandtab