2 * Copyright (C) 1995-2008, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.50 2008-03-12 08:53:28 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
17 * http://www.loc.gov/marc/specifications/speccharmarc8.html
36 #include <yaz/xmalloc.h>
41 unsigned long yaz_marc8_42_conv(unsigned char *inp, size_t inbytesleft,
42 size_t *no_read, int *combining);
43 unsigned long yaz_marc8_45_conv(unsigned char *inp, size_t inbytesleft,
44 size_t *no_read, int *combining);
45 unsigned long yaz_marc8_67_conv(unsigned char *inp, size_t inbytesleft,
46 size_t *no_read, int *combining);
47 unsigned long yaz_marc8_62_conv(unsigned char *inp, size_t inbytesleft,
48 size_t *no_read, int *combining);
49 unsigned long yaz_marc8_70_conv(unsigned char *inp, size_t inbytesleft,
50 size_t *no_read, int *combining);
51 unsigned long yaz_marc8_32_conv(unsigned char *inp, size_t inbytesleft,
52 size_t *no_read, int *combining);
53 unsigned long yaz_marc8_4E_conv(unsigned char *inp, size_t inbytesleft,
54 size_t *no_read, int *combining);
55 unsigned long yaz_marc8_51_conv(unsigned char *inp, size_t inbytesleft,
56 size_t *no_read, int *combining);
57 unsigned long yaz_marc8_33_conv(unsigned char *inp, size_t inbytesleft,
58 size_t *no_read, int *combining);
59 unsigned long yaz_marc8_34_conv(unsigned char *inp, size_t inbytesleft,
60 size_t *no_read, int *combining);
61 unsigned long yaz_marc8_53_conv(unsigned char *inp, size_t inbytesleft,
62 size_t *no_read, int *combining);
63 unsigned long yaz_marc8_31_conv(unsigned char *inp, size_t inbytesleft,
64 size_t *no_read, int *combining);
67 unsigned long yaz_marc8r_42_conv(unsigned char *inp, size_t inbytesleft,
68 size_t *no_read, int *combining);
69 unsigned long yaz_marc8r_45_conv(unsigned char *inp, size_t inbytesleft,
70 size_t *no_read, int *combining);
71 unsigned long yaz_marc8r_67_conv(unsigned char *inp, size_t inbytesleft,
72 size_t *no_read, int *combining);
73 unsigned long yaz_marc8r_62_conv(unsigned char *inp, size_t inbytesleft,
74 size_t *no_read, int *combining);
75 unsigned long yaz_marc8r_70_conv(unsigned char *inp, size_t inbytesleft,
76 size_t *no_read, int *combining);
77 unsigned long yaz_marc8r_32_conv(unsigned char *inp, size_t inbytesleft,
78 size_t *no_read, int *combining);
79 unsigned long yaz_marc8r_4E_conv(unsigned char *inp, size_t inbytesleft,
80 size_t *no_read, int *combining);
81 unsigned long yaz_marc8r_51_conv(unsigned char *inp, size_t inbytesleft,
82 size_t *no_read, int *combining);
83 unsigned long yaz_marc8r_33_conv(unsigned char *inp, size_t inbytesleft,
84 size_t *no_read, int *combining);
85 unsigned long yaz_marc8r_34_conv(unsigned char *inp, size_t inbytesleft,
86 size_t *no_read, int *combining);
87 unsigned long yaz_marc8r_53_conv(unsigned char *inp, size_t inbytesleft,
88 size_t *no_read, int *combining);
89 unsigned long yaz_marc8r_31_conv(unsigned char *inp, size_t inbytesleft,
90 size_t *no_read, int *combining);
92 struct yaz_iconv_struct {
95 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
96 size_t inbytesleft, size_t *no_read);
97 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
98 size_t inbytesleft, size_t *no_read);
99 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
100 char **outbuf, size_t *outbytesleft);
101 size_t (*flush_handle)(yaz_iconv_t cd,
102 char **outbuf, size_t *outbytesleft);
108 unsigned long comb_x[8];
109 size_t comb_no_read[8];
111 unsigned long unget_x;
115 unsigned long compose_char;
117 unsigned write_marc8_second_half_char;
118 unsigned long write_marc8_last;
119 const char *write_marc8_lpage;
120 const char *write_marc8_g0;
121 const char *write_marc8_g1;
126 unsigned long x1, x2;
129 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
130 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
131 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
132 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
133 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
134 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
135 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
136 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
137 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
138 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
139 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
140 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
141 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
142 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
143 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
144 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
145 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
146 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
147 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
148 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
149 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
150 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
151 /* omitted: 0xd7 MULTIPLICATION SIGN */
152 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
153 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
154 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
155 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
156 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
157 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
158 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
159 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
160 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
161 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
162 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
163 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
164 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
165 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
166 /* omitted: 0xe6 LATIN SMALL LETTER AE */
167 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
168 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
169 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
170 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
171 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
172 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
173 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
174 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
175 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
176 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
177 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
178 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
179 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
180 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
181 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
182 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
183 /* omitted: 0xf7 DIVISION SIGN */
184 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
185 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
186 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
187 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
188 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
189 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
190 /* omitted: 0xfe LATIN SMALL LETTER THORN */
191 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
198 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
199 char **outbuf, size_t *outbytesleft,
200 const char *page_chr);
202 static unsigned long yaz_read_ISO8859_1(yaz_iconv_t cd, unsigned char *inp,
203 size_t inbytesleft, size_t *no_read)
205 unsigned long x = inp[0];
213 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
214 size_t inbytesleft, size_t *no_read)
218 if (inbytesleft < sizeof(wchar_t))
220 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
226 memcpy (&wch, inp, sizeof(wch));
228 *no_read = sizeof(wch);
235 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
236 size_t inbytesleft, size_t *no_read,
239 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
240 size_t inbytesleft, size_t *no_read)
243 if (cd->comb_offset < cd->comb_size)
245 *no_read = cd->comb_no_read[cd->comb_offset];
246 x = cd->comb_x[cd->comb_offset];
248 /* special case for double-diacritic combining characters,
249 INVERTED BREVE and DOUBLE TILDE.
250 We'll increment the no_read counter by 1, since we want to skip over
251 the processing of the closing ligature character
253 /* this code is no longer necessary.. our handlers code in
254 yaz_marc8_?_conv (generated by charconv.tcl) now returns
255 0 and no_read=1 when a sequence does not match the input.
256 The SECOND HALFs in codetables.xml produce a non-existent
257 entry in the conversion trie.. Hence when met, the input byte is
258 skipped as it should (in yaz_iconv)
261 if (x == 0x0361 || x == 0x0360)
269 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
273 if (inbytesleft == 0 && cd->comb_size)
275 cd->my_errno = YAZ_ICONV_EINVAL;
280 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
283 cd->comb_x[cd->comb_size] = x;
284 cd->comb_no_read[cd->comb_size] = *no_read;
286 inbytesleft = inbytesleft - *no_read;
291 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
292 size_t inbytesleft, size_t *no_read)
294 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
295 if (x && cd->comb_size == 1)
297 /* For MARC8s we try to get a Latin-1 page code out of it */
299 for (i = 0; latin1_comb[i].x1; i++)
300 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
302 *no_read += cd->comb_no_read[0];
304 x = latin1_comb[i].y;
311 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
312 size_t inbytesleft, size_t *no_read,
316 while (inbytesleft > 0 && *inp == 27)
318 int *modep = &cd->g0_mode;
319 size_t inbytesleft0 = inbytesleft;
323 if (inbytesleft == 0)
325 if (*inp == '$') /* set with multiple bytes */
330 if (inbytesleft == 0)
332 if (*inp == '(' || *inp == ',') /* G0 */
337 else if (*inp == ')' || *inp == '-') /* G1 */
341 modep = &cd->g1_mode;
343 if (inbytesleft == 0)
345 if (*inp == '!') /* ANSEL is a special case */
350 if (inbytesleft == 0)
352 *modep = *inp++; /* Final character */
355 (*no_read) += inbytesleft0 - inbytesleft;
357 if (inbytesleft == 0)
359 else if (*inp == ' ')
367 size_t no_read_sub = 0;
368 int mode = *inp < 128 ? cd->g0_mode : cd->g1_mode;
373 case 'B': /* Basic ASCII */
374 case 's': /* ASCII */
375 case 'E': /* ANSEL */
376 x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb);
380 x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb);
383 case 'g': /* Greek */
384 x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb);
386 case 'b': /* Subscripts */
387 x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb);
389 case 'p': /* Superscripts */
390 x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb);
392 case '2': /* Basic Hebrew */
393 x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb);
395 case 'N': /* Basic Cyrillic */
396 x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb);
398 case 'Q': /* Extended Cyrillic */
399 x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb);
401 case '3': /* Basic Arabic */
402 x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb);
404 case '4': /* Extended Arabic */
405 x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb);
407 case 'S': /* Greek */
408 x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb);
410 case '1': /* Chinese, Japanese, Korean (EACC) */
411 x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb);
415 cd->my_errno = YAZ_ICONV_EILSEQ;
418 *no_read += no_read_sub;
423 cd->my_errno = YAZ_ICONV_EINVAL;
427 static size_t yaz_write_ISO8859_1(yaz_iconv_t cd, unsigned long x,
428 char **outbuf, size_t *outbytesleft)
430 /* list of two-char Unicode sequences that, when combined, are
431 equivalent to single unicode chars that can be represented in
433 Regular iconv on Linux at least does not seem to convert these,
434 but since MARC-8 to UTF-8 generates these composed sequences
435 we get a better chance of a successful MARC-8 -> ISO-8859-1
437 unsigned char *outp = (unsigned char *) *outbuf;
439 if (cd->compose_char)
442 for (i = 0; latin1_comb[i].x1; i++)
443 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
445 x = latin1_comb[i].y;
448 if (*outbytesleft < 1)
449 { /* no room. Retain compose_char and bail out */
450 cd->my_errno = YAZ_ICONV_E2BIG;
453 if (!latin1_comb[i].x1)
454 { /* not found. Just write compose_char */
455 *outp++ = (unsigned char) cd->compose_char;
457 *outbuf = (char *) outp;
459 /* compose_char used so reset it. x now holds current char */
460 cd->compose_char = 0;
463 if (x > 32 && x < 127 && cd->compose_char == 0)
465 cd->compose_char = x;
468 else if (x > 255 || x < 1)
470 cd->my_errno = YAZ_ICONV_EILSEQ;
473 else if (*outbytesleft < 1)
475 cd->my_errno = YAZ_ICONV_E2BIG;
478 *outp++ = (unsigned char) x;
480 *outbuf = (char *) outp;
484 static size_t yaz_flush_ISO8859_1(yaz_iconv_t cd,
485 char **outbuf, size_t *outbytesleft)
487 if (cd->compose_char)
489 unsigned char *outp = (unsigned char *) *outbuf;
490 if (*outbytesleft < 1)
492 cd->my_errno = YAZ_ICONV_E2BIG;
495 *outp++ = (unsigned char) cd->compose_char;
497 *outbuf = (char *) outp;
498 cd->compose_char = 0;
503 static unsigned long lookup_marc8(yaz_iconv_t cd,
504 unsigned long x, int *comb,
505 const char **page_chr)
508 char *utf8_outbuf = utf8_buf;
509 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
511 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft);
512 if (r == (size_t)(-1))
514 cd->my_errno = YAZ_ICONV_EILSEQ;
520 size_t inbytesleft, no_read_sub = 0;
524 inp = (unsigned char *) utf8_buf;
525 inbytesleft = strlen(utf8_buf);
527 x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb);
530 *page_chr = ESC "(B";
533 x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb);
536 *page_chr = ESC "(B";
539 x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb);
545 x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb);
551 x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb);
554 *page_chr = ESC "(2";
557 x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb);
560 *page_chr = ESC "(N";
563 x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb);
566 *page_chr = ESC "(Q";
569 x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb);
572 *page_chr = ESC "(3";
575 x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb);
578 *page_chr = ESC "(4";
581 x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb);
584 *page_chr = ESC "(S";
587 x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb);
590 *page_chr = ESC "$1";
593 cd->my_errno = YAZ_ICONV_EILSEQ;
598 static size_t flush_combos(yaz_iconv_t cd,
599 char **outbuf, size_t *outbytesleft)
601 unsigned long y = cd->write_marc8_last;
609 assert(cd->write_marc8_lpage);
610 if (cd->write_marc8_lpage)
612 size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
613 cd->write_marc8_lpage);
618 byte = (unsigned char )((y>>16) & 0xff);
620 out_buf[out_no++] = byte;
621 byte = (unsigned char)((y>>8) & 0xff);
623 out_buf[out_no++] = byte;
624 byte = (unsigned char )(y & 0xff);
626 out_buf[out_no++] = byte;
628 if (out_no + 2 >= *outbytesleft)
630 cd->my_errno = YAZ_ICONV_E2BIG;
631 return (size_t) (-1);
634 memcpy(*outbuf, out_buf, out_no);
636 (*outbytesleft) -= out_no;
637 if (cd->write_marc8_second_half_char)
639 *(*outbuf)++ = cd->write_marc8_second_half_char;
643 cd->write_marc8_last = 0;
644 cd->write_marc8_lpage = 0;
645 cd->write_marc8_second_half_char = 0;
649 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
650 char **outbuf, size_t *outbytesleft,
651 const char *page_chr)
653 const char **old_page_chr = &cd->write_marc8_g0;
655 /* are we going to a G1-set (such as ESC ")!E") */
656 if (page_chr && page_chr[1] == ')')
657 old_page_chr = &cd->write_marc8_g1;
659 if (!*old_page_chr || strcmp(page_chr, *old_page_chr))
662 const char *page_out = page_chr;
664 if (*outbytesleft < 8)
666 cd->my_errno = YAZ_ICONV_E2BIG;
668 return (size_t) (-1);
673 if (!strcmp(*old_page_chr, ESC "p")
674 || !strcmp(*old_page_chr, ESC "g")
675 || !strcmp(*old_page_chr, ESC "b"))
678 /* Technique 1 leave */
679 if (strcmp(page_chr, ESC "(B")) /* Not going ASCII page? */
681 /* Must leave script + enter new page */
682 plen = strlen(page_out);
683 memcpy(*outbuf, page_out, plen);
685 (*outbytesleft) -= plen;
690 *old_page_chr = page_chr;
691 plen = strlen(page_out);
692 memcpy(*outbuf, page_out, plen);
694 (*outbytesleft) -= plen;
700 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
701 char **outbuf, size_t *outbytesleft)
704 const char *page_chr = 0;
705 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
708 return (size_t) (-1);
714 size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
720 cd->write_marc8_second_half_char = 0xEC;
721 else if (x == 0x0360)
722 cd->write_marc8_second_half_char = 0xFB;
724 if (*outbytesleft <= 1)
726 cd->my_errno = YAZ_ICONV_E2BIG;
727 return (size_t) (-1);
734 size_t r = flush_combos(cd, outbuf, outbytesleft);
738 cd->write_marc8_last = y;
739 cd->write_marc8_lpage = page_chr;
744 static size_t yaz_flush_marc8(yaz_iconv_t cd,
745 char **outbuf, size_t *outbytesleft)
747 size_t r = flush_combos(cd, outbuf, outbytesleft);
750 cd->write_marc8_g1 = 0;
751 return yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, ESC "(B");
754 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
755 char **outbuf, size_t *outbytesleft)
758 for (i = 0; latin1_comb[i].x1; i++)
760 if (x == latin1_comb[i].y)
763 /* save the output pointers .. */
764 char *outbuf0 = *outbuf;
765 size_t outbytesleft0 = *outbytesleft;
766 int last_ch = cd->write_marc8_last;
767 const char *lpage = cd->write_marc8_lpage;
769 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
770 outbuf, outbytesleft);
773 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
774 outbuf, outbytesleft);
775 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
777 /* not enough room. reset output to original values */
779 *outbytesleft = outbytesleft0;
780 cd->write_marc8_last = last_ch;
781 cd->write_marc8_lpage = lpage;
786 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft);
791 static size_t yaz_write_wchar_t(yaz_iconv_t cd, unsigned long x,
792 char **outbuf, size_t *outbytesleft)
794 unsigned char *outp = (unsigned char *) *outbuf;
796 if (*outbytesleft >= sizeof(wchar_t))
799 memcpy(outp, &wch, sizeof(wch));
801 (*outbytesleft) -= sizeof(wch);
805 cd->my_errno = YAZ_ICONV_E2BIG;
808 *outbuf = (char *) outp;
813 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
815 return cd->read_handle && cd->write_handle;
818 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
820 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
822 cd->write_handle = 0;
825 cd->flush_handle = 0;
826 cd->my_errno = YAZ_ICONV_UNKNOWN;
828 /* a useful hack: if fromcode has leading @,
829 the library will not use YAZ's own conversions .. */
830 if (fromcode[0] == '@')
834 if (!yaz_matchstr(fromcode, "UTF8"))
836 cd->read_handle = yaz_read_UTF8;
837 cd->init_handle = yaz_init_UTF8;
839 else if (!yaz_matchstr(fromcode, "ISO88591"))
840 cd->read_handle = yaz_read_ISO8859_1;
841 else if (!yaz_matchstr(fromcode, "UCS4"))
842 cd->read_handle = yaz_read_UCS4;
843 else if (!yaz_matchstr(fromcode, "UCS4LE"))
844 cd->read_handle = yaz_read_UCS4LE;
845 else if (!yaz_matchstr(fromcode, "MARC8"))
846 cd->read_handle = yaz_read_marc8;
847 else if (!yaz_matchstr(fromcode, "MARC8s"))
848 cd->read_handle = yaz_read_marc8s;
849 else if (!yaz_matchstr(fromcode, "advancegreek"))
850 cd->read_handle = yaz_read_advancegreek;
851 else if (!yaz_matchstr(fromcode, "iso54281984"))
852 cd->read_handle = yaz_read_iso5428_1984;
853 else if (!yaz_matchstr(fromcode, "iso5428:1984"))
854 cd->read_handle = yaz_read_iso5428_1984;
856 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
857 cd->read_handle = yaz_read_wchar_t;
860 if (!yaz_matchstr(tocode, "UTF8"))
861 cd->write_handle = yaz_write_UTF8;
862 else if (!yaz_matchstr(tocode, "ISO88591"))
864 cd->write_handle = yaz_write_ISO8859_1;
865 cd->flush_handle = yaz_flush_ISO8859_1;
867 else if (!yaz_matchstr (tocode, "UCS4"))
868 cd->write_handle = yaz_write_UCS4;
869 else if (!yaz_matchstr(tocode, "UCS4LE"))
870 cd->write_handle = yaz_write_UCS4LE;
871 else if (!yaz_matchstr(tocode, "MARC8"))
873 cd->write_handle = yaz_write_marc8;
874 cd->flush_handle = yaz_flush_marc8;
876 else if (!yaz_matchstr(tocode, "MARC8s"))
878 cd->write_handle = yaz_write_marc8;
879 cd->flush_handle = yaz_flush_marc8;
881 else if (!yaz_matchstr(tocode, "advancegreek"))
883 cd->write_handle = yaz_write_advancegreek;
885 else if (!yaz_matchstr(tocode, "iso54281984"))
887 cd->write_handle = yaz_write_iso5428_1984;
889 else if (!yaz_matchstr(tocode, "iso5428:1984"))
891 cd->write_handle = yaz_write_iso5428_1984;
894 else if (!yaz_matchstr(tocode, "WCHAR_T"))
895 cd->write_handle = yaz_write_wchar_t;
900 if (!cd->read_handle || !cd->write_handle)
902 cd->iconv_cd = iconv_open (tocode, fromcode);
903 if (cd->iconv_cd == (iconv_t) (-1))
910 if (!cd->read_handle || !cd->write_handle)
920 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
921 char **outbuf, size_t *outbytesleft)
930 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
931 if (r == (size_t)(-1))
936 cd->my_errno = YAZ_ICONV_E2BIG;
939 cd->my_errno = YAZ_ICONV_EINVAL;
942 cd->my_errno = YAZ_ICONV_EILSEQ;
945 cd->my_errno = YAZ_ICONV_UNKNOWN;
957 cd->my_errno = YAZ_ICONV_UNKNOWN;
961 cd->comb_offset = cd->comb_size = 0;
962 cd->compose_char = 0;
964 cd->write_marc8_second_half_char = 0;
965 cd->write_marc8_last = 0;
966 cd->write_marc8_lpage = 0;
967 cd->write_marc8_g0 = ESC "(B";
968 cd->write_marc8_g1 = 0;
976 if (cd->init_handle && inbuf && *inbuf)
979 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
980 *inbytesleft, &no_read);
983 if (cd->my_errno == YAZ_ICONV_EINVAL)
988 *inbytesleft -= no_read;
994 if (!inbuf || !*inbuf)
996 if (outbuf && *outbuf)
999 r = (*cd->write_handle)(cd, cd->unget_x, outbuf, outbytesleft);
1000 if (cd->flush_handle)
1001 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
1016 no_read = cd->no_read_x;
1020 if (*inbytesleft == 0)
1022 r = *inbuf - inbuf0;
1025 x = (*cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1035 r = (*cd->write_handle)(cd, x, outbuf, outbytesleft);
1038 /* unable to write it. save it because read_handle cannot
1040 if (cd->my_errno == YAZ_ICONV_E2BIG)
1043 cd->no_read_x = no_read;
1049 *inbytesleft -= no_read;
1050 (*inbuf) += no_read;
1055 int yaz_iconv_error (yaz_iconv_t cd)
1057 return cd->my_errno;
1060 int yaz_iconv_close (yaz_iconv_t cd)
1064 iconv_close (cd->iconv_cd);
1070 void yaz_iconv_set_errno(yaz_iconv_t cd, int no)
1078 * indent-tabs-mode: nil
1080 * vim: shiftwidth=4 tabstop=8 expandtab