2 * Copyright (C) 1995-2008, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.50 2008-03-12 08:53:28 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
17 * http://www.loc.gov/marc/specifications/speccharmarc8.html
36 #include <yaz/xmalloc.h>
38 #include <yaz/snprintf.h>
41 typedef unsigned long yaz_conv_func_t(unsigned char *inp, size_t inbytesleft,
42 size_t *no_read, int *combining,
43 unsigned mask, int boffset);
46 yaz_conv_func_t yaz_marc8_42_conv;
47 yaz_conv_func_t yaz_marc8_45_conv;
48 yaz_conv_func_t yaz_marc8_67_conv;
49 yaz_conv_func_t yaz_marc8_62_conv;
50 yaz_conv_func_t yaz_marc8_70_conv;
51 yaz_conv_func_t yaz_marc8_32_conv;
52 yaz_conv_func_t yaz_marc8_4E_conv;
53 yaz_conv_func_t yaz_marc8_51_conv;
54 yaz_conv_func_t yaz_marc8_33_conv;
55 yaz_conv_func_t yaz_marc8_34_conv;
56 yaz_conv_func_t yaz_marc8_53_conv;
57 yaz_conv_func_t yaz_marc8_31_conv;
59 yaz_conv_func_t yaz_marc8r_42_conv;
60 yaz_conv_func_t yaz_marc8r_45_conv;
61 yaz_conv_func_t yaz_marc8r_67_conv;
62 yaz_conv_func_t yaz_marc8r_62_conv;
63 yaz_conv_func_t yaz_marc8r_70_conv;
64 yaz_conv_func_t yaz_marc8r_32_conv;
65 yaz_conv_func_t yaz_marc8r_4E_conv;
66 yaz_conv_func_t yaz_marc8r_51_conv;
67 yaz_conv_func_t yaz_marc8r_33_conv;
68 yaz_conv_func_t yaz_marc8r_34_conv;
69 yaz_conv_func_t yaz_marc8r_53_conv;
70 yaz_conv_func_t yaz_marc8r_31_conv;
72 struct yaz_iconv_struct {
75 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
76 size_t inbytesleft, size_t *no_read);
77 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
78 size_t inbytesleft, size_t *no_read);
79 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
80 char **outbuf, size_t *outbytesleft);
81 size_t (*flush_handle)(yaz_iconv_t cd,
82 char **outbuf, size_t *outbytesleft);
88 unsigned long comb_x[8];
89 size_t comb_no_read[8];
91 unsigned long unget_x;
95 unsigned long compose_char;
97 unsigned write_marc8_second_half_char;
98 unsigned long write_marc8_last;
100 const char *write_marc8_lpage;
101 const char *write_marc8_g0;
102 const char *write_marc8_g1;
107 unsigned long x1, x2;
110 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
111 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
112 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
113 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
114 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
115 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
116 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
117 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
118 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
119 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
120 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
121 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
122 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
123 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
124 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
125 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
126 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
127 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
128 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
129 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
130 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
131 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
132 /* omitted: 0xd7 MULTIPLICATION SIGN */
133 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
134 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
135 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
136 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
137 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
138 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
139 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
140 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
141 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
142 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
143 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
144 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
145 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
146 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
147 /* omitted: 0xe6 LATIN SMALL LETTER AE */
148 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
149 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
150 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
151 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
152 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
153 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
154 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
155 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
156 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
157 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
158 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
159 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
160 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
161 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
162 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
163 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
164 /* omitted: 0xf7 DIVISION SIGN */
165 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
166 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
167 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
168 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
169 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
170 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
171 /* omitted: 0xfe LATIN SMALL LETTER THORN */
172 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
179 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
180 char **outbuf, size_t *outbytesleft,
181 const char *page_chr);
183 static unsigned long yaz_read_ISO8859_1(yaz_iconv_t cd, unsigned char *inp,
184 size_t inbytesleft, size_t *no_read)
186 unsigned long x = inp[0];
194 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
195 size_t inbytesleft, size_t *no_read)
199 if (inbytesleft < sizeof(wchar_t))
201 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
207 memcpy (&wch, inp, sizeof(wch));
209 *no_read = sizeof(wch);
216 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
217 size_t inbytesleft, size_t *no_read,
220 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
221 size_t inbytesleft, size_t *no_read)
224 if (cd->comb_offset < cd->comb_size)
226 *no_read = cd->comb_no_read[cd->comb_offset];
227 x = cd->comb_x[cd->comb_offset];
229 /* special case for double-diacritic combining characters,
230 INVERTED BREVE and DOUBLE TILDE.
231 We'll increment the no_read counter by 1, since we want to skip over
232 the processing of the closing ligature character
234 /* this code is no longer necessary.. our handlers code in
235 yaz_marc8_?_conv (generated by charconv.tcl) now returns
236 0 and no_read=1 when a sequence does not match the input.
237 The SECOND HALFs in codetables.xml produce a non-existent
238 entry in the conversion trie.. Hence when met, the input byte is
239 skipped as it should (in yaz_iconv)
242 if (x == 0x0361 || x == 0x0360)
250 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
254 if (inbytesleft == 0 && cd->comb_size)
256 cd->my_errno = YAZ_ICONV_EINVAL;
261 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
264 cd->comb_x[cd->comb_size] = x;
265 cd->comb_no_read[cd->comb_size] = *no_read;
267 inbytesleft = inbytesleft - *no_read;
272 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
273 size_t inbytesleft, size_t *no_read)
275 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
276 if (x && cd->comb_size == 1)
278 /* For MARC8s we try to get a Latin-1 page code out of it */
280 for (i = 0; latin1_comb[i].x1; i++)
281 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
283 *no_read += cd->comb_no_read[0];
285 x = latin1_comb[i].y;
292 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
293 size_t inbytesleft, size_t *no_read,
297 while (inbytesleft > 0 && *inp == 27)
299 int *modep = &cd->g0_mode;
300 size_t inbytesleft0 = inbytesleft;
304 if (inbytesleft == 0)
306 if (*inp == '$') /* set with multiple bytes */
311 if (inbytesleft == 0)
313 if (*inp == '(' || *inp == ',') /* G0 */
318 else if (*inp == ')' || *inp == '-') /* G1 */
322 modep = &cd->g1_mode;
324 if (inbytesleft == 0)
326 if (*inp == '!') /* ANSEL is a special case */
331 if (inbytesleft == 0)
333 *modep = *inp++; /* Final character */
336 (*no_read) += inbytesleft0 - inbytesleft;
338 if (inbytesleft == 0)
340 else if (*inp == ' ')
348 size_t no_read_sub = 0;
349 int mode = *inp < 128 ? cd->g0_mode : cd->g1_mode;
354 case 'B': /* Basic ASCII */
355 case 's': /* ASCII */
356 x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
358 case 'E': /* ANSEL */
359 x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128);
361 case 'g': /* Greek */
362 x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
364 case 'b': /* Subscripts */
365 x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
367 case 'p': /* Superscripts */
368 x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
370 case '2': /* Basic Hebrew */
371 x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
373 case 'N': /* Basic Cyrillic */
374 x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
376 case 'Q': /* Extended Cyrillic */
377 x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
379 case '3': /* Basic Arabic */
380 x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
382 case '4': /* Extended Arabic */
383 x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
385 case 'S': /* Greek */
386 x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
388 case '1': /* Chinese, Japanese, Korean (EACC) */
389 x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
393 cd->my_errno = YAZ_ICONV_EILSEQ;
396 *no_read += no_read_sub;
401 cd->my_errno = YAZ_ICONV_EINVAL;
405 static size_t yaz_write_ISO8859_1(yaz_iconv_t cd, unsigned long x,
406 char **outbuf, size_t *outbytesleft)
408 /* list of two-char Unicode sequences that, when combined, are
409 equivalent to single unicode chars that can be represented in
411 Regular iconv on Linux at least does not seem to convert these,
412 but since MARC-8 to UTF-8 generates these composed sequences
413 we get a better chance of a successful MARC-8 -> ISO-8859-1
415 unsigned char *outp = (unsigned char *) *outbuf;
417 if (cd->compose_char)
420 for (i = 0; latin1_comb[i].x1; i++)
421 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
423 x = latin1_comb[i].y;
426 if (*outbytesleft < 1)
427 { /* no room. Retain compose_char and bail out */
428 cd->my_errno = YAZ_ICONV_E2BIG;
431 if (!latin1_comb[i].x1)
432 { /* not found. Just write compose_char */
433 *outp++ = (unsigned char) cd->compose_char;
435 *outbuf = (char *) outp;
437 /* compose_char used so reset it. x now holds current char */
438 cd->compose_char = 0;
441 if (x > 32 && x < 127 && cd->compose_char == 0)
443 cd->compose_char = x;
446 else if (x > 255 || x < 1)
448 cd->my_errno = YAZ_ICONV_EILSEQ;
451 else if (*outbytesleft < 1)
453 cd->my_errno = YAZ_ICONV_E2BIG;
456 *outp++ = (unsigned char) x;
458 *outbuf = (char *) outp;
462 static size_t yaz_flush_ISO8859_1(yaz_iconv_t cd,
463 char **outbuf, size_t *outbytesleft)
465 if (cd->compose_char)
467 unsigned char *outp = (unsigned char *) *outbuf;
468 if (*outbytesleft < 1)
470 cd->my_errno = YAZ_ICONV_E2BIG;
473 *outp++ = (unsigned char) cd->compose_char;
475 *outbuf = (char *) outp;
476 cd->compose_char = 0;
481 static unsigned long lookup_marc8(yaz_iconv_t cd,
482 unsigned long x, int *comb,
483 const char **page_chr)
486 char *utf8_outbuf = utf8_buf;
487 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
489 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft);
490 if (r == (size_t)(-1))
492 cd->my_errno = YAZ_ICONV_EILSEQ;
498 size_t inbytesleft, no_read_sub = 0;
502 inp = (unsigned char *) utf8_buf;
503 inbytesleft = strlen(utf8_buf);
505 x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
508 *page_chr = ESC "(B";
511 x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
514 *page_chr = ESC "(B";
517 x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
523 x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
529 x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
532 *page_chr = ESC "(2";
535 x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
538 *page_chr = ESC "(N";
541 x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
544 *page_chr = ESC "(Q";
547 x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
550 *page_chr = ESC "(3";
553 x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
556 *page_chr = ESC "(4";
559 x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
562 *page_chr = ESC "(S";
565 x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
568 *page_chr = ESC "$1";
571 cd->my_errno = YAZ_ICONV_EILSEQ;
576 static size_t flush_combos(yaz_iconv_t cd,
577 char **outbuf, size_t *outbytesleft)
579 unsigned long y = cd->write_marc8_last;
584 assert(cd->write_marc8_lpage);
585 if (cd->write_marc8_lpage)
587 size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
588 cd->write_marc8_lpage);
593 if (9 >= *outbytesleft)
595 cd->my_errno = YAZ_ICONV_E2BIG;
596 return (size_t) (-1);
598 if (cd->write_marc8_ncr)
600 yaz_snprintf(*outbuf, 9, "&#x%04x;", y);
601 (*outbytesleft) -= 8;
611 byte = (unsigned char )((y>>16) & 0xff);
613 out_buf[out_no++] = byte;
614 byte = (unsigned char)((y>>8) & 0xff);
616 out_buf[out_no++] = byte;
617 byte = (unsigned char )(y & 0xff);
619 out_buf[out_no++] = byte;
620 memcpy(*outbuf, out_buf, out_no);
622 (*outbytesleft) -= out_no;
625 if (cd->write_marc8_second_half_char)
627 *(*outbuf)++ = cd->write_marc8_second_half_char;
631 cd->write_marc8_last = 0;
632 cd->write_marc8_ncr = 0;
633 cd->write_marc8_lpage = 0;
634 cd->write_marc8_second_half_char = 0;
638 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
639 char **outbuf, size_t *outbytesleft,
640 const char *page_chr)
642 const char **old_page_chr = &cd->write_marc8_g0;
644 /* are we going to a G1-set (such as ESC ")!E") */
645 if (page_chr && page_chr[1] == ')')
646 old_page_chr = &cd->write_marc8_g1;
648 if (!*old_page_chr || strcmp(page_chr, *old_page_chr))
651 const char *page_out = page_chr;
653 if (*outbytesleft < 8)
655 cd->my_errno = YAZ_ICONV_E2BIG;
657 return (size_t) (-1);
662 if (!strcmp(*old_page_chr, ESC "p")
663 || !strcmp(*old_page_chr, ESC "g")
664 || !strcmp(*old_page_chr, ESC "b"))
667 /* Technique 1 leave */
668 if (strcmp(page_chr, ESC "(B")) /* Not going ASCII page? */
670 /* Must leave script + enter new page */
671 plen = strlen(page_out);
672 memcpy(*outbuf, page_out, plen);
674 (*outbytesleft) -= plen;
679 *old_page_chr = page_chr;
680 plen = strlen(page_out);
681 memcpy(*outbuf, page_out, plen);
683 (*outbytesleft) -= plen;
689 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
690 char **outbuf, size_t *outbytesleft,
695 const char *page_chr = 0;
696 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
700 if (loss_mode == 0 || cd->my_errno != YAZ_ICONV_EILSEQ)
701 return (size_t) (-1);
716 size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
722 cd->write_marc8_second_half_char = 0xEC;
723 else if (x == 0x0360)
724 cd->write_marc8_second_half_char = 0xFB;
726 if (*outbytesleft <= 1)
728 cd->my_errno = YAZ_ICONV_E2BIG;
729 return (size_t) (-1);
736 size_t r = flush_combos(cd, outbuf, outbytesleft);
740 cd->write_marc8_last = y;
741 cd->write_marc8_lpage = page_chr;
742 cd->write_marc8_ncr = enable_ncr;
747 static size_t yaz_flush_marc8(yaz_iconv_t cd,
748 char **outbuf, size_t *outbytesleft)
750 size_t r = flush_combos(cd, outbuf, outbytesleft);
753 cd->write_marc8_g1 = 0;
754 return yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, ESC "(B");
757 static size_t yaz_write_marc8_generic(yaz_iconv_t cd, unsigned long x,
758 char **outbuf, size_t *outbytesleft,
761 static size_t yaz_write_marc8_normal(yaz_iconv_t cd, unsigned long x,
762 char **outbuf, size_t *outbytesleft)
764 return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 0);
767 static size_t yaz_write_marc8_lossy(yaz_iconv_t cd, unsigned long x,
768 char **outbuf, size_t *outbytesleft)
770 return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 1);
773 static size_t yaz_write_marc8_lossless(yaz_iconv_t cd, unsigned long x,
774 char **outbuf, size_t *outbytesleft)
776 return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 2);
779 static size_t yaz_write_marc8_generic(yaz_iconv_t cd, unsigned long x,
780 char **outbuf, size_t *outbytesleft,
784 for (i = 0; latin1_comb[i].x1; i++)
786 if (x == latin1_comb[i].y)
789 /* save the output pointers .. */
790 char *outbuf0 = *outbuf;
791 size_t outbytesleft0 = *outbytesleft;
792 int last_ch = cd->write_marc8_last;
793 const char *lpage = cd->write_marc8_lpage;
795 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
796 outbuf, outbytesleft, loss_mode);
799 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
800 outbuf, outbytesleft, loss_mode);
801 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
803 /* not enough room. reset output to original values */
805 *outbytesleft = outbytesleft0;
806 cd->write_marc8_last = last_ch;
807 cd->write_marc8_lpage = lpage;
812 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, loss_mode);
817 static size_t yaz_write_wchar_t(yaz_iconv_t cd, unsigned long x,
818 char **outbuf, size_t *outbytesleft)
820 unsigned char *outp = (unsigned char *) *outbuf;
822 if (*outbytesleft >= sizeof(wchar_t))
825 memcpy(outp, &wch, sizeof(wch));
827 (*outbytesleft) -= sizeof(wch);
831 cd->my_errno = YAZ_ICONV_E2BIG;
834 *outbuf = (char *) outp;
839 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
841 return cd->read_handle && cd->write_handle;
844 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
846 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
848 cd->write_handle = 0;
851 cd->flush_handle = 0;
852 cd->my_errno = YAZ_ICONV_UNKNOWN;
854 /* a useful hack: if fromcode has a leading @,
855 the library does not use YAZ's own conversions .. */
856 if (fromcode[0] == '@')
860 if (!yaz_matchstr(fromcode, "UTF8"))
862 cd->read_handle = yaz_read_UTF8;
863 cd->init_handle = yaz_init_UTF8;
865 else if (!yaz_matchstr(fromcode, "ISO88591"))
866 cd->read_handle = yaz_read_ISO8859_1;
867 else if (!yaz_matchstr(fromcode, "UCS4"))
868 cd->read_handle = yaz_read_UCS4;
869 else if (!yaz_matchstr(fromcode, "UCS4LE"))
870 cd->read_handle = yaz_read_UCS4LE;
871 else if (!yaz_matchstr(fromcode, "MARC8"))
872 cd->read_handle = yaz_read_marc8;
873 else if (!yaz_matchstr(fromcode, "MARC8s"))
874 cd->read_handle = yaz_read_marc8s;
875 else if (!yaz_matchstr(fromcode, "advancegreek"))
876 cd->read_handle = yaz_read_advancegreek;
877 else if (!yaz_matchstr(fromcode, "iso54281984"))
878 cd->read_handle = yaz_read_iso5428_1984;
879 else if (!yaz_matchstr(fromcode, "iso5428:1984"))
880 cd->read_handle = yaz_read_iso5428_1984;
882 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
883 cd->read_handle = yaz_read_wchar_t;
886 if (!yaz_matchstr(tocode, "UTF8"))
887 cd->write_handle = yaz_write_UTF8;
888 else if (!yaz_matchstr(tocode, "ISO88591"))
890 cd->write_handle = yaz_write_ISO8859_1;
891 cd->flush_handle = yaz_flush_ISO8859_1;
893 else if (!yaz_matchstr (tocode, "UCS4"))
894 cd->write_handle = yaz_write_UCS4;
895 else if (!yaz_matchstr(tocode, "UCS4LE"))
896 cd->write_handle = yaz_write_UCS4LE;
897 else if (!yaz_matchstr(tocode, "MARC8"))
899 cd->write_handle = yaz_write_marc8_normal;
900 cd->flush_handle = yaz_flush_marc8;
902 else if (!yaz_matchstr(tocode, "MARC8s"))
904 cd->write_handle = yaz_write_marc8_normal;
905 cd->flush_handle = yaz_flush_marc8;
907 else if (!yaz_matchstr(tocode, "MARC8lossy"))
909 cd->write_handle = yaz_write_marc8_lossy;
910 cd->flush_handle = yaz_flush_marc8;
912 else if (!yaz_matchstr(tocode, "MARC8lossless"))
914 cd->write_handle = yaz_write_marc8_lossless;
915 cd->flush_handle = yaz_flush_marc8;
917 else if (!yaz_matchstr(tocode, "advancegreek"))
919 cd->write_handle = yaz_write_advancegreek;
921 else if (!yaz_matchstr(tocode, "iso54281984"))
923 cd->write_handle = yaz_write_iso5428_1984;
925 else if (!yaz_matchstr(tocode, "iso5428:1984"))
927 cd->write_handle = yaz_write_iso5428_1984;
930 else if (!yaz_matchstr(tocode, "WCHAR_T"))
931 cd->write_handle = yaz_write_wchar_t;
936 if (!cd->read_handle || !cd->write_handle)
938 cd->iconv_cd = iconv_open (tocode, fromcode);
939 if (cd->iconv_cd == (iconv_t) (-1))
946 if (!cd->read_handle || !cd->write_handle)
956 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
957 char **outbuf, size_t *outbytesleft)
966 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
967 if (r == (size_t)(-1))
972 cd->my_errno = YAZ_ICONV_E2BIG;
975 cd->my_errno = YAZ_ICONV_EINVAL;
978 cd->my_errno = YAZ_ICONV_EILSEQ;
981 cd->my_errno = YAZ_ICONV_UNKNOWN;
993 cd->my_errno = YAZ_ICONV_UNKNOWN;
997 cd->comb_offset = cd->comb_size = 0;
998 cd->compose_char = 0;
1000 cd->write_marc8_second_half_char = 0;
1001 cd->write_marc8_last = 0;
1002 cd->write_marc8_ncr = 0;
1003 cd->write_marc8_lpage = 0;
1004 cd->write_marc8_g0 = ESC "(B";
1005 cd->write_marc8_g1 = 0;
1013 if (cd->init_handle && inbuf && *inbuf)
1016 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1017 *inbytesleft, &no_read);
1020 if (cd->my_errno == YAZ_ICONV_EINVAL)
1025 *inbytesleft -= no_read;
1031 if (!inbuf || !*inbuf)
1033 if (outbuf && *outbuf)
1036 r = (*cd->write_handle)(cd, cd->unget_x, outbuf, outbytesleft);
1037 if (cd->flush_handle)
1038 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
1053 no_read = cd->no_read_x;
1057 if (*inbytesleft == 0)
1059 r = *inbuf - inbuf0;
1062 x = (*cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1072 r = (*cd->write_handle)(cd, x, outbuf, outbytesleft);
1075 /* unable to write it. save it because read_handle cannot
1077 if (cd->my_errno == YAZ_ICONV_E2BIG)
1080 cd->no_read_x = no_read;
1086 *inbytesleft -= no_read;
1087 (*inbuf) += no_read;
1092 int yaz_iconv_error (yaz_iconv_t cd)
1094 return cd->my_errno;
1097 int yaz_iconv_close (yaz_iconv_t cd)
1101 iconv_close (cd->iconv_cd);
1107 void yaz_iconv_set_errno(yaz_iconv_t cd, int no)
1115 * indent-tabs-mode: nil
1117 * vim: shiftwidth=4 tabstop=8 expandtab