2 * Copyright (C) 1995-2006, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.24 2006-08-04 14:35:40 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
32 #include <yaz/yaz-util.h>
/*
 * MARC-8 -> Unicode lookup routines, selected by yaz_read_marc8_comb
 * according to the active MARC-8 escape mode:
 *   1: Basic ASCII / ANSEL     2: Greek ('g')       3: Subscripts
 *   4: Superscripts            5: Basic Hebrew      6: Cyrillic
 *   7: Arabic                  8: Greek ('S')       9: EACC (CJK)
 *
 * Each routine reads from inp (inbytesleft bytes available), stores the
 * number of bytes consumed in *no_read and reports through *combining.
 */
unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);

/*
 * Reverse lookup routines with the same calling convention.  lookup_marc8
 * feeds them the UTF-8 encoding of a character and tries them in order
 * 1..9 when producing MARC-8 output.
 */
unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
73 struct yaz_iconv_struct {
76 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
77 size_t inbytesleft, size_t *no_read);
78 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
79 size_t inbytesleft, size_t *no_read);
80 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
81 char **outbuf, size_t *outbytesleft,
87 unsigned long comb_x[8];
88 size_t comb_no_read[8];
90 unsigned long unget_x;
94 unsigned long compose_char;
96 unsigned long write_marc8_comb_ch[8];
97 size_t write_marc8_comb_no;
98 unsigned long write_marc8_last;
99 const char *write_marc8_page_chr;
103 unsigned long x1, x2;
106 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
107 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
108 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
109 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
110 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
111 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
112 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
113 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
114 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
115 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
116 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
117 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
118 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
119 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
120 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
121 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
122 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
123 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
124 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
125 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
126 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
127 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
128 /* omitted: 0xd7 MULTIPLICATION SIGN */
129 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
130 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
131 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
132 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
133 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
134 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
135 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
136 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
137 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
138 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
139 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
140 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
141 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
142 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
143 /* omitted: 0xe6 LATIN SMALL LETTER AE */
144 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
145 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
146 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
147 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
148 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
149 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
150 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
151 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
152 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
153 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
154 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
155 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
156 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
157 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
158 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
159 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
160 /* omitted: 0xf7 DIVISION SIGN */
161 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
162 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
163 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
164 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
165 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
166 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
167 /* omitted: 0xfe LATIN SMALL LETTER THORN */
168 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
173 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
174 size_t inbytesleft, size_t *no_read)
176 unsigned long x = inp[0];
181 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
182 size_t inbytesleft, size_t *no_read)
191 cd->my_errno = YAZ_ICONV_EINVAL;
194 if (inp[1] != 0xbb && inp[2] == 0xbf)
201 unsigned long yaz_read_UTF8_char(unsigned char *inp,
202 size_t inbytesleft, size_t *no_read,
212 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
215 *error = YAZ_ICONV_EILSEQ;
217 else if (inp[0] <= 0xdf && inbytesleft >= 2)
219 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
225 *error = YAZ_ICONV_EILSEQ;
228 else if (inp[0] <= 0xef && inbytesleft >= 3)
230 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
237 *error = YAZ_ICONV_EILSEQ;
240 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
242 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
243 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
249 *error = YAZ_ICONV_EILSEQ;
252 else if (inp[0] <= 0xfb && inbytesleft >= 5)
254 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
255 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
262 *error = YAZ_ICONV_EILSEQ;
265 else if (inp[0] <= 0xfd && inbytesleft >= 6)
267 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
268 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
269 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
275 *error = YAZ_ICONV_EILSEQ;
281 *error = YAZ_ICONV_EINVAL;
286 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
287 size_t inbytesleft, size_t *no_read)
289 return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
292 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
293 size_t inbytesleft, size_t *no_read)
299 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
304 x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
310 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
311 size_t inbytesleft, size_t *no_read)
317 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
322 x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
329 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
330 size_t inbytesleft, size_t *no_read)
334 if (inbytesleft < sizeof(wchar_t))
336 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
342 memcpy (&wch, inp, sizeof(wch));
344 *no_read = sizeof(wch);
351 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
352 size_t inbytesleft, size_t *no_read,
355 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
356 size_t inbytesleft, size_t *no_read)
359 if (cd->comb_offset < cd->comb_size)
361 *no_read = cd->comb_no_read[cd->comb_offset];
362 x = cd->comb_x[cd->comb_offset];
364 /* special case for double-diacritic combining characters,
365 INVERTED BREVE and DOUBLE TILDE.
366 We'll increment the no_read counter by 1, since we want to skip over
367 the processing of the closing ligature character
369 /* this code is no longer necessary. Our handler code in
370 yaz_marc8_?_conv (generated by charconv.tcl) now returns
371 0 and no_read=1 when a sequence does not match the input.
372 The SECOND HALFs in codetables.xml produce a non-existent
373 entry in the conversion trie. Hence, when met, the input byte is
374 skipped as it should be (in yaz_iconv)
377 if (x == 0x0361 || x == 0x0360)
385 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
388 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
391 cd->comb_x[cd->comb_size] = x;
392 cd->comb_no_read[cd->comb_size] = *no_read;
394 inbytesleft = inbytesleft - *no_read;
399 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
400 size_t inbytesleft, size_t *no_read)
402 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
403 if (x && cd->comb_size == 1)
405 /* For MARC8s we try to get a Latin-1 page code out of it */
407 for (i = 0; latin1_comb[i].x1; i++)
408 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
410 *no_read += cd->comb_no_read[0];
412 x = latin1_comb[i].y;
419 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
420 size_t inbytesleft, size_t *no_read,
424 while(inbytesleft >= 1 && inp[0] == 27)
426 size_t inbytesleft0 = inbytesleft;
429 while(inbytesleft > 0 && strchr("(,$!", *inp))
434 if (inbytesleft <= 0)
437 cd->my_errno = YAZ_ICONV_EINVAL;
440 cd->marc8_esc_mode = *inp++;
442 (*no_read) += inbytesleft0 - inbytesleft;
444 if (inbytesleft <= 0)
449 size_t no_read_sub = 0;
452 switch(cd->marc8_esc_mode)
454 case 'B': /* Basic ASCII */
455 case 'E': /* ANSEL */
456 case 's': /* ASCII */
457 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
459 case 'g': /* Greek */
460 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
462 case 'b': /* Subscripts */
463 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
465 case 'p': /* Superscripts */
466 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
468 case '2': /* Basic Hebrew */
469 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
471 case 'N': /* Basic Cyrillic */
472 case 'Q': /* Extended Cyrillic */
473 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
475 case '3': /* Basic Arabic */
476 case '4': /* Extended Arabic */
477 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
479 case 'S': /* Greek */
480 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
482 case '1': /* Chinese, Japanese, Korean (EACC) */
483 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
487 cd->my_errno = YAZ_ICONV_EILSEQ;
490 *no_read += no_read_sub;
495 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
496 char **outbuf, size_t *outbytesleft,
499 unsigned char *outp = (unsigned char *) *outbuf;
501 if (x <= 0x7f && *outbytesleft >= 1)
503 *outp++ = (unsigned char) x;
506 else if (x <= 0x7ff && *outbytesleft >= 2)
508 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
509 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
510 (*outbytesleft) -= 2;
512 else if (x <= 0xffff && *outbytesleft >= 3)
514 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
515 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
516 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
517 (*outbytesleft) -= 3;
519 else if (x <= 0x1fffff && *outbytesleft >= 4)
521 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
522 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
523 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
524 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
525 (*outbytesleft) -= 4;
527 else if (x <= 0x3ffffff && *outbytesleft >= 5)
529 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
530 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
531 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
532 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
533 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
534 (*outbytesleft) -= 5;
536 else if (*outbytesleft >= 6)
538 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
539 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
540 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
541 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
542 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
543 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
544 (*outbytesleft) -= 6;
548 cd->my_errno = YAZ_ICONV_E2BIG; /* not room for output */
551 *outbuf = (char *) outp;
556 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
557 char **outbuf, size_t *outbytesleft,
560 /* list of two char unicode sequence that, when combined, are
561 equivalent to single unicode chars that can be represented in
563 Regular iconv on Linux at least does not seem to convert these,
564 but since MARC-8 to UTF-8 generates these composed sequence
565 we get a better chance of a successful MARC-8 -> ISO-8859-1
567 unsigned char *outp = (unsigned char *) *outbuf;
569 if (cd->compose_char)
572 for (i = 0; latin1_comb[i].x1; i++)
573 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
575 x = latin1_comb[i].y;
578 if (*outbytesleft < 1)
579 { /* no room. Retain compose_char and bail out */
580 cd->my_errno = YAZ_ICONV_E2BIG;
583 if (!latin1_comb[i].x1)
584 { /* not found. Just write compose_char */
585 *outp++ = (unsigned char) cd->compose_char;
587 *outbuf = (char *) outp;
589 /* compose_char used so reset it. x now holds current char */
590 cd->compose_char = 0;
593 if (!last && x > 32 && x < 127 && cd->compose_char == 0)
595 cd->compose_char = x;
598 else if (x > 255 || x < 1)
600 cd->my_errno = YAZ_ICONV_EILSEQ;
603 else if (*outbytesleft < 1)
605 cd->my_errno = YAZ_ICONV_E2BIG;
608 *outp++ = (unsigned char) x;
610 *outbuf = (char *) outp;
615 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
616 char **outbuf, size_t *outbytesleft,
619 unsigned char *outp = (unsigned char *) *outbuf;
620 if (*outbytesleft >= 4)
622 *outp++ = (unsigned char) (x>>24);
623 *outp++ = (unsigned char) (x>>16);
624 *outp++ = (unsigned char) (x>>8);
625 *outp++ = (unsigned char) x;
626 (*outbytesleft) -= 4;
630 cd->my_errno = YAZ_ICONV_E2BIG;
633 *outbuf = (char *) outp;
637 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
638 char **outbuf, size_t *outbytesleft,
641 unsigned char *outp = (unsigned char *) *outbuf;
642 if (*outbytesleft >= 4)
644 *outp++ = (unsigned char) x;
645 *outp++ = (unsigned char) (x>>8);
646 *outp++ = (unsigned char) (x>>16);
647 *outp++ = (unsigned char) (x>>24);
648 (*outbytesleft) -= 4;
652 cd->my_errno = YAZ_ICONV_E2BIG;
655 *outbuf = (char *) outp;
659 static unsigned long lookup_marc8(yaz_iconv_t cd,
660 unsigned long x, int *comb,
661 const char **page_chr)
664 char *utf8_outbuf = utf8_buf;
665 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
667 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
668 if (r == (size_t)(-1))
670 cd->my_errno = YAZ_ICONV_EILSEQ;
676 size_t inbytesleft, no_read_sub = 0;
680 inp = (unsigned char *) utf8_buf;
681 inbytesleft = strlen(utf8_buf);
683 x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
686 *page_chr = "\033(B";
689 x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
695 x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
701 x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
707 x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
710 *page_chr = "\033(2";
713 x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
716 *page_chr = "\033(N";
719 x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
722 *page_chr = "\033(3";
725 x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
728 *page_chr = "\033(S";
731 x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
734 *page_chr = "\033(1";
737 cd->my_errno = YAZ_ICONV_EILSEQ;
742 static size_t flush_combos(yaz_iconv_t cd,
743 char **outbuf, size_t *outbytesleft)
745 unsigned long y = cd->write_marc8_last;
746 unsigned char byte, second_half = 0;
748 size_t i, out_no = 0;
753 byte = (unsigned char )((y>>16) & 0xff);
755 out_buf[out_no++] = byte;
756 byte = (unsigned char)((y>>8) & 0xff);
758 out_buf[out_no++] = byte;
759 byte = (unsigned char )(y & 0xff);
761 out_buf[out_no++] = byte;
763 if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
765 cd->my_errno = YAZ_ICONV_E2BIG;
766 return (size_t) (-1);
769 for (i = 0; i < cd->write_marc8_comb_no; i++)
771 /* all MARC-8 combined characters are simple bytes */
772 byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
775 else if (byte == 0xFA)
781 memcpy(*outbuf, out_buf, out_no);
783 (*outbytesleft) -= out_no;
786 *(*outbuf)++ = second_half;
790 cd->write_marc8_last = 0;
791 cd->write_marc8_comb_no = 0;
795 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
796 char **outbuf, size_t *outbytesleft,
800 const char *page_chr = 0;
801 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
804 return (size_t) (-1);
808 if (cd->write_marc8_comb_no < 6)
809 cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
813 size_t r = flush_combos(cd, outbuf, outbytesleft);
816 if (strcmp(page_chr, cd->write_marc8_page_chr))
818 size_t plen = strlen(page_chr);
820 if (*outbytesleft < plen)
822 cd->my_errno = YAZ_ICONV_E2BIG;
823 return (size_t) (-1);
825 memcpy(*outbuf, page_chr, plen);
827 (*outbytesleft) -= plen;
828 cd->write_marc8_page_chr = page_chr;
830 cd->write_marc8_last = y;
834 size_t r = flush_combos(cd, outbuf, outbytesleft);
838 cd->write_marc8_comb_no--;
840 cd->write_marc8_last = 0;
847 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
848 char **outbuf, size_t *outbytesleft,
852 for (i = 0; latin1_comb[i].x1; i++)
854 if (x == latin1_comb[i].y)
857 /* save the output pointers .. */
858 char *outbuf0 = *outbuf;
859 size_t outbytesleft0 = *outbytesleft;
860 int last_ch = cd->write_marc8_last;
862 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
863 outbuf, outbytesleft, 0);
866 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
867 outbuf, outbytesleft, last);
868 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
870 /* not enough room. reset output to original values */
872 *outbytesleft = outbytesleft0;
873 cd->write_marc8_last = last_ch;
878 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
883 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
884 char **outbuf, size_t *outbytesleft,
887 unsigned char *outp = (unsigned char *) *outbuf;
889 if (*outbytesleft >= sizeof(wchar_t))
892 memcpy(outp, &wch, sizeof(wch));
894 (*outbytesleft) -= sizeof(wch);
898 cd->my_errno = YAZ_ICONV_E2BIG;
901 *outbuf = (char *) outp;
906 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
908 return cd->read_handle && cd->write_handle;
911 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
913 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
915 cd->write_handle = 0;
918 cd->my_errno = YAZ_ICONV_UNKNOWN;
919 cd->marc8_esc_mode = 'B';
920 cd->comb_offset = cd->comb_size = 0;
921 cd->compose_char = 0;
923 cd->write_marc8_comb_no = 0;
924 cd->write_marc8_last = 0;
925 cd->write_marc8_page_chr = "\033(B";
927 /* a useful hack: if fromcode has a leading @,
928 the library will not use YAZ's own conversions .. */
929 if (fromcode[0] == '@')
933 if (!yaz_matchstr(fromcode, "UTF8"))
935 cd->read_handle = yaz_read_UTF8;
936 cd->init_handle = yaz_init_UTF8;
938 else if (!yaz_matchstr(fromcode, "ISO88591"))
939 cd->read_handle = yaz_read_ISO8859_1;
940 else if (!yaz_matchstr(fromcode, "UCS4"))
941 cd->read_handle = yaz_read_UCS4;
942 else if (!yaz_matchstr(fromcode, "UCS4LE"))
943 cd->read_handle = yaz_read_UCS4LE;
944 else if (!yaz_matchstr(fromcode, "MARC8"))
945 cd->read_handle = yaz_read_marc8;
946 else if (!yaz_matchstr(fromcode, "MARC8s"))
947 cd->read_handle = yaz_read_marc8s;
949 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
950 cd->read_handle = yaz_read_wchar_t;
953 if (!yaz_matchstr(tocode, "UTF8"))
954 cd->write_handle = yaz_write_UTF8;
955 else if (!yaz_matchstr(tocode, "ISO88591"))
956 cd->write_handle = yaz_write_ISO8859_1;
957 else if (!yaz_matchstr (tocode, "UCS4"))
958 cd->write_handle = yaz_write_UCS4;
959 else if (!yaz_matchstr(tocode, "UCS4LE"))
960 cd->write_handle = yaz_write_UCS4LE;
961 else if (!yaz_matchstr(tocode, "MARC8"))
962 cd->write_handle = yaz_write_marc8;
963 else if (!yaz_matchstr(tocode, "MARC8s"))
964 cd->write_handle = yaz_write_marc8;
966 else if (!yaz_matchstr(tocode, "WCHAR_T"))
967 cd->write_handle = yaz_write_wchar_t;
972 if (!cd->read_handle || !cd->write_handle)
974 cd->iconv_cd = iconv_open (tocode, fromcode);
975 if (cd->iconv_cd == (iconv_t) (-1))
982 if (!cd->read_handle || !cd->write_handle)
992 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
993 char **outbuf, size_t *outbytesleft)
1002 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1003 if (r == (size_t)(-1))
1005 switch (yaz_errno())
1008 cd->my_errno = YAZ_ICONV_E2BIG;
1011 cd->my_errno = YAZ_ICONV_EINVAL;
1014 cd->my_errno = YAZ_ICONV_EILSEQ;
1017 cd->my_errno = YAZ_ICONV_UNKNOWN;
1023 if (inbuf == 0 || *inbuf == 0)
1026 cd->my_errno = YAZ_ICONV_UNKNOWN;
1033 if (cd->init_handle)
1036 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1037 *inbytesleft, &no_read);
1040 if (cd->my_errno == YAZ_ICONV_EINVAL)
1045 *inbytesleft -= no_read;
1057 if (*inbytesleft == 0)
1059 r = *inbuf - inbuf0;
1064 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1075 no_read = cd->no_read_x;
1079 r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1080 (*inbytesleft - no_read) == 0 ? 1 : 0);
1083 /* unable to write it. save it because read_handle cannot
1085 if (cd->my_errno == YAZ_ICONV_E2BIG)
1088 cd->no_read_x = no_read;
1094 *inbytesleft -= no_read;
1095 (*inbuf) += no_read;
1100 int yaz_iconv_error (yaz_iconv_t cd)
1102 return cd->my_errno;
1105 int yaz_iconv_close (yaz_iconv_t cd)
1109 iconv_close (cd->iconv_cd);
1118 * indent-tabs-mode: nil
1120 * vim: shiftwidth=4 tabstop=8 expandtab