2 * Copyright (C) 1995-2006, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.20 2006-04-19 23:46:15 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
32 #include <yaz/yaz-util.h>
/* Prototypes for the nine MARC-8 input conversion routines.  They are
   called from yaz_read_marc8_comb in this file, which picks one according
   to the current escape mode (cd->marc8_esc_mode).  A comment inside
   yaz_read_marc8 states that the yaz_marc8_?_conv handlers are generated
   by charconv.tcl and return 0 with no_read=1 when no sequence matches.
   Each routine receives the input bytes and the number of bytes left,
   reports through *no_read and *combining, and returns an unsigned long
   (presumably the decoded code point -- confirm against the generated
   code). */
34 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
35 size_t *no_read, int *combining);
36 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
37 size_t *no_read, int *combining);
38 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
39 size_t *no_read, int *combining);
40 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
41 size_t *no_read, int *combining);
42 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
43 size_t *no_read, int *combining);
44 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
45 size_t *no_read, int *combining);
46 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
47 size_t *no_read, int *combining);
48 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
49 size_t *no_read, int *combining);
50 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
51 size_t *no_read, int *combining);
/* Prototypes for the nine yaz_marc8r_N_conv routines.  lookup_marc8 in
   this file encodes a character as UTF-8 and passes that buffer to these
   routines one after another, associating each with a MARC-8 page escape
   sequence.  Same signature as the yaz_marc8_N_conv family.
   NOTE(review): the "r" presumably stands for the reverse (to MARC-8)
   direction -- confirm against the generator. */
54 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
55 size_t *no_read, int *combining);
56 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
57 size_t *no_read, int *combining);
58 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
59 size_t *no_read, int *combining);
60 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
61 size_t *no_read, int *combining);
62 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
63 size_t *no_read, int *combining);
64 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
65 size_t *no_read, int *combining);
66 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
67 size_t *no_read, int *combining);
68 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
69 size_t *no_read, int *combining);
70 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
71 size_t *no_read, int *combining);
/* Conversion descriptor state.  The function pointers select the built-in
   input initialiser, reader and writer; the other members shown here are
   scratch state used by the MARC-8 reader, the ISO-8859-1 writer and the
   MARC-8 writer in this file. */
73 struct yaz_iconv_struct {
/* optional handler invoked by yaz_iconv on the input before characters
   are read (yaz_iconv_open assigns yaz_init_UTF8 for UTF-8 input) */
76 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
77 size_t inbytesleft, size_t *no_read);
/* decodes one character from inbuf and reports the bytes used in *no_read */
78 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
79 size_t inbytesleft, size_t *no_read);
/* encodes one character x into *outbuf, consuming *outbytesleft */
80 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
81 char **outbuf, size_t *outbytesleft,
/* MARC-8 reader queue: up to 8 decoded characters and the byte count each
   one consumed (filled and drained by yaz_read_marc8) */
87 unsigned long comb_x[8];
88 size_t comb_no_read[8];
/* NOTE(review): presumably the character kept by yaz_iconv when a write
   fails with E2BIG; its assignment is not visible here -- confirm */
90 unsigned long unget_x;
/* ISO-8859-1 writer: character held back so a following combining mark
   can be merged with it (see yaz_write_ISO8859_1) */
94 unsigned long compose_char;
/* MARC-8 writer: queued combining characters and their count, the pending
   character awaiting flush_combos, and the escape sequence of the page
   most recently written */
96 unsigned long write_marc8_comb_ch[8];
97 size_t write_marc8_comb_no;
98 unsigned long write_marc8_last;
99 const char *write_marc8_page_chr;
/* Read handler for ISO-8859-1 input: the character value is simply the
   first input byte, since Latin-1 bytes map one-to-one onto the first 256
   code points. */
102 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
103 size_t inbytesleft, size_t *no_read)
105 unsigned long x = inp[0];
/* Initialisation handler for UTF-8 input: recognises a leading byte order
   mark so that it can be skipped via *no_read.
   The UTF-8 BOM is the byte sequence EF BB BF, so the second byte must be
   equal to 0xbb and the third equal to 0xbf.  The test used to require
   inp[1] != 0xbb, which can never be true for a real BOM, so a BOM was
   never recognised (and some non-BOM sequences ending in 0xbf were).
   cd->my_errno is set to YAZ_ICONV_EINVAL on the error path that precedes
   the BOM test (presumably when fewer than three bytes are available --
   confirm). */
110 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
111 size_t inbytesleft, size_t *no_read)
120 cd->my_errno = YAZ_ICONV_EINVAL;
123 if (inp[1] == 0xbb && inp[2] == 0xbf)
/* Read handler for UTF-8 input: decodes one character starting at inp[0].
   The lead byte selects the sequence length (2 to 6 bytes in the branches
   below); each branch also requires that many bytes to be available.
   Continuation bytes contribute their low 6 bits.  Errors are reported
   through cd->my_errno: YAZ_ICONV_EILSEQ for an invalid sequence and
   YAZ_ICONV_EINVAL on the final fall-through path (presumably a valid lead
   byte with too few bytes following -- confirm). */
130 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
131 size_t inbytesleft, size_t *no_read)
/* 0x80..0xbf are continuation bytes and 0xfe/0xff never occur in UTF-8,
   so none of them can start a sequence */
140 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
143 cd->my_errno = YAZ_ICONV_EILSEQ;
/* two-byte sequence: 5 bits from the lead byte, 6 from the next */
145 else if (inp[0] <= 0xdf && inbytesleft >= 2)
147 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
153 cd->my_errno = YAZ_ICONV_EILSEQ;
/* three-byte sequence: 4 bits from the lead byte */
156 else if (inp[0] <= 0xef && inbytesleft >= 3)
158 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
165 cd->my_errno = YAZ_ICONV_EILSEQ;
/* four-byte sequence: 3 bits from the lead byte */
168 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
170 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
171 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
177 cd->my_errno = YAZ_ICONV_EILSEQ;
/* five-byte sequence: 2 bits from the lead byte */
180 else if (inp[0] <= 0xfb && inbytesleft >= 5)
182 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
183 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
190 cd->my_errno = YAZ_ICONV_EILSEQ;
/* six-byte sequence: 1 bit from the lead byte */
193 else if (inp[0] <= 0xfd && inbytesleft >= 6)
195 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
196 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
197 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
203 cd->my_errno = YAZ_ICONV_EILSEQ;
209 cd->my_errno = YAZ_ICONV_EINVAL;
/* Read handler for big-endian UCS-4 input: assembles one character from
   four bytes, most significant byte first.  Sets cd->my_errno to
   YAZ_ICONV_EINVAL when the input is incomplete.
   The most significant byte is converted to unsigned long before being
   shifted: an unsigned char promotes to int, and shifting a value of 0x80
   or more left by 24 would overflow a 32-bit int (undefined behaviour,
   and in practice a sign-extended result where unsigned long is 64 bits
   wide). */
214 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
215 size_t inbytesleft, size_t *no_read)
221 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
226 x = ((unsigned long) inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
/* Read handler for little-endian UCS-4 input: assembles one character
   from four bytes, least significant byte first.  Sets cd->my_errno to
   YAZ_ICONV_EINVAL when the input is incomplete.
   The most significant byte (inp[3]) is converted to unsigned long before
   being shifted: an unsigned char promotes to int, and shifting a value of
   0x80 or more left by 24 would overflow a 32-bit int (undefined
   behaviour, and in practice a sign-extended result where unsigned long is
   64 bits wide). */
232 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
233 size_t inbytesleft, size_t *no_read)
239 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
244 x = ((unsigned long) inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
/* Read handler for native wchar_t input.  Requires at least
   sizeof(wchar_t) bytes, otherwise cd->my_errno is set to
   YAZ_ICONV_EINVAL.  The character is copied out with memcpy (so the input
   pointer needs no particular alignment) and *no_read is set to
   sizeof(wchar_t). */
251 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
252 size_t inbytesleft, size_t *no_read)
256 if (inbytesleft < sizeof(wchar_t))
258 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
264 memcpy (&wch, inp, sizeof(wch));
266 *no_read = sizeof(wch);
/* Forward declaration: yaz_read_marc8 calls this helper, which is defined
   after it. */
273 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
274 size_t inbytesleft, size_t *no_read,
/* Read handler for MARC-8 input.  Characters are served from a small
   queue in the descriptor: while cd->comb_offset is below cd->comb_size
   the next queued character (cd->comb_x) and the number of input bytes it
   consumed (cd->comb_no_read) are handed out.  Otherwise up to 8
   characters are read with yaz_read_marc8_comb and stored in the queue
   together with their byte counts, advancing through the input as it goes.
   NOTE(review): the queue presumably exists so that combining marks, which
   precede their base character in MARC-8, can be delivered after it -- the
   reordering itself is not visible here; confirm. */
277 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
278 size_t inbytesleft, size_t *no_read)
281 if (cd->comb_offset < cd->comb_size)
283 *no_read = cd->comb_no_read[cd->comb_offset];
284 x = cd->comb_x[cd->comb_offset];
286 /* special case for double-diacritic combining characters,
287 INVERTED BREVE and DOUBLE TILDE.
288 We'll increment the no_read counter by 1, since we want to skip over
289 the processing of the closing ligature character
291 /* this code is no longer necessary.. our handlers code in
292 yaz_marc8_?_conv (generated by charconv.tcl) now returns
293 0 and no_read=1 when a sequence does not match the input.
294 The SECOND HALFs in codetables.xml produces a non-existent
295 entry in the conversion trie.. Hence when met, the input byte is
296 skipped as it should (in yaz_iconv)
299 if (x == 0x0361 || x == 0x0360)
307 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
310 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
313 cd->comb_x[cd->comb_size] = x;
314 cd->comb_no_read[cd->comb_size] = *no_read;
316 inbytesleft = inbytesleft - *no_read;
/* Decodes one MARC-8 character, first consuming any escape sequences.
   The selected character set is remembered in cd->marc8_esc_mode and
   determines which yaz_marc8_N_conv routine decodes the bytes that follow.
   *no_read accumulates both the escape bytes and the bytes used by the
   conversion routine; the combining flag is passed through to that
   routine.  cd->my_errno is set to YAZ_ICONV_EINVAL when the input ends
   inside an escape sequence and to YAZ_ICONV_EILSEQ on the remaining
   switch path (presumably the default case for an unknown escape mode --
   confirm). */
321 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
322 size_t inbytesleft, size_t *no_read,
/* 27 is ESC: process every escape sequence that precedes the character */
326 while(inbytesleft >= 1 && inp[0] == 27)
328 size_t inbytesleft0 = inbytesleft;
/* skip the intermediate characters of the escape sequence */
331 while(inbytesleft > 0 && strchr("(,$!", *inp))
336 if (inbytesleft <= 0)
339 cd->my_errno = YAZ_ICONV_EINVAL;
/* the final byte of the escape sequence names the character set */
342 cd->marc8_esc_mode = *inp++;
344 (*no_read) += inbytesleft0 - inbytesleft;
346 if (inbytesleft <= 0)
351 size_t no_read_sub = 0;
354 switch(cd->marc8_esc_mode)
356 case 'B': /* Basic ASCII */
357 case 'E': /* ANSEL */
358 case 's': /* ASCII */
359 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
361 case 'g': /* Greek */
362 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
364 case 'b': /* Subscripts */
365 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
367 case 'p': /* Superscripts */
368 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
370 case '2': /* Basic Hebrew */
371 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
373 case 'N': /* Basic Cyrillic */
374 case 'Q': /* Extended Cyrillic */
375 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
377 case '3': /* Basic Arabic */
378 case '4': /* Extended Arabic */
379 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
381 case 'S': /* Greek */
382 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
384 case '1': /* Chinese, Japanese, Korean (EACC) */
385 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
389 cd->my_errno = YAZ_ICONV_EILSEQ;
392 *no_read += no_read_sub;
/* Write handler for UTF-8 output: encodes character x at *outbuf using
   the shortest form that fits its value (1 to 6 bytes), provided that
   many bytes remain in *outbytesleft.  The lead byte carries the length
   marker (0xc0, 0xe0, 0xf0, 0xf8, 0xfc) plus the top bits of x; every
   following byte is 0x80 plus the next 6 bits.  *outbytesleft is reduced
   by the number of bytes written and *outbuf is advanced past them.  When
   no branch applies, cd->my_errno is set to YAZ_ICONV_E2BIG. */
397 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
398 char **outbuf, size_t *outbytesleft,
401 unsigned char *outp = (unsigned char *) *outbuf;
/* one byte: 7-bit values are written as-is */
403 if (x <= 0x7f && *outbytesleft >= 1)
405 *outp++ = (unsigned char) x;
/* two bytes: values up to 11 bits */
408 else if (x <= 0x7ff && *outbytesleft >= 2)
410 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
411 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
412 (*outbytesleft) -= 2;
/* three bytes: values up to 16 bits */
414 else if (x <= 0xffff && *outbytesleft >= 3)
416 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
417 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
418 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
419 (*outbytesleft) -= 3;
/* four bytes: values up to 21 bits */
421 else if (x <= 0x1fffff && *outbytesleft >= 4)
423 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
424 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
425 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
426 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
427 (*outbytesleft) -= 4;
/* five bytes: values up to 26 bits */
429 else if (x <= 0x3ffffff && *outbytesleft >= 5)
431 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
432 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
433 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
434 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
435 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
436 (*outbytesleft) -= 5;
/* six bytes: taken whenever six bytes of room remain and no shorter
   branch matched */
438 else if (*outbytesleft >= 6)
440 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
441 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
442 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
443 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
444 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
445 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
446 (*outbytesleft) -= 6;
450 cd->my_errno = YAZ_ICONV_E2BIG; /* not room for output */
453 *outbuf = (char *) outp;
/* Write handler for ISO-8859-1 output with simple composition support.
   A character in the range 33..126 that is not the last one of the input
   is held back in cd->compose_char instead of being written.  When the
   next character arrives, the pair (held character, new character) is
   looked up in the latin1_comb table below; on a match the single
   precomposed Latin-1 character replaces the new character, otherwise the
   held character is written unchanged first.  Characters outside 1..255
   yield YAZ_ICONV_EILSEQ and a full output buffer yields YAZ_ICONV_E2BIG
   (in which case the held character is retained). */
458 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
459 char **outbuf, size_t *outbytesleft,
462 /* list of two char unicode sequence that, when combined, are
463 equivalent to single unicode chars that can be represented in
465 Regular iconv on Linux at least does not seem to convert these,
466 but since MARC-8 to UTF-8 generates these composed sequence
467 we get a better chance of a successful MARC-8 -> ISO-8859-1
470 unsigned long x1, x2;
473 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
474 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
475 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
476 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
477 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
478 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
479 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
480 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
481 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
482 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
483 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
484 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
485 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
486 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
487 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
488 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
489 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
490 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
491 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
492 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
493 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
494 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
495 /* omitted: 0xd7 MULTIPLICATION SIGN */
496 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
497 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
498 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
499 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
500 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
501 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
502 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
503 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
504 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
505 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
506 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
507 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
508 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
509 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
510 /* omitted: 0xe6 LATIN SMALL LETTER AE */
511 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
512 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
513 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
514 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
515 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
516 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
517 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
518 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
519 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
520 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
521 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
522 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
523 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
524 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
525 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
526 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
527 /* omitted: 0xf7 DIVISION SIGN */
528 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
529 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
530 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
531 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
532 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
533 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
534 /* omitted: 0xfe LATIN SMALL LETTER THORN */
535 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
539 unsigned char *outp = (unsigned char *) *outbuf;
/* a character is being held back: try to merge it with x */
541 if (cd->compose_char)
/* the table ends with an entry whose x1 is zero */
544 for (i = 0; latin1_comb[i].x1; i++)
545 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
547 x = latin1_comb[i].y;
550 if (*outbytesleft < 1)
551 { /* no room. Retain compose_char and bail out */
552 cd->my_errno = YAZ_ICONV_E2BIG;
555 if (!latin1_comb[i].x1)
556 { /* not found. Just write compose_char */
557 *outp++ = (unsigned char) cd->compose_char;
559 *outbuf = (char *) outp;
561 /* compose_char used so reset it. x now holds current char */
562 cd->compose_char = 0;
/* hold back a printable ASCII character (unless it is the last one) in
   case the next character combines with it */
565 if (!last && x > 32 && x < 127 && cd->compose_char == 0)
567 cd->compose_char = x;
/* not representable in ISO-8859-1 */
570 else if (x > 255 || x < 1)
572 cd->my_errno = YAZ_ICONV_EILSEQ;
575 else if (*outbytesleft < 1)
577 cd->my_errno = YAZ_ICONV_E2BIG;
580 *outp++ = (unsigned char) x;
582 *outbuf = (char *) outp;
/* Write handler for big-endian UCS-4 output: stores the low 32 bits of x
   as four bytes, most significant first, when at least four bytes remain.
   *outbytesleft is reduced by 4 and *outbuf advanced.  Otherwise
   cd->my_errno is set to YAZ_ICONV_E2BIG. */
587 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
588 char **outbuf, size_t *outbytesleft,
591 unsigned char *outp = (unsigned char *) *outbuf;
592 if (*outbytesleft >= 4)
594 *outp++ = (unsigned char) (x>>24);
595 *outp++ = (unsigned char) (x>>16);
596 *outp++ = (unsigned char) (x>>8);
597 *outp++ = (unsigned char) x;
598 (*outbytesleft) -= 4;
602 cd->my_errno = YAZ_ICONV_E2BIG;
605 *outbuf = (char *) outp;
/* Write handler for little-endian UCS-4 output: stores the low 32 bits of
   x as four bytes, least significant first, when at least four bytes
   remain.  *outbytesleft is reduced by 4 and *outbuf advanced.  Otherwise
   cd->my_errno is set to YAZ_ICONV_E2BIG. */
609 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
610 char **outbuf, size_t *outbytesleft,
613 unsigned char *outp = (unsigned char *) *outbuf;
614 if (*outbytesleft >= 4)
616 *outp++ = (unsigned char) x;
617 *outp++ = (unsigned char) (x>>8);
618 *outp++ = (unsigned char) (x>>16);
619 *outp++ = (unsigned char) (x>>24);
620 (*outbytesleft) -= 4;
624 cd->my_errno = YAZ_ICONV_E2BIG;
627 *outbuf = (char *) outp;
/* Looks up the MARC-8 representation of character x.  The character is
   first encoded as UTF-8 into a local buffer with yaz_write_UTF8 (failure
   is reported as YAZ_ICONV_EILSEQ).  That buffer is then offered to
   yaz_marc8r_1_conv .. yaz_marc8r_9_conv in turn, and *page_chr is set to
   the escape sequence of the page associated with the routine in use.
   *comb receives the combining flag from the conversion routine.  If the
   last path is reached, cd->my_errno is set to YAZ_ICONV_EILSEQ
   (presumably meaning no page contains the character -- confirm). */
631 static unsigned long lookup_marc8(yaz_iconv_t cd,
632 unsigned long x, int *comb,
633 const char **page_chr)
636 char *utf8_outbuf = utf8_buf;
/* one byte is kept in reserve; the buffer is later measured with strlen */
637 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
639 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
640 if (r == (size_t)(-1))
642 cd->my_errno = YAZ_ICONV_EILSEQ;
648 size_t inbytesleft, no_read_sub = 0;
652 inp = (unsigned char *) utf8_buf;
653 inbytesleft = strlen(utf8_buf);
655 x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
658 *page_chr = "\033(B";
661 x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
667 x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
673 x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
679 x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
682 *page_chr = "\033(2";
685 x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
688 *page_chr = "\033(N";
691 x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
694 *page_chr = "\033(3";
697 x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
700 *page_chr = "\033(S";
703 x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
706 *page_chr = "\033(1";
709 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Writes the pending MARC-8 character (cd->write_marc8_last) together
   with the combining characters queued in cd->write_marc8_comb_ch.
   The pending value is split into up to three bytes (bits 16-23, 8-15 and
   0-7) collected in a local buffer.  If the output cannot hold those
   bytes plus the queued combining characters plus one extra byte, the
   function fails with YAZ_ICONV_E2BIG and returns (size_t) -1.  The queued
   combining characters are then processed, the collected bytes are copied
   to *outbuf, and a "second half" byte is written.  Finally the pending
   character and the queue are cleared.
   NOTE(review): 0xFA is presumably the first half of a double-width
   diacritic whose second half must follow the base character -- the
   assignments to second_half are not visible here; confirm. */
714 static size_t flush_combos(yaz_iconv_t cd,
715 char **outbuf, size_t *outbytesleft)
717 unsigned long y = cd->write_marc8_last;
718 unsigned char byte, second_half = 0;
720 size_t i, out_no = 0;
725 byte = (unsigned char )((y>>16) & 0xff);
727 out_buf[out_no++] = byte;
728 byte = (unsigned char)((y>>8) & 0xff);
730 out_buf[out_no++] = byte;
731 byte = (unsigned char )(y & 0xff);
733 out_buf[out_no++] = byte;
/* room needed: base bytes + combining characters + one second-half byte */
735 if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
737 cd->my_errno = YAZ_ICONV_E2BIG;
738 return (size_t) (-1);
741 for (i = 0; i < cd->write_marc8_comb_no; i++)
743 byte = cd->write_marc8_comb_ch[i];
746 else if (byte == 0xFA)
752 memcpy(*outbuf, out_buf, out_no);
754 (*outbytesleft) -= out_no;
757 *(*outbuf)++ = second_half;
761 cd->write_marc8_last = 0;
762 cd->write_marc8_comb_no = 0;
/* Write handler for MARC-8 output.  The character is translated with
   lookup_marc8, which also yields the combining flag and the escape
   sequence of the page the character belongs to.  Combining characters
   are queued in cd->write_marc8_comb_ch (at most 6 are stored).  For other
   characters the previously pending character and queue are flushed with
   flush_combos; if the page differs from cd->write_marc8_page_chr the new
   escape sequence is written (YAZ_ICONV_E2BIG and (size_t) -1 when it does
   not fit) and remembered; the character then becomes the new pending one
   in cd->write_marc8_last.
   NOTE(review): the second flush_combos call and the state adjustments
   after it presumably handle the final character of the input and a
   failed flush -- the guarding conditions are not visible here; confirm. */
766 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
767 char **outbuf, size_t *outbytesleft,
771 const char *page_chr = 0;
772 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
775 return (size_t) (-1);
779 if (cd->write_marc8_comb_no < 6)
780 cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
784 size_t r = flush_combos(cd, outbuf, outbytesleft);
/* switch page only when it differs from the one last written */
787 if (strcmp(page_chr, cd->write_marc8_page_chr))
789 size_t plen = strlen(page_chr);
791 if (*outbytesleft < plen)
793 cd->my_errno = YAZ_ICONV_E2BIG;
794 return (size_t) (-1);
796 memcpy(*outbuf, page_chr, plen);
798 (*outbytesleft) -= plen;
799 cd->write_marc8_page_chr = page_chr;
801 cd->write_marc8_last = y;
805 size_t r = flush_combos(cd, outbuf, outbytesleft);
809 cd->write_marc8_comb_no--;
811 cd->write_marc8_last = 0;
/* Write handler for native wchar_t output: when at least sizeof(wchar_t)
   bytes remain, the character is copied to the output with memcpy (so the
   output pointer needs no particular alignment) and *outbytesleft is
   reduced accordingly.  Otherwise cd->my_errno is set to
   YAZ_ICONV_E2BIG. */
819 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
820 char **outbuf, size_t *outbytesleft,
823 unsigned char *outp = (unsigned char *) *outbuf;
825 if (*outbytesleft >= sizeof(wchar_t))
828 memcpy(outp, &wch, sizeof(wch));
830 (*outbytesleft) -= sizeof(wch);
834 cd->my_errno = YAZ_ICONV_E2BIG;
837 *outbuf = (char *) outp;
/* Returns non-zero when the descriptor has both a built-in read handler
   and a built-in write handler, i.e. the conversion is carried out by the
   routines in this file. */
842 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
844 return cd->read_handle && cd->write_handle;
/* Creates a conversion descriptor for converting from fromcode to tocode.
   The descriptor is allocated with xmalloc and its state initialised
   (escape mode 'B', empty MARC-8 queues, output page "ESC ( B").  Built-in
   handlers are selected by matching the names with yaz_matchstr: UTF8,
   ISO88591, UCS4, UCS4LE, MARC8 and WCHAR_T for either direction; UTF-8
   input additionally gets the yaz_init_UTF8 initialiser.  If a read or
   write handler is missing, the system iconv_open is tried with the same
   names.
   NOTE(review): the action taken when iconv_open fails, or when handlers
   are still missing at the end, is not visible here -- confirm. */
847 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
849 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
851 cd->write_handle = 0;
854 cd->my_errno = YAZ_ICONV_UNKNOWN;
855 cd->marc8_esc_mode = 'B';
856 cd->comb_offset = cd->comb_size = 0;
857 cd->compose_char = 0;
859 cd->write_marc8_comb_no = 0;
860 cd->write_marc8_last = 0;
861 cd->write_marc8_page_chr = "\033(B";
863 /* a useful hack: if fromcode has a leading @,
864 the library does not use YAZ's own conversions .. */
865 if (fromcode[0] == '@')
869 if (!yaz_matchstr(fromcode, "UTF8"))
871 cd->read_handle = yaz_read_UTF8;
872 cd->init_handle = yaz_init_UTF8;
874 else if (!yaz_matchstr(fromcode, "ISO88591"))
875 cd->read_handle = yaz_read_ISO8859_1;
876 else if (!yaz_matchstr(fromcode, "UCS4"))
877 cd->read_handle = yaz_read_UCS4;
878 else if (!yaz_matchstr(fromcode, "UCS4LE"))
879 cd->read_handle = yaz_read_UCS4LE;
880 else if (!yaz_matchstr(fromcode, "MARC8"))
881 cd->read_handle = yaz_read_marc8;
883 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
884 cd->read_handle = yaz_read_wchar_t;
887 if (!yaz_matchstr(tocode, "UTF8"))
888 cd->write_handle = yaz_write_UTF8;
889 else if (!yaz_matchstr(tocode, "ISO88591"))
890 cd->write_handle = yaz_write_ISO8859_1;
891 else if (!yaz_matchstr (tocode, "UCS4"))
892 cd->write_handle = yaz_write_UCS4;
893 else if (!yaz_matchstr(tocode, "UCS4LE"))
894 cd->write_handle = yaz_write_UCS4LE;
895 else if (!yaz_matchstr(tocode, "MARC8"))
896 cd->write_handle = yaz_write_marc8;
898 else if (!yaz_matchstr(tocode, "WCHAR_T"))
899 cd->write_handle = yaz_write_wchar_t;
/* no complete built-in pair: fall back to the system iconv */
904 if (!cd->read_handle || !cd->write_handle)
906 cd->iconv_cd = iconv_open (tocode, fromcode);
907 if (cd->iconv_cd == (iconv_t) (-1))
914 if (!cd->read_handle || !cd->write_handle)
/* Converts characters from *inbuf to *outbuf, in the manner of iconv().
   With a system iconv descriptor the call is forwarded to iconv() and a
   failure is recorded in cd->my_errno as YAZ_ICONV_E2BIG, YAZ_ICONV_EINVAL,
   YAZ_ICONV_EILSEQ or YAZ_ICONV_UNKNOWN (presumably mapped from errno --
   the selecting code is not visible here; confirm).
   With built-in handlers: a null inbuf or *inbuf is handled separately
   (cd->my_errno is set to YAZ_ICONV_UNKNOWN on that path); the init
   handler is run on the input and the bytes it consumed are skipped; then,
   until the input is exhausted, one character is read with read_handle and
   written with write_handle.  The last argument to write_handle is 1 when
   no input remains after the character just read.  When the write fails
   with YAZ_ICONV_E2BIG the byte count of the character is saved in
   cd->no_read_x and later restored from it; otherwise the input position
   is advanced by the bytes consumed. */
924 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
925 char **outbuf, size_t *outbytesleft)
934 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
935 if (r == (size_t)(-1))
940 cd->my_errno = YAZ_ICONV_E2BIG;
943 cd->my_errno = YAZ_ICONV_EINVAL;
946 cd->my_errno = YAZ_ICONV_EILSEQ;
949 cd->my_errno = YAZ_ICONV_UNKNOWN;
955 if (inbuf == 0 || *inbuf == 0)
958 cd->my_errno = YAZ_ICONV_UNKNOWN;
968 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
969 *inbytesleft, &no_read);
972 if (cd->my_errno == YAZ_ICONV_EINVAL)
977 *inbytesleft -= no_read;
989 if (*inbytesleft == 0)
996 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1007 no_read = cd->no_read_x;
1011 r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1012 (*inbytesleft - no_read) == 0 ? 1 : 0);
1015 /* unable to write it. save it because read_handle cannot
1017 if (cd->my_errno == YAZ_ICONV_E2BIG)
1020 cd->no_read_x = no_read;
1026 *inbytesleft -= no_read;
1027 (*inbuf) += no_read;
/* Returns the error code recorded in the descriptor (one of the
   YAZ_ICONV_* values stored in cd->my_errno by the routines in this
   file). */
1032 int yaz_iconv_error (yaz_iconv_t cd)
1034 return cd->my_errno;
/* Closes a conversion descriptor.  The system iconv descriptor held in
   cd->iconv_cd is released with iconv_close.
   NOTE(review): the condition guarding iconv_close and the release of cd
   itself are not visible here -- confirm. */
1037 int yaz_iconv_close (yaz_iconv_t cd)
1041 iconv_close (cd->iconv_cd);
1050 * indent-tabs-mode: nil
1052 * vim: shiftwidth=4 tabstop=8 expandtab