2 * Copyright (C) 1995-2005, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.9 2005-02-01 21:06:37 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
32 #include <yaz/yaz-util.h>
/* Prototypes for the nine MARC-8 table converters, yaz_marc8_1_conv
 * through yaz_marc8_9_conv.  They are only declared here (no bodies).
 * All share one signature: an input pointer, the number of input bytes
 * available, and two out-parameters (no_read, combining); each returns an
 * unsigned long.  yaz_read_marc8 picks one of them according to the
 * current MARC-8 escape mode.
 * NOTE(review): presumably *no_read receives the number of bytes consumed
 * and *combining is set for combining characters - confirm against the
 * code that defines these functions. */
34 unsigned long yaz_marc8_1_conv (unsigned char *inp, size_t inbytesleft,
35 size_t *no_read, int *combining);
36 unsigned long yaz_marc8_2_conv (unsigned char *inp, size_t inbytesleft,
37 size_t *no_read, int *combining);
38 unsigned long yaz_marc8_3_conv (unsigned char *inp, size_t inbytesleft,
39 size_t *no_read, int *combining);
40 unsigned long yaz_marc8_4_conv (unsigned char *inp, size_t inbytesleft,
41 size_t *no_read, int *combining);
42 unsigned long yaz_marc8_5_conv (unsigned char *inp, size_t inbytesleft,
43 size_t *no_read, int *combining);
44 unsigned long yaz_marc8_6_conv (unsigned char *inp, size_t inbytesleft,
45 size_t *no_read, int *combining);
46 unsigned long yaz_marc8_7_conv (unsigned char *inp, size_t inbytesleft,
47 size_t *no_read, int *combining);
48 unsigned long yaz_marc8_8_conv (unsigned char *inp, size_t inbytesleft,
49 size_t *no_read, int *combining);
50 unsigned long yaz_marc8_9_conv (unsigned char *inp, size_t inbytesleft,
51 size_t *no_read, int *combining);
/* State behind a yaz_iconv_t conversion descriptor.
 * NOTE(review): other members used throughout this file (my_errno,
 * marc8_esc_mode, marc8_comb_x, no_read_x, iconv_cd) are not among the
 * declarations reproduced here - confirm their types in the full
 * definition. */
53 struct yaz_iconv_struct {
/* init_handle: optional start-of-input hook.  yaz_iconv_open sets it to
 * yaz_init_UTF8 for UTF-8 input and yaz_iconv invokes it. */
56 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
57 size_t inbytesleft, size_t *no_read);
/* read_handle: input decoder.  Returns a decoded value as unsigned long
 * and stores a byte count through no_read. */
58 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
59 size_t inbytesleft, size_t *no_read);
/* write_handle: output encoder.  Encodes x at *outbuf; the writers in this
 * file advance *outbuf and decrease *outbytesleft on success. */
60 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
61 char **outbuf, size_t *outbytesleft);
/* Byte count stored together with marc8_comb_x by yaz_read_marc8 and
 * reported through *no_read when the saved value is handed out. */
64 int marc8_comb_no_read;
/* read_handle for ISO-8859-1 input: the character value is taken directly
 * from the first input byte.
 * NOTE(review): presumably *no_read is set to 1 and x is returned on the
 * following lines - confirm. */
72 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
73 size_t inbytesleft, size_t *no_read)
75 unsigned long x = inp[0];
/* init_handle for UTF-8 input.  The check below compares inp[1] and inp[2]
 * with 0xbb and 0xbf, the second and third bytes of the UTF-8 byte order
 * mark (EF BB BF), and records YAZ_ICONV_EILSEQ when they differ.
 * NOTE(review): presumably this is reached only when inp[0] is 0xef, the
 * EINVAL path covers input shorter than 3 bytes, and a valid mark is
 * skipped through *no_read - confirm. */
80 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
81 size_t inbytesleft, size_t *no_read)
90 cd->my_errno = YAZ_ICONV_EINVAL;
93 if (inp[1] != 0xbb || inp[2] != 0xbf)
95 cd->my_errno = YAZ_ICONV_EILSEQ;
/* read_handle for UTF-8 input.  Dispatches on the lead byte inp[0] and
 * assembles the value from the payload bits of the lead byte plus six bits
 * (mask 0x3f) from each following byte.  Sequences of 2 to 6 bytes are
 * handled, i.e. the original (pre-RFC 3629) UTF-8 range.
 * Errors are reported through cd->my_errno (EILSEQ or EINVAL).
 * NOTE(review): the expressions below only mask the following bytes with
 * 0x3f; no check that they have the form 10xxxxxx appears here - confirm
 * whether malformed continuation bytes are rejected anywhere. */
102 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
103 size_t inbytesleft, size_t *no_read)
/* A lead byte up to 0xbf reaching this branch, or 0xfe/0xff, cannot start
 * a sequence. */
112 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
115 cd->my_errno = YAZ_ICONV_EILSEQ;
/* 2-byte form: 5 payload bits in the lead byte. */
117 else if (inp[0] <= 0xdf && inbytesleft >= 2)
119 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
/* NOTE(review): this EILSEQ (and the ones in the longer forms below)
 * presumably rejects overlong encodings - confirm. */
125 cd->my_errno = YAZ_ICONV_EILSEQ;
/* 3-byte form: 4 payload bits in the lead byte. */
128 else if (inp[0] <= 0xef && inbytesleft >= 3)
130 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
137 cd->my_errno = YAZ_ICONV_EILSEQ;
/* 4-byte form: 3 payload bits in the lead byte. */
140 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
142 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
143 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
149 cd->my_errno = YAZ_ICONV_EILSEQ;
/* 5-byte form: 2 payload bits in the lead byte. */
152 else if (inp[0] <= 0xfb && inbytesleft >= 5)
154 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
155 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
162 cd->my_errno = YAZ_ICONV_EILSEQ;
/* 6-byte form: 1 payload bit in the lead byte. */
165 else if (inp[0] <= 0xfd && inbytesleft >= 6)
167 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
168 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
169 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
175 cd->my_errno = YAZ_ICONV_EILSEQ;
/* NOTE(review): presumably the final fall-through, taken when the lead
 * byte is valid but inbytesleft is too small for its form - confirm. */
181 cd->my_errno = YAZ_ICONV_EINVAL;
/* read_handle for big-endian UCS-4 input: combines four input bytes, most
 * significant first.  YAZ_ICONV_EINVAL marks incomplete input.
 * NOTE(review): inp[0] is an unsigned char and is promoted to int before
 * the shift, so inp[0]<<24 shifts into the sign bit when inp[0] >= 0x80;
 * that is undefined behaviour where int is 32 bits.  Casting each byte to
 * unsigned long before shifting would avoid it. */
186 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
187 size_t inbytesleft, size_t *no_read)
193 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
198 x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
/* read_handle for little-endian UCS-4 input: combines four input bytes,
 * least significant first.  YAZ_ICONV_EINVAL marks incomplete input.
 * NOTE(review): inp[3] is promoted to int before the shift, so inp[3]<<24
 * shifts into the sign bit when inp[3] >= 0x80 (undefined behaviour where
 * int is 32 bits).  Casting each byte to unsigned long first would avoid
 * it. */
204 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
205 size_t inbytesleft, size_t *no_read)
211 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
216 x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
/* read_handle for input stored as native wchar_t objects.  Fewer than
 * sizeof(wchar_t) bytes of input is reported as YAZ_ICONV_EINVAL.
 * Otherwise the bytes are copied with memcpy into the local wch (so the
 * input pointer need not be aligned) and sizeof(wch) bytes are reported as
 * consumed.
 * NOTE(review): wch is presumably declared as wchar_t - confirm. */
223 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
224 size_t inbytesleft, size_t *no_read)
228 if (inbytesleft < sizeof(wchar_t))
230 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
236 memcpy (&wch, inp, sizeof(wch));
238 *no_read = sizeof(wch);
/* read_handle for MARC-8 input.
 * Three stages are visible:
 *  1. If a value was parked in cd->marc8_comb_x by an earlier call, hand
 *     it out with its saved byte count and clear the slot.
 *  2. Consume any escape sequences (byte 27) and remember the final byte
 *     of each one in cd->marc8_esc_mode.
 *  3. Decode with the table converter selected by marc8_esc_mode.  When
 *     the converter flags a combining character, read the following
 *     character recursively, park the combining one, and return the
 *     following one first (the two are swapped).
 * Errors are reported through cd->my_errno. */
244 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
245 size_t inbytesleft, size_t *no_read)
/* Stage 1: a value is pending from a previous combining-character swap. */
247 if (cd->marc8_comb_x)
249 unsigned long x = cd->marc8_comb_x;
250 *no_read = cd->marc8_comb_no_read;
251 cd->marc8_comb_x = 0;
/* Stage 2: escape sequences start with ESC (27). */
255 while(inbytesleft >= 1 && inp[0] == 27)
257 size_t inbytesleft0 = inbytesleft;
/* Skip the intermediate bytes of the escape sequence.
 * NOTE(review): strchr also matches the terminating NUL of its string
 * argument, so a 0 byte in the input is treated as an intermediate byte
 * here - confirm that this is intended. */
260 while(inbytesleft > 0 && strchr("(,$!", *inp))
/* inbytesleft is a size_t, so "<= 0" can only mean "== 0": the input ended
 * inside the escape sequence. */
265 if (inbytesleft <= 0)
268 cd->my_errno = YAZ_ICONV_EINVAL;
/* The final byte of the sequence names the character set. */
271 cd->marc8_esc_mode = *inp++;
/* Account for the bytes consumed by this escape sequence. */
273 (*no_read) += inbytesleft0 - inbytesleft;
275 if (inbytesleft <= 0)
281 size_t no_read_sub = 0;
/* Stage 3: pick the converter for the current character set. */
283 switch(cd->marc8_esc_mode)
285 case 'B': /* Basic ASCII */
286 case 'E': /* ANSEL */
287 case 's': /* ASCII */
288 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, &comb);
290 case 'g': /* Greek */
291 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, &comb);
293 case 'b': /* Subscripts */
294 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, &comb);
296 case 'p': /* Superscripts */
297 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, &comb);
299 case '2': /* Basic Hebrew */
300 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, &comb);
302 case 'N': /* Basic Cyrillic */
303 case 'Q': /* Extended Cyrillic */
304 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, &comb);
306 case '3': /* Basic Arabic */
307 case '4': /* Extended Arabic */
308 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, &comb);
310 case 'S': /* Greek */
311 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, &comb);
313 case '1': /* Chinese, Japanese, Korean (EACC) */
314 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, &comb);
/* NOTE(review): presumably the default case, for an escape mode with no
 * converter - confirm. */
318 cd->my_errno = YAZ_ICONV_EILSEQ;
/* NOTE(review): debug trace written to stdout; confirm that it is excluded
 * from normal builds by a preprocessor guard. */
322 printf ("esc mode=%c x=%04lX comb=%d\n", cd->marc8_esc_mode, x, comb);
324 *no_read += no_read_sub;
/* Combining character and nothing parked yet: swap it with the character
 * that follows. */
326 if (comb && cd->marc8_comb_x == 0)
329 unsigned long next_x;
331 /* read next char .. */
332 next_x = yaz_read_marc8(cd, inp + *no_read,
333 inbytesleft - *no_read, &tmp_read);
334 /* save this x for later .. */
335 cd->marc8_comb_x = x;
336 /* save next read for later .. */
337 cd->marc8_comb_no_read = tmp_read;
338 /* return next x - thereby swap */
/* write_handle for UTF-8 output.  Chooses the shortest form that can hold
 * x (1 to 6 bytes, i.e. the original pre-RFC 3629 range), provided
 * *outbytesleft has room for it.  The bytes are written through a local
 * cursor; *outbytesleft is reduced and *outbuf is moved to the cursor
 * position.  When no branch fits, cd->my_errno is set to
 * YAZ_ICONV_E2BIG. */
345 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
346 char **outbuf, size_t *outbytesleft)
348 unsigned char *outp = (unsigned char *) *outbuf;
/* 1 byte: 7-bit values are written unchanged. */
349 if (x <= 0x7f && *outbytesleft >= 1)
351 *outp++ = (unsigned char) x;
/* 2 bytes: lead byte 110xxxxx followed by one continuation byte. */
354 else if (x <= 0x7ff && *outbytesleft >= 2)
356 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
357 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
358 (*outbytesleft) -= 2;
/* 3 bytes: lead byte 1110xxxx. */
360 else if (x <= 0xffff && *outbytesleft >= 3)
362 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
363 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
364 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
365 (*outbytesleft) -= 3;
/* 4 bytes: lead byte 11110xxx. */
367 else if (x <= 0x1fffff && *outbytesleft >= 4)
369 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
370 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
371 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
372 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
373 (*outbytesleft) -= 4;
/* 5 bytes: lead byte 111110xx. */
375 else if (x <= 0x3ffffff && *outbytesleft >= 5)
377 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
378 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
379 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
380 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
381 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
382 (*outbytesleft) -= 5;
/* 6 bytes: lead byte 1111110x.  There is no upper bound test on x here;
 * this branch only requires room for six bytes. */
384 else if (*outbytesleft >= 6)
386 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
387 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
388 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
389 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
390 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
391 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
392 (*outbytesleft) -= 6;
396 cd->my_errno = YAZ_ICONV_E2BIG; /* not room for output */
/* Publish the advanced cursor to the caller. */
399 *outbuf = (char *) outp;
/* write_handle for ISO-8859-1 output.  Values outside 1..255 cannot be
 * written and set YAZ_ICONV_EILSEQ; note that this also rejects 0.
 * Otherwise one byte is written if there is room, else YAZ_ICONV_E2BIG is
 * set.  *outbuf is updated from the local cursor at the end. */
403 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
404 char **outbuf, size_t *outbytesleft)
406 unsigned char *outp = (unsigned char *) *outbuf;
407 if (x > 255 || x < 1)
409 cd->my_errno = YAZ_ICONV_EILSEQ;
412 else if (*outbytesleft >= 1)
414 *outp++ = (unsigned char) x;
419 cd->my_errno = YAZ_ICONV_E2BIG;
422 *outbuf = (char *) outp;
/* write_handle for big-endian UCS-4 output: stores x as four bytes, most
 * significant first, when at least four bytes of output space remain;
 * otherwise sets YAZ_ICONV_E2BIG.  Each cast to unsigned char keeps only
 * the low eight bits of the shifted value.  *outbuf is updated from the
 * local cursor at the end. */
427 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
428 char **outbuf, size_t *outbytesleft)
430 unsigned char *outp = (unsigned char *) *outbuf;
431 if (*outbytesleft >= 4)
433 *outp++ = (unsigned char) (x>>24);
434 *outp++ = (unsigned char) (x>>16);
435 *outp++ = (unsigned char) (x>>8);
436 *outp++ = (unsigned char) x;
437 (*outbytesleft) -= 4;
441 cd->my_errno = YAZ_ICONV_E2BIG;
444 *outbuf = (char *) outp;
/* write_handle for little-endian UCS-4 output: stores x as four bytes,
 * least significant first, when at least four bytes of output space
 * remain; otherwise sets YAZ_ICONV_E2BIG.  *outbuf is updated from the
 * local cursor at the end. */
448 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
449 char **outbuf, size_t *outbytesleft)
451 unsigned char *outp = (unsigned char *) *outbuf;
452 if (*outbytesleft >= 4)
454 *outp++ = (unsigned char) x;
455 *outp++ = (unsigned char) (x>>8);
456 *outp++ = (unsigned char) (x>>16);
457 *outp++ = (unsigned char) (x>>24);
458 (*outbytesleft) -= 4;
462 cd->my_errno = YAZ_ICONV_E2BIG;
465 *outbuf = (char *) outp;
/* write_handle for output stored as native wchar_t objects.  When at least
 * sizeof(wchar_t) bytes of output space remain, the local wch is copied to
 * the output with memcpy (so the output pointer need not be aligned) and
 * *outbytesleft is reduced by sizeof(wch); otherwise YAZ_ICONV_E2BIG is
 * set.
 * NOTE(review): wch is presumably a wchar_t initialised from x, and the
 * cursor is presumably advanced by sizeof(wch) - confirm. */
470 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
471 char **outbuf, size_t *outbytesleft)
473 unsigned char *outp = (unsigned char *) *outbuf;
475 if (*outbytesleft >= sizeof(wchar_t))
478 memcpy(outp, &wch, sizeof(wch));
480 (*outbytesleft) -= sizeof(wch);
484 cd->my_errno = YAZ_ICONV_E2BIG;
487 *outbuf = (char *) outp;
/* Returns non-zero when both a read handler and a write handler are set on
 * cd, i.e. when the conversion is carried out by the handlers in this file.
 * yaz_iconv_open falls back to iconv_open when either one is missing. */
492 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
494 return cd->read_handle && cd->write_handle;
/* Creates a conversion descriptor that converts from encoding fromcode to
 * encoding tocode.
 * The descriptor is allocated with xmalloc and its state is reset: error
 * code YAZ_ICONV_UNKNOWN, MARC-8 escape mode 'B', no parked combining
 * character.  The encoding names are then matched with yaz_matchstr
 * against the handlers in this file.  If either side has no handler,
 * iconv_open is tried instead.
 * In the lists below MARC8 is accepted only as a source encoding.
 * NOTE(review): fromcode is dereferenced (fromcode[0]) without a NULL
 * check, so callers must pass valid strings.
 * NOTE(review): the failure paths are presumably where the descriptor is
 * released and 0 is returned - confirm. */
497 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
499 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
501 cd->write_handle = 0;
504 cd->my_errno = YAZ_ICONV_UNKNOWN;
505 cd->marc8_esc_mode = 'B';
506 cd->marc8_comb_x = 0;
508 /* a useful hack: if fromcode has leading @,
509 the library does not use YAZ's own conversions .. */
510 if (fromcode[0] == '@')
/* Source encoding: select the reader.  UTF-8 additionally gets an init
 * hook. */
514 if (!yaz_matchstr(fromcode, "UTF8"))
516 cd->read_handle = yaz_read_UTF8;
517 cd->init_handle = yaz_init_UTF8;
519 else if (!yaz_matchstr(fromcode, "ISO88591"))
520 cd->read_handle = yaz_read_ISO8859_1;
521 else if (!yaz_matchstr(fromcode, "UCS4"))
522 cd->read_handle = yaz_read_UCS4;
523 else if (!yaz_matchstr(fromcode, "UCS4LE"))
524 cd->read_handle = yaz_read_UCS4LE;
525 else if (!yaz_matchstr(fromcode, "MARC8"))
526 cd->read_handle = yaz_read_marc8;
528 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
529 cd->read_handle = yaz_read_wchar_t;
/* Target encoding: select the writer. */
532 if (!yaz_matchstr(tocode, "UTF8"))
533 cd->write_handle = yaz_write_UTF8;
534 else if (!yaz_matchstr(tocode, "ISO88591"))
535 cd->write_handle = yaz_write_ISO8859_1;
536 else if (!yaz_matchstr (tocode, "UCS4"))
537 cd->write_handle = yaz_write_UCS4;
538 else if (!yaz_matchstr(tocode, "UCS4LE"))
539 cd->write_handle = yaz_write_UCS4LE;
541 else if (!yaz_matchstr(tocode, "WCHAR_T"))
542 cd->write_handle = yaz_write_wchar_t;
/* No complete built-in reader/writer pair: try the system iconv. */
547 if (!cd->read_handle || !cd->write_handle)
549 cd->iconv_cd = iconv_open (tocode, fromcode);
/* iconv_open signals failure with (iconv_t) -1. */
550 if (cd->iconv_cd == (iconv_t) (-1))
/* NOTE(review): second copy of the same test; presumably the variant used
 * when the system iconv is not compiled in - confirm. */
557 if (!cd->read_handle || !cd->write_handle)
/* Converts input at *inbuf to output at *outbuf, using an interface
 * modelled on iconv().
 * Two paths are visible.  One hands the whole call to the system iconv()
 * and translates a failure result, (size_t) -1, into one of the
 * YAZ_ICONV_* codes in cd->my_errno.  The other uses the built-in
 * handlers: it calls init_handle, then repeatedly reads one value with
 * read_handle and writes it with write_handle, reducing *inbytesleft by
 * the bytes consumed.
 * If a value cannot be written, its byte count is kept in cd->no_read_x;
 * a later call takes the count from there rather than reading again.
 * NOTE(review): the conditions selecting between the paths, the updates of
 * *inbuf and the return values are not part of the text reproduced here -
 * confirm them in the full function. */
567 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
568 char **outbuf, size_t *outbytesleft)
/* System iconv path. */
576 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
577 if (r == (size_t)(-1))
582 cd->my_errno = YAZ_ICONV_E2BIG;
585 cd->my_errno = YAZ_ICONV_EINVAL;
588 cd->my_errno = YAZ_ICONV_EILSEQ;
591 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* A null inbuf or *inbuf: the error state is set back to UNKNOWN.
 * NOTE(review): presumably the iconv-style "reset" request - confirm. */
597 if (inbuf == 0 || *inbuf == 0)
600 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* Built-in path: give the init hook a look at the input. */
610 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
611 *inbytesleft, &no_read);
614 if (cd->my_errno == YAZ_ICONV_EINVAL)
619 *inbytesleft -= no_read;
/* Conversion loop: finished once the input is used up. */
631 if (*inbytesleft == 0)
638 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
/* Reuse the byte count saved when an earlier write failed. */
649 no_read = cd->no_read_x;
653 r = (cd->write_handle)(cd, x, outbuf, outbytesleft);
656 /* unable to write it. save it because read_handle cannot
659 cd->no_read_x = no_read;
664 *inbytesleft -= no_read;
/* Error accessor for a conversion descriptor.
 * NOTE(review): presumably returns cd->my_errno, the YAZ_ICONV_* code that
 * the handlers in this file set - confirm. */
670 int yaz_iconv_error (yaz_iconv_t cd)
/* Closes a conversion descriptor.  The statement below releases the system
 * iconv handle with iconv_close.
 * NOTE(review): presumably this is done only when a system handle was
 * opened, and the descriptor allocated by yaz_iconv_open is freed
 * afterwards - confirm. */
675 int yaz_iconv_close (yaz_iconv_t cd)
679 iconv_close (cd->iconv_cd);