/*
 * Copyright (c) 1997-2004, Index Data
 * See the file LICENSE for details.
 *
 * $Id: siconv.c,v 1.6 2004-08-07 08:18:19 adam Exp $
 */

/* mini iconv and wrapper for system iconv library (if present) */
25 #include <yaz/yaz-util.h>
/*
 * MARC-8 table converters, one per character set.  They are only declared
 * here; yaz_read_marc8 picks one according to the active escape mode:
 *   1: Basic ASCII / ANSEL     2: Greek ('g')        3: Subscripts
 *   4: Superscripts            5: Basic Hebrew       6: Cyrillic
 *   7: Arabic                  8: Greek ('S')        9: EACC (CJK)
 *
 * Each call decodes one character from inp (inbytesleft bytes available),
 * stores the number of bytes used in *no_read and sets *combining to
 * non-zero when the character is a combining character.
 */
unsigned long yaz_marc8_1_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_2_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_3_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_4_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_5_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_6_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_7_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_8_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_9_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
/*
 * State of one conversion descriptor; the functions in this file reach it
 * through a yaz_iconv_t value named cd.
 * NOTE(review): this listing is abridged.  The code in this file also uses
 * cd->my_errno, cd->marc8_esc_mode, cd->marc8_comb_x and cd->iconv_cd, but
 * their declarations (and the closing brace) are not shown here -- confirm
 * against the complete file.
 */
46 struct yaz_iconv_struct {
/* Start-of-input hook; yaz_iconv_open installs yaz_init_UTF8 here for
   UTF-8 input. */
49 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
50 size_t inbytesleft, size_t *no_read);
/* Decodes one character from inbuf and stores the number of bytes used in
   *no_read. */
51 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
52 size_t inbytesleft, size_t *no_read);
/* Encodes character x at *outbuf, updating *outbuf and *outbytesleft. */
53 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
54 char **outbuf, size_t *outbytesleft);
/* Byte count that yaz_read_marc8 reports when it hands out the character
   saved in marc8_comb_x. */
57 int marc8_comb_no_read;
63 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
64 size_t inbytesleft, size_t *no_read)
66 unsigned long x = inp[0];
71 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
72 size_t inbytesleft, size_t *no_read)
81 cd->my_errno = YAZ_ICONV_EINVAL;
84 if (inp[1] != 0xbb || inp[2] != 0xbf)
86 cd->my_errno = YAZ_ICONV_EILSEQ;
93 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
94 size_t inbytesleft, size_t *no_read)
103 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
106 cd->my_errno = YAZ_ICONV_EILSEQ;
108 else if (inp[0] <= 0xdf && inbytesleft >= 2)
110 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
116 cd->my_errno = YAZ_ICONV_EILSEQ;
119 else if (inp[0] <= 0xef && inbytesleft >= 3)
121 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
128 cd->my_errno = YAZ_ICONV_EILSEQ;
131 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
133 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
134 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
140 cd->my_errno = YAZ_ICONV_EILSEQ;
143 else if (inp[0] <= 0xfb && inbytesleft >= 5)
145 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
146 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
153 cd->my_errno = YAZ_ICONV_EILSEQ;
156 else if (inp[0] <= 0xfd && inbytesleft >= 6)
158 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
159 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
160 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
166 cd->my_errno = YAZ_ICONV_EILSEQ;
172 cd->my_errno = YAZ_ICONV_EINVAL;
177 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
178 size_t inbytesleft, size_t *no_read)
184 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
189 x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
195 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
196 size_t inbytesleft, size_t *no_read)
202 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
207 x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
214 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
215 size_t inbytesleft, size_t *no_read)
219 if (inbytesleft < sizeof(wchar_t))
221 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
227 memcpy (&wch, inp, sizeof(wch));
229 *no_read = sizeof(wch);
235 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
236 size_t inbytesleft, size_t *no_read)
238 if (cd->marc8_comb_x)
240 unsigned long x = cd->marc8_comb_x;
241 *no_read = cd->marc8_comb_no_read;
242 cd->marc8_comb_x = 0;
246 while(inbytesleft >= 1 && inp[0] == 27)
248 size_t inbytesleft0 = inbytesleft;
251 while(inbytesleft > 0 && strchr("(,$!", *inp))
256 if (inbytesleft <= 0)
259 cd->my_errno = YAZ_ICONV_EINVAL;
262 cd->marc8_esc_mode = *inp++;
264 (*no_read) += inbytesleft0 - inbytesleft;
266 if (inbytesleft <= 0)
272 size_t no_read_sub = 0;
274 switch(cd->marc8_esc_mode)
276 case 'B': /* Basic ASCII */
277 case 'E': /* ANSEL */
278 case 's': /* ASCII */
279 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, &comb);
281 case 'g': /* Greek */
282 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, &comb);
284 case 'b': /* Subscripts */
285 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, &comb);
287 case 'p': /* Superscripts */
288 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, &comb);
290 case '2': /* Basic Hebrew */
291 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, &comb);
293 case 'N': /* Basic Cyrillic */
294 case 'Q': /* Extended Cyrillic */
295 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, &comb);
297 case '3': /* Basic Arabic */
298 case '4': /* Extended Arabic */
299 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, &comb);
301 case 'S': /* Greek */
302 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, &comb);
304 case '1': /* Chinese, Japanese, Korean (EACC) */
305 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, &comb);
309 cd->my_errno = YAZ_ICONV_EILSEQ;
313 printf ("esc mode=%c x=%04lX comb=%d\n", cd->marc8_esc_mode, x, comb);
315 *no_read += no_read_sub;
317 if (comb && cd->marc8_comb_x == 0)
320 unsigned long next_x;
322 /* read next char .. */
323 next_x = yaz_read_marc8(cd, inp + *no_read,
324 inbytesleft - *no_read, &tmp_read);
325 /* save this x for later .. */
326 cd->marc8_comb_x = x;
327 /* save next read for later .. */
328 cd->marc8_comb_no_read = tmp_read;
329 /* return next x - thereby swap */
336 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
337 char **outbuf, size_t *outbytesleft)
339 unsigned char *outp = (unsigned char *) *outbuf;
340 if (x <= 0x7f && *outbytesleft >= 1)
342 *outp++ = (unsigned char) x;
345 else if (x <= 0x7ff && *outbytesleft >= 2)
347 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
348 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
349 (*outbytesleft) -= 2;
351 else if (x <= 0xffff && *outbytesleft >= 3)
353 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
354 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
355 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
356 (*outbytesleft) -= 3;
358 else if (x <= 0x1fffff && *outbytesleft >= 4)
360 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
361 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
362 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
363 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
364 (*outbytesleft) -= 4;
366 else if (x <= 0x3ffffff && *outbytesleft >= 5)
368 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
369 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
370 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
371 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
372 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
373 (*outbytesleft) -= 5;
375 else if (*outbytesleft >= 6)
377 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
378 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
379 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
380 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
381 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
382 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
383 (*outbytesleft) -= 6;
387 cd->my_errno = YAZ_ICONV_E2BIG; /* not room for output */
390 *outbuf = (char *) outp;
394 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
395 char **outbuf, size_t *outbytesleft)
397 unsigned char *outp = (unsigned char *) *outbuf;
398 if (x > 255 || x < 1)
400 cd->my_errno = YAZ_ICONV_EILSEQ;
403 else if (*outbytesleft >= 1)
405 *outp++ = (unsigned char) x;
410 cd->my_errno = YAZ_ICONV_E2BIG;
413 *outbuf = (char *) outp;
418 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
419 char **outbuf, size_t *outbytesleft)
421 unsigned char *outp = (unsigned char *) *outbuf;
422 if (*outbytesleft >= 4)
424 *outp++ = (unsigned char) (x>>24);
425 *outp++ = (unsigned char) (x>>16);
426 *outp++ = (unsigned char) (x>>8);
427 *outp++ = (unsigned char) x;
428 (*outbytesleft) -= 4;
432 cd->my_errno = YAZ_ICONV_E2BIG;
435 *outbuf = (char *) outp;
439 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
440 char **outbuf, size_t *outbytesleft)
442 unsigned char *outp = (unsigned char *) *outbuf;
443 if (*outbytesleft >= 4)
445 *outp++ = (unsigned char) x;
446 *outp++ = (unsigned char) (x>>8);
447 *outp++ = (unsigned char) (x>>16);
448 *outp++ = (unsigned char) (x>>24);
449 (*outbytesleft) -= 4;
453 cd->my_errno = YAZ_ICONV_E2BIG;
456 *outbuf = (char *) outp;
461 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
462 char **outbuf, size_t *outbytesleft)
464 unsigned char *outp = (unsigned char *) *outbuf;
466 if (*outbytesleft >= sizeof(wchar_t))
469 memcpy(outp, &wch, sizeof(wch));
471 (*outbytesleft) -= sizeof(wch);
475 cd->my_errno = YAZ_ICONV_E2BIG;
478 *outbuf = (char *) outp;
483 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
485 return cd->read_handle && cd->write_handle;
/*
 * Allocate a conversion descriptor that converts from fromcode to tocode.
 *
 * Built-in readers and writers are chosen by comparing the names with
 * yaz_matchstr; when either side has no built-in handler, iconv_open() is
 * tried for the pair.
 * NOTE(review): this listing is abridged (braces, the other initialisers,
 * preprocessor lines and all return statements are not shown), so the
 * exact failure behaviour cannot be read from here -- confirm against the
 * complete file.
 */
488 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
490 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
492 cd->write_handle = 0;
495 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* MARC-8 input starts in mode 'B', which yaz_read_marc8 treats as Basic
   ASCII; no combining character is pending. */
496 cd->marc8_esc_mode = 'B';
497 cd->marc8_comb_x = 0;
499 /* a useful hack: if fromcode has leading @,
500 the library not use YAZ's own conversions .. */
501 if (fromcode[0] == '@')
/* Select the built-in reader for the source encoding.  UTF-8 is the only
   encoding that also gets an init handler. */
505 if (!yaz_matchstr(fromcode, "UTF8"))
507 cd->read_handle = yaz_read_UTF8;
508 cd->init_handle = yaz_init_UTF8;
510 else if (!yaz_matchstr(fromcode, "ISO88591"))
511 cd->read_handle = yaz_read_ISO8859_1;
512 else if (!yaz_matchstr(fromcode, "UCS4"))
513 cd->read_handle = yaz_read_UCS4;
514 else if (!yaz_matchstr(fromcode, "UCS4LE"))
515 cd->read_handle = yaz_read_UCS4LE;
516 else if (!yaz_matchstr(fromcode, "MARC8"))
517 cd->read_handle = yaz_read_marc8;
519 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
520 cd->read_handle = yaz_read_wchar_t;
/* Select the built-in writer for the target encoding.  There is no MARC-8
   writer in this list. */
523 if (!yaz_matchstr(tocode, "UTF8"))
524 cd->write_handle = yaz_write_UTF8;
525 else if (!yaz_matchstr(tocode, "ISO88591"))
526 cd->write_handle = yaz_write_ISO8859_1;
527 else if (!yaz_matchstr (tocode, "UCS4"))
528 cd->write_handle = yaz_write_UCS4;
529 else if (!yaz_matchstr(tocode, "UCS4LE"))
530 cd->write_handle = yaz_write_UCS4LE;
532 else if (!yaz_matchstr(tocode, "WCHAR_T"))
533 cd->write_handle = yaz_write_wchar_t;
/* Not fully covered by built-in handlers: ask the system iconv library;
   (iconv_t) -1 is its failure value. */
538 if (!cd->read_handle || !cd->write_handle)
540 cd->iconv_cd = iconv_open (tocode, fromcode);
541 if (cd->iconv_cd == (iconv_t) (-1))
/* NOTE(review): second test of the same condition; its body is not shown
   in this listing. */
548 if (!cd->read_handle || !cd->write_handle)
/*
 * Convert input at *inbuf to output at *outbuf.  The arguments are passed
 * unchanged to iconv() on the system-iconv path, so they follow the same
 * in/out convention: buffer pointers and remaining-byte counts are updated
 * as data is consumed and produced.
 * NOTE(review): this listing is abridged (braces, declarations, case
 * labels and return statements are not shown); the comments below describe
 * only what the visible lines establish.
 */
558 size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
559 char **outbuf, size_t *outbytesleft)
/* System iconv path: delegate, then translate a failure into one of the
   YAZ_ICONV_* codes stored in cd->my_errno. */
567 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
568 if (r == (size_t)(-1))
573 cd->my_errno = YAZ_ICONV_E2BIG;
576 cd->my_errno = YAZ_ICONV_EINVAL;
579 cd->my_errno = YAZ_ICONV_EILSEQ;
582 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* Built-in path.  A null inbuf (or null *inbuf) clears the error code;
   presumably this is the iconv-style "reset" call -- confirm. */
588 if (inbuf == 0 || *inbuf == 0)
591 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* Run the start-of-input hook (for UTF-8 this is yaz_init_UTF8) and drop
   the bytes it consumed; YAZ_ICONV_EINVAL from the hook is singled out. */
601 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
602 *inbytesleft, &no_read);
605 if (cd->my_errno == YAZ_ICONV_EINVAL)
610 *inbytesleft -= no_read;
/* Main conversion: while input remains, decode one character with the
   reader, encode it with the writer, then account for the input bytes
   consumed. */
620 if (*inbytesleft == 0)
626 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
635 r = (cd->write_handle)(cd, x, outbuf, outbytesleft);
639 *inbytesleft -= no_read;
/*
 * Report the error state of conversion descriptor cd.
 * NOTE(review): the body is not shown in this listing; presumably it
 * returns cd->my_errno, the field where the converters in this file store
 * their YAZ_ICONV_* error codes -- confirm against the complete file.
 */
645 int yaz_iconv_error (yaz_iconv_t cd)
/*
 * Close conversion descriptor cd.
 * NOTE(review): only part of the body is shown in this listing.  The
 * visible line closes the system iconv handle; cd itself is allocated with
 * xmalloc in yaz_iconv_open, so a matching release is expected in the
 * lines that are not shown -- confirm against the complete file.
 */
650 int yaz_iconv_close (yaz_iconv_t cd)
/* Release the handle obtained from iconv_open(). */
654 iconv_close (cd->iconv_cd);