2 * Copyright (c) 1997-2004, Index Data
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.5 2004-03-16 13:12:43 adam Exp $
8 /* mini iconv and wrapper for system iconv library (if present) */
25 #include <yaz/yaz-util.h>
/* Forward declarations of the MARC-8 conversion routines that
 * yaz_read_marc8 dispatches to, one per character-set table.
 * Only the first line of each prototype appears in this listing, so the
 * trailing parameter is not visible; the calls in yaz_read_marc8 pass
 * the address of a size_t byte counter as the third argument. */
27 unsigned long yaz_marc8_conv (unsigned char *inp, size_t inbytesleft,
29 unsigned long yaz_marc8_2_conv (unsigned char *inp, size_t inbytesleft,
31 unsigned long yaz_marc8_3_conv (unsigned char *inp, size_t inbytesleft,
33 unsigned long yaz_marc8_4_conv (unsigned char *inp, size_t inbytesleft,
35 unsigned long yaz_marc8_5_conv (unsigned char *inp, size_t inbytesleft,
37 unsigned long yaz_marc8_6_conv (unsigned char *inp, size_t inbytesleft,
39 unsigned long yaz_marc8_7_conv (unsigned char *inp, size_t inbytesleft,
41 unsigned long yaz_marc8_8_conv (unsigned char *inp, size_t inbytesleft,
43 unsigned long yaz_marc8_9_conv (unsigned char *inp, size_t inbytesleft,
/* Conversion descriptor behind yaz_iconv_t.
 * Only the three handler members are visible in this listing. Other
 * members used elsewhere in the file (my_errno, marc8_esc_mode,
 * iconv_cd) are presumably declared on lines not shown. */
46 struct yaz_iconv_struct {
/* init_handle: hook that yaz_iconv calls with the input buffer; it
 * reports the number of bytes it consumed through *no_read. The visible
 * code in yaz_iconv_open installs one for UTF-8 input. */
49 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
50 size_t inbytesleft, size_t *no_read);
/* read_handle: decodes one character from inbuf and returns it as an
 * unsigned long; the readers in this file store the number of input
 * bytes consumed in *no_read. */
51 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
52 size_t inbytesleft, size_t *no_read);
/* write_handle: encodes character x at *outbuf; the writers in this
 * file advance *outbuf and decrease *outbytesleft by the bytes written. */
53 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
54 char **outbuf, size_t *outbytesleft);
/* Reader installed for "ISO88591" input: the character value is taken
 * directly from the first input byte.
 * The rest of the body (presumably setting *no_read and returning x)
 * is not shown in this listing. */
61 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
62 size_t inbytesleft, size_t *no_read)
64 unsigned long x = inp[0];
/* Init hook installed for UTF-8 input.
 * The visible test compares inp[1] and inp[2] with 0xbb and 0xbf, the
 * second and third bytes of the UTF-8 byte order mark (EF BB BF), and
 * records YAZ_ICONV_EILSEQ when they do not match.
 * The conditions guarding the YAZ_ICONV_EINVAL assignment and any test
 * of inp[0] are on lines not shown; presumably EINVAL covers input too
 * short to hold the whole mark -- confirm against the full source. */
69 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
70 size_t inbytesleft, size_t *no_read)
79 cd->my_errno = YAZ_ICONV_EINVAL;
82 if (inp[1] != 0xbb || inp[2] != 0xbf)
84 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Reader installed for UTF-8 input: decodes one sequence of up to six
 * bytes (the original UTF-8 definition; RFC 3629 later limited
 * sequences to four bytes).
 * Each branch is selected by the range of the lead byte and requires
 * that enough input bytes are available for that sequence length.
 * NOTE(review): the visible lines mask continuation bytes with 0x3f but
 * show no check that they have the form 10xxxxxx -- confirm whether the
 * hidden lines validate them.
 * The first branch of the chain (presumably the single-byte case) is on
 * a line not shown. */
91 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
92 size_t inbytesleft, size_t *no_read)
/* A lead byte in 0x80..0xbf is a bare continuation byte and 0xfe/0xff
 * never start a sequence: both are flagged as an illegal sequence. */
101 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
104 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Two-byte sequence: 5 payload bits from the lead byte, 6 from the next. */
106 else if (inp[0] <= 0xdf && inbytesleft >= 2)
108 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
/* The EILSEQ assignments inside each branch presumably reject over-long
 * encodings; their guarding conditions are not shown. */
114 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Three-byte sequence (the expression continues on a line not shown). */
117 else if (inp[0] <= 0xef && inbytesleft >= 3)
119 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
126 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Four-byte sequence. */
129 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
131 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
132 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
138 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Five-byte sequence (the expression continues on a line not shown). */
141 else if (inp[0] <= 0xfb && inbytesleft >= 5)
143 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
144 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
151 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Six-byte sequence: a single payload bit from the lead byte. */
154 else if (inp[0] <= 0xfd && inbytesleft >= 6)
156 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
157 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
158 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
164 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Presumably the final else: a valid lead byte whose sequence is not
 * yet fully present in the input (incomplete input). */
170 cd->my_errno = YAZ_ICONV_EINVAL;
/* Reader installed for "UCS4" input: assembles one character from four
 * bytes, most significant byte first.
 * YAZ_ICONV_EINVAL marks incomplete input; the length test guarding it
 * is on a line not shown.
 * NOTE(review): inp[0] is promoted to int before the shift, so a first
 * byte >= 0x80 shifted left by 24 overflows a 32-bit signed int, which
 * is undefined behaviour in C. Casting each byte to unsigned long
 * before shifting would avoid it. */
175 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
176 size_t inbytesleft, size_t *no_read)
182 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
187 x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
/* Reader installed for "UCS4LE" input: assembles one character from
 * four bytes, least significant byte first.
 * YAZ_ICONV_EINVAL marks incomplete input; the length test guarding it
 * is on a line not shown.
 * NOTE(review): inp[3] is promoted to int before the shift, so a last
 * byte >= 0x80 shifted left by 24 overflows a 32-bit signed int, which
 * is undefined behaviour in C. Casting each byte to unsigned long
 * before shifting would avoid it. */
193 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
194 size_t inbytesleft, size_t *no_read)
200 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
205 x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
/* Reader installed for "WCHAR_T" input: consumes sizeof(wchar_t) bytes
 * per character in the platform's native representation.
 * The declaration of wch and the assignment to the returned value are
 * on lines not shown; wch is presumably a wchar_t. */
212 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
213 size_t inbytesleft, size_t *no_read)
/* Fewer bytes than one wchar_t: report incomplete input. */
217 if (inbytesleft < sizeof(wchar_t))
219 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Copy byte-wise instead of dereferencing a cast pointer; presumably
 * because inp need not be suitably aligned for wchar_t. */
225 memcpy (&wch, inp, sizeof(wch));
227 *no_read = sizeof(wch);
/* Reader installed for "MARC8" input.
 * First consumes any leading escape sequences (byte 27), remembering
 * the final byte of each in cd->marc8_esc_mode, then decodes one
 * character with the table routine selected by that mode.
 * *no_read accumulates both the escape-sequence bytes and the bytes the
 * table routine consumed; it is presumably zeroed on a line not shown. */
233 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
234 size_t inbytesleft, size_t *no_read)
/* 27 is the ESC character that introduces a character-set switch. */
237 while(inbytesleft >= 1 && inp[0] == 27)
239 size_t inbytesleft0 = inbytesleft;
/* Skip the intermediate bytes of the escape sequence.
 * NOTE(review): strchr also matches the string terminator, so an input
 * byte of 0 satisfies this test as well -- confirm that is intended. */
242 while(inbytesleft > 0 && strchr("(,$!", *inp))
/* inbytesleft is a size_t, so "<= 0" is equivalent to "== 0": the
 * escape sequence was cut off before its final byte. */
247 if (inbytesleft <= 0)
250 cd->my_errno = YAZ_ICONV_EINVAL;
/* The final byte of the sequence names the character set. */
253 cd->marc8_esc_mode = *inp++;
/* Count the bytes consumed by this escape sequence. */
255 (*no_read) += inbytesleft0 - inbytesleft;
/* Nothing left after the escape sequences (the action taken is on a
 * line not shown). */
257 if (inbytesleft <= 0)
262 size_t no_read_sub = 0;
/* Dispatch on the current character set. */
264 switch(cd->marc8_esc_mode)
266 case 'B': /* Basic ASCII */
267 case 'E': /* ANSEL */
268 case 's': /* ASCII */
269 x = yaz_marc8_conv(inp, inbytesleft, &no_read_sub);
271 case 'g': /* Greek */
272 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub);
274 case 'b': /* Subscripts */
275 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub);
277 case 'p': /* Superscripts */
278 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub);
280 case '2': /* Basic Hebrew */
281 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub);
283 case 'N': /* Basic Cyrillic */
284 case 'Q': /* Extended Cyrillic */
285 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub);
287 case '3': /* Basic Arabic */
288 case '4': /* Extended Arabic */
289 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub);
291 case 'S': /* Greek */
292 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub);
294 case '1': /* Chinese, Japanese, Korean (EACC) */
295 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub);
/* Presumably the default case: unknown character set designator. */
299 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Add the bytes consumed by the table routine. */
302 *no_read += no_read_sub;
/* Writer installed for "UTF8" output: encodes x as a UTF-8 sequence of
 * one to six bytes, chosen by the magnitude of x, and advances *outbuf.
 * Every branch also requires enough room in the output buffer. Because
 * each later branch needs more room than the previous one, a branch
 * that fails only for lack of space cannot be satisfied further down,
 * and control reaches the YAZ_ICONV_E2BIG assignment.
 * Return statements are on lines not shown. */
307 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
308 char **outbuf, size_t *outbytesleft)
310 unsigned char *outp = (unsigned char *) *outbuf;
/* One byte: 7-bit values are stored as-is (the matching decrement of
 * *outbytesleft is presumably on a line not shown). */
311 if (x <= 0x7f && *outbytesleft >= 1)
313 *outp++ = (unsigned char) x;
/* Two bytes: 110xxxxx 10xxxxxx. */
316 else if (x <= 0x7ff && *outbytesleft >= 2)
318 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
319 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
320 (*outbytesleft) -= 2;
/* Three bytes: 1110xxxx followed by two continuation bytes. */
322 else if (x <= 0xffff && *outbytesleft >= 3)
324 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
325 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
326 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
327 (*outbytesleft) -= 3;
/* Four bytes: 11110xxx followed by three continuation bytes. */
329 else if (x <= 0x1fffff && *outbytesleft >= 4)
331 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
332 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
333 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
334 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
335 (*outbytesleft) -= 4;
/* Five bytes: 111110xx followed by four continuation bytes. */
337 else if (x <= 0x3ffffff && *outbytesleft >= 5)
339 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
340 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
341 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
342 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
343 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
344 (*outbytesleft) -= 5;
/* Six bytes: 1111110x followed by five continuation bytes.
 * NOTE(review): this branch has no upper bound on x. If x >= 0x80000000
 * (possible when unsigned long is wider than 31 bits), x >> 30 exceeds
 * one bit and the lead byte is no longer of the form 1111110x. */
346 else if (*outbytesleft >= 6)
348 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
349 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
350 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
351 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
352 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
353 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
354 (*outbytesleft) -= 6;
358 cd->my_errno = YAZ_ICONV_E2BIG; /* not room for output */
/* Publish the advanced output position to the caller. */
361 *outbuf = (char *) outp;
/* Writer installed for "ISO88591" output: stores x as a single byte.
 * Return statements and the decrement of *outbytesleft are on lines not
 * shown. */
365 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
366 char **outbuf, size_t *outbytesleft)
368 unsigned char *outp = (unsigned char *) *outbuf;
/* Only 1..255 is accepted; anything else is an illegal sequence.
 * NOTE(review): x == 0 (NUL) is rejected too -- confirm that is intended. */
369 if (x > 255 || x < 1)
371 cd->my_errno = YAZ_ICONV_EILSEQ;
374 else if (*outbytesleft >= 1)
376 *outp++ = (unsigned char) x;
/* Presumably the final else: no room left in the output buffer. */
381 cd->my_errno = YAZ_ICONV_E2BIG;
/* Publish the advanced output position to the caller. */
384 *outbuf = (char *) outp;
/* Writer installed for "UCS4" output: stores the low 32 bits of x as
 * four bytes, most significant byte first.
 * Return statements are on lines not shown. */
389 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
390 char **outbuf, size_t *outbytesleft)
392 unsigned char *outp = (unsigned char *) *outbuf;
393 if (*outbytesleft >= 4)
395 *outp++ = (unsigned char) (x>>24);
396 *outp++ = (unsigned char) (x>>16);
397 *outp++ = (unsigned char) (x>>8);
398 *outp++ = (unsigned char) x;
399 (*outbytesleft) -= 4;
/* Presumably the else branch: fewer than four bytes of room. */
403 cd->my_errno = YAZ_ICONV_E2BIG;
/* Publish the advanced output position to the caller. */
406 *outbuf = (char *) outp;
/* Writer installed for "UCS4LE" output: stores the low 32 bits of x as
 * four bytes, least significant byte first.
 * Return statements are on lines not shown. */
410 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
411 char **outbuf, size_t *outbytesleft)
413 unsigned char *outp = (unsigned char *) *outbuf;
414 if (*outbytesleft >= 4)
416 *outp++ = (unsigned char) x;
417 *outp++ = (unsigned char) (x>>8);
418 *outp++ = (unsigned char) (x>>16);
419 *outp++ = (unsigned char) (x>>24);
420 (*outbytesleft) -= 4;
/* Presumably the else branch: fewer than four bytes of room. */
424 cd->my_errno = YAZ_ICONV_E2BIG;
/* Publish the advanced output position to the caller. */
427 *outbuf = (char *) outp;
/* Writer installed for "WCHAR_T" output: stores one character in the
 * platform's native wchar_t representation.
 * The declaration of wch, the assignment of x to it, the advance of
 * outp and the return statements are on lines not shown. */
432 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
433 char **outbuf, size_t *outbytesleft)
435 unsigned char *outp = (unsigned char *) *outbuf;
437 if (*outbytesleft >= sizeof(wchar_t))
/* Copy byte-wise; presumably because the output position need not be
 * suitably aligned for wchar_t. */
440 memcpy(outp, &wch, sizeof(wch));
442 (*outbytesleft) -= sizeof(wch);
/* Presumably the else branch: not enough room for one wchar_t. */
446 cd->my_errno = YAZ_ICONV_E2BIG;
449 *outbuf = (char *) outp;
/* Returns non-zero when the descriptor has both a read and a write
 * handler. yaz_iconv_open calls iconv_open when either one is missing,
 * so a non-zero result means the conversion is done by the handlers in
 * this file. */
454 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
456 return cd->read_handle && cd->write_handle;
/* Creates a conversion descriptor for converting from character set
 * fromcode to character set tocode.
 * A reader is chosen from fromcode and a writer from tocode among the
 * conversions implemented in this file; when either one is missing the
 * system iconv_open is tried with the same names.
 * Several initialisations, the failure paths and the return statements
 * are on lines not shown. */
459 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
461 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
463 cd->write_handle = 0;
466 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* 'B' is the mode that yaz_read_marc8 maps to its basic table. */
467 cd->marc8_esc_mode = 'B';
469 /* a useful hack: if fromcode has a leading @,
470 the library will not use YAZ's own conversions .. */
471 if (fromcode[0] == '@')
/* Select the reader. yaz_matchstr returns 0 on a match here. */
475 if (!yaz_matchstr(fromcode, "UTF8"))
477 cd->read_handle = yaz_read_UTF8;
478 cd->init_handle = yaz_init_UTF8;
480 else if (!yaz_matchstr(fromcode, "ISO88591"))
481 cd->read_handle = yaz_read_ISO8859_1;
482 else if (!yaz_matchstr(fromcode, "UCS4"))
483 cd->read_handle = yaz_read_UCS4;
484 else if (!yaz_matchstr(fromcode, "UCS4LE"))
485 cd->read_handle = yaz_read_UCS4LE;
486 else if (!yaz_matchstr(fromcode, "MARC8"))
487 cd->read_handle = yaz_read_marc8;
489 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
490 cd->read_handle = yaz_read_wchar_t;
/* Select the writer. The visible lines offer no MARC8 writer, so MARC8
 * appears to be supported as a source only. */
493 if (!yaz_matchstr(tocode, "UTF8"))
494 cd->write_handle = yaz_write_UTF8;
495 else if (!yaz_matchstr(tocode, "ISO88591"))
496 cd->write_handle = yaz_write_ISO8859_1;
497 else if (!yaz_matchstr (tocode, "UCS4"))
498 cd->write_handle = yaz_write_UCS4;
499 else if (!yaz_matchstr(tocode, "UCS4LE"))
500 cd->write_handle = yaz_write_UCS4LE;
502 else if (!yaz_matchstr(tocode, "WCHAR_T"))
503 cd->write_handle = yaz_write_wchar_t;
/* No complete reader/writer pair: fall back to the system iconv. */
508 if (!cd->read_handle || !cd->write_handle)
510 cd->iconv_cd = iconv_open (tocode, fromcode);
/* iconv_open signals failure with (iconv_t) -1. */
511 if (cd->iconv_cd == (iconv_t) (-1))
/* Second copy of the same test; presumably the alternative used when
 * the system iconv is not compiled in -- confirm against the
 * preprocessor lines that are not shown. */
518 if (!cd->read_handle || !cd->write_handle)
/* Converts input at *inbuf to output at *outbuf, with the same
 * parameter layout as the system iconv.
 * The visible lines show two paths: delegation to the system iconv, and
 * a path that uses the descriptor's init, read and write handlers.
 * The tests selecting between them, the loop construct and the return
 * statements are on lines not shown. */
528 size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
529 char **outbuf, size_t *outbytesleft)
/* System iconv path (the start of this statement, which receives the
 * result in r, is on a line not shown). */
537 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
/* iconv returns (size_t) -1 on error. The assignments below record one
 * of the YAZ error codes; the tests choosing between them (presumably
 * on errno) are not shown. */
538 if (r == (size_t)(-1))
543 cd->my_errno = YAZ_ICONV_E2BIG;
546 cd->my_errno = YAZ_ICONV_EINVAL;
549 cd->my_errno = YAZ_ICONV_EILSEQ;
552 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* A null input buffer: the recorded error state is reset (system iconv
 * uses the same convention to request a reset of conversion state). */
558 if (inbuf == 0 || *inbuf == 0)
561 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* Run the init hook on the input; it reports the bytes it consumed
 * through no_read. */
571 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
572 *inbytesleft, &no_read);
/* EINVAL from the init hook is singled out here; the action taken is on
 * a line not shown. */
575 if (cd->my_errno == YAZ_ICONV_EINVAL)
580 *inbytesleft -= no_read;
/* Presumably the loop exit: all input has been consumed. */
590 if (*inbytesleft == 0)
/* Decode one character (the call continues on a line not shown) ... */
596 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
/* ... encode it ... */
605 r = (cd->write_handle)(cd, x, outbuf, outbytesleft);
/* ... and account for the input bytes the reader consumed. */
609 *inbytesleft -= no_read;
/* Error accessor for a conversion descriptor.
 * The body is not shown in this listing; presumably it returns the
 * my_errno value recorded by the other functions -- confirm. */
615 int yaz_iconv_error (yaz_iconv_t cd)
/* Releases a conversion descriptor.
 * The visible line closes the system iconv descriptor; the condition
 * guarding that call, the release of cd itself and the return value are
 * on lines not shown. */
620 int yaz_iconv_close (yaz_iconv_t cd)
624 iconv_close (cd->iconv_cd);