From 8ceaeefe2e491935cba91f56007308be6e4996e6 Mon Sep 17 00:00:00 2001
From: Adam Dickmeiss
Date: Fri, 14 Nov 2014 15:03:08 +0100
Subject: [PATCH] marc8 + ISO2709 checks for leader 9 YAZ-800

When the charset given for an ISO2709 record is MARC-8 but position 9
of the leader is 'a', convert the record from UTF-8 instead.

---
 include/yaz/marcdisp.h  |   14 ++++++++++++++
 src/marcdisp.c          |   10 ++++++++++
 src/opac_to_xml.c       |   16 ++++++++++++++++
 src/record_conv.c       |   27 +++++++++++++++++++--------
 src/record_render.c     |   28 +++++++++++++++++++++++-----
 test/test_record_conv.c |    2 +-
 util/marcdump.c         |   15 +++++++++++++++
 7 files changed, 98 insertions(+), 14 deletions(-)

diff --git a/include/yaz/marcdisp.h b/include/yaz/marcdisp.h
index d7bbaa0..fb0bab8 100644
--- a/include/yaz/marcdisp.h
+++ b/include/yaz/marcdisp.h
@@ -484,6 +484,20 @@ struct json_node;
 YAZ_EXPORT
 int yaz_marc_read_json_node(yaz_marc_t mt, struct json_node *n);
 
+/** \brief check if MARC21 record given as MARC-8 is really UTF-8 encoded
+    \param charset charset that is given by user (may be NULL)
+    \param marc_buf ISO2709 buf
+    \param sz ISO2709 size
+    \retval 1 charset is MARC8 and leader position 9 is 'a'; probably UTF-8
+    \retval 0 otherwise; keep the charset given by user
+*/
+YAZ_EXPORT
+int yaz_marc_check_marc21_coding(const char *charset,
+                                 const char *marc_buf, int sz);
+
+YAZ_EXPORT
+int yaz_opac_check_marc21_coding(const char *charset, Z_OPACRecord *r);
+
 YAZ_END_CDECL
 
 #endif
diff --git a/src/marcdisp.c b/src/marcdisp.c
index 3271889..85acb07 100644
--- a/src/marcdisp.c
+++ b/src/marcdisp.c
@@ -1465,6 +1465,16 @@ void yaz_marc_write_using_libxml2(yaz_marc_t mt, int enable)
     mt->write_using_libxml2 = enable;
 }
 
+int yaz_marc_check_marc21_coding(const char *charset,
+                                 const char *marc_buf, int sz)
+{
+    if (charset && (!yaz_matchstr(charset, "MARC8?") ||
+                    !yaz_matchstr(charset, "MARC8"))
+        && marc_buf && sz > 25 && marc_buf[9] == 'a')
+        return 1;
+    return 0;
+}
+
 /*
  * Local variables:
  * c-basic-offset: 4
diff --git a/src/opac_to_xml.c b/src/opac_to_xml.c
index 1f0c9a3..f365082 100644
--- a/src/opac_to_xml.c
+++ b/src/opac_to_xml.c
@@ -191,6 +191,22 @@ void yaz_opac_decode_wrbuf(yaz_marc_t mt, Z_OPACRecord *r, WRBUF wrbuf)
     yaz_opac_decode_wrbuf2(mt, r, wrbuf, 0);
 }
 
+int yaz_opac_check_marc21_coding(const char *charset, Z_OPACRecord *r)
+{
+    if (r->bibliographicRecord)
+    {
+        Z_External *ext = r->bibliographicRecord;
+        if (ext->which == Z_External_octet)
+        {
+            return yaz_marc_check_marc21_coding(
+                charset,
+                (const char *) ext->u.octet_aligned->buf,
+                ext->u.octet_aligned->len);
+        }
+    }
+    return 0;
+}
+
 /*
  * Local variables:
  * c-basic-offset: 4
diff --git a/src/record_conv.c b/src/record_conv.c
index e67ef00..9ab5b71 100644
--- a/src/record_conv.c
+++ b/src/record_conv.c
@@ -519,23 +519,25 @@ static void *construct_marc(const xmlNode *ptr,
 static int convert_marc(void *info, WRBUF record, WRBUF wr_error)
 {
     struct marc_info *mi = info;
+    const char *input_charset = mi->input_charset;
     int ret = 0;
-
-    yaz_iconv_t cd = yaz_iconv_open(mi->output_charset, mi->input_charset);
     yaz_marc_t mt = yaz_marc_create();
 
     yaz_marc_xml(mt, mi->output_format_mode);
     if (mi->leader_spec)
         yaz_marc_leader_spec(mt, mi->leader_spec);
 
-    if (cd)
-        yaz_marc_iconv(mt, cd);
     if (mi->input_format_mode == YAZ_MARC_ISO2709)
     {
         int sz = yaz_marc_read_iso2709(mt, wrbuf_buf(record),
                                        wrbuf_len(record));
         if (sz > 0)
+        {
+            if (yaz_marc_check_marc21_coding(input_charset, wrbuf_buf(record),
+                                             wrbuf_len(record)))
+                input_charset = "utf-8";
             ret = 0;
+        }
         else
             ret = -1;
     }
@@ -564,13 +566,18 @@ static int convert_marc(void *info, WRBUF record, WRBUF wr_error)
     }
     if (ret == 0)
     {
+        yaz_iconv_t cd = yaz_iconv_open(mi->output_charset, input_charset);
+
+        if (cd)
+            yaz_marc_iconv(mt, cd);
+
         wrbuf_rewind(record);
         ret = yaz_marc_write_mode(mt, record);
         if (ret)
             wrbuf_printf(wr_error, "yaz_marc_write_mode failed");
+        if (cd)
+            yaz_iconv_close(cd);
     }
-    if (cd)
-        yaz_iconv_close(cd);
     yaz_marc_destroy(mt);
     return ret;
 }
@@ -680,11 +687,15 @@ int yaz_record_conv_opac_record(yaz_record_conv_t p,
     else
     {
         struct marc_info *mi = r->info;
+        const char *input_charset = mi->input_charset;
+        yaz_iconv_t cd;
 
         WRBUF res = wrbuf_alloc();
         yaz_marc_t mt = yaz_marc_create();
-        yaz_iconv_t cd = yaz_iconv_open(mi->output_charset,
-                                        mi->input_charset);
+
+        if (yaz_opac_check_marc21_coding(input_charset, input_record))
+            input_charset = "utf-8";
+        cd = yaz_iconv_open(mi->output_charset, input_charset);
 
         wrbuf_rewind(p->wr_error);
         yaz_marc_xml(mt, mi->output_format_mode);
diff --git a/src/record_render.c b/src/record_render.c
index 1291858..82fa698 100644
--- a/src/record_render.c
+++ b/src/record_render.c
@@ -28,7 +28,9 @@
 #endif
 
 static yaz_iconv_t iconv_create_charset(const char *record_charset,
-                                        yaz_iconv_t *cd2)
+                                        yaz_iconv_t *cd2,
+                                        const char *marc_buf,
+                                        int sz)
 {
     char charset_buf[40];
     yaz_iconv_t cd = 0;
@@ -62,7 +64,11 @@ static yaz_iconv_t iconv_create_charset(const char *record_charset,
     }
 
     if (from_set1)
+    {
+        if (yaz_marc_check_marc21_coding(from_set1, marc_buf, sz))
+            from_set1 = "utf-8";
         cd = yaz_iconv_open(to_set, from_set1);
+    }
     if (cd2)
     {
         if (from_set2)
@@ -79,7 +85,7 @@ static const char *return_marc_record(WRBUF wrbuf,
                                       const char *buf, int sz,
                                       const char *record_charset)
 {
-    yaz_iconv_t cd = iconv_create_charset(record_charset, 0);
+    yaz_iconv_t cd = iconv_create_charset(record_charset, 0, buf, sz);
     yaz_marc_t mt = yaz_marc_create();
     const char *ret_string = 0;
 
@@ -103,10 +109,22 @@ static const char *return_opac_record(WRBUF wrbuf,
                                       Z_OPACRecord *opac_rec,
                                       const char *record_charset)
 {
-    yaz_iconv_t cd2;
-    yaz_iconv_t cd = iconv_create_charset(record_charset, &cd2);
+    yaz_iconv_t cd, cd2;
+    const char *marc_buf = 0;
+    int marc_sz = 0;
     yaz_marc_t mt = yaz_marc_create();
 
+    if (opac_rec->bibliographicRecord)
+    {
+        Z_External *ext = opac_rec->bibliographicRecord;
+        if (ext->which == Z_External_octet)
+        {
+            marc_buf = (const char *) ext->u.octet_aligned->buf;
+            marc_sz = ext->u.octet_aligned->len;
+        }
+    }
+    cd = iconv_create_charset(record_charset, &cd2, marc_buf, marc_sz);
+
     if (cd)
         yaz_marc_iconv(mt, cd);
     yaz_marc_xml(mt, marc_type);
@@ -131,7 +149,7 @@ static const char *return_string_record(WRBUF wrbuf,
                                         const char *buf, int sz,
                                         const char *record_charset)
 {
-    yaz_iconv_t cd = iconv_create_charset(record_charset, 0);
+    yaz_iconv_t cd = iconv_create_charset(record_charset, 0, 0, 0);
 
     if (cd)
     {
diff --git a/test/test_record_conv.c b/test/test_record_conv.c
index f388ea4..ff9a2a8 100644
--- a/test/test_record_conv.c
+++ b/test/test_record_conv.c
@@ -369,7 +369,7 @@ static void tst_convert3(void)
     yaz_record_conv_t p = 0;
 
     const char *iso2709_rec =
-        "\x30\x30\x30\x37\x37\x6E\x61\x6D\x20\x61\x32\x32\x30\x30\x30\x34"
+        "\x30\x30\x30\x37\x37\x6E\x61\x6D\x20\x20\x32\x32\x30\x30\x30\x34"
         "\x39\x38\x61\x20\x34\x35\x30\x30\x30\x30\x31\x30\x30\x31\x33\x30"
         "\x30\x30\x30\x30\x30\x31\x30\x30\x30\x31\x34\x30\x30\x30\x31\x33"
         "\x1E\x20\x20\x20\x31\x31\x32\x32\x34\x34\x36\x36\x20\x1E\x20\x20"
diff --git a/util/marcdump.c b/util/marcdump.c
index 850331b..c45d146 100644
--- a/util/marcdump.c
+++ b/util/marcdump.c
@@ -319,6 +319,7 @@ static void dump(const char *fname, const char *from, const char *to,
                 size_t len_result;
                 size_t r;
                 char buf[100001];
+                yaz_iconv_t cd1 = 0;
 
                 r = fread(buf, 1, 5, inf);
                 if (r < 5)
@@ -428,7 +429,21 @@ static void dump(const char *fname, const char *from, const char *to,
                     }
                 }
                 len_result = rlen;
+
+                if (yaz_marc_check_marc21_coding(from, buf, 26))
+                {
+                    cd1 = yaz_iconv_open(to, "utf-8");
+                    if (cd1)
+                        yaz_marc_iconv(mt, cd1);
+                }
                 r = yaz_marc_decode_buf(mt, buf, -1, &result, &len_result);
+
+                if (cd1)
+                {
+                    yaz_iconv_close(cd1);
+                    yaz_marc_iconv(mt, cd);
+                }
+
                 if (r == -1)
                     no_errors++;
                 if (r > 0 && result && len_result)
-- 
1.7.10.4