-/*
- * Copyright (C) 1995-2007, Index Data ApS
+/* This file is part of the YAZ toolkit.
+ * Copyright (C) 1995-2008 Index Data
* See the file LICENSE for details.
- *
- * $Id: yaz-icu.c,v 1.11 2007-11-08 13:35:14 adam Exp $
*/
#if HAVE_CONFIG_H
#include <yaz/options.h>
-
#if YAZ_HAVE_ICU
#include <unicode/ucnv.h>
#include <unicode/utrans.h>
#include <yaz/icu.h>
+#include <yaz/wrbuf.h>
/* command line and config parameters */
static struct config_t {
"./yaz-icu -p t -x\n"
"\n"
"Example ICU chain XML configuration file:\n"
- "<icu_chain id=\"en:word\" locale=\"en\">\n"
- " <normalize rule=\"[:Control:] Any-Remove\"/>\n"
+ "<icu_chain locale=\"en\">\n"
+ " <transform rule=\"[:Control:] Any-Remove\"/>\n"
" <tokenize rule=\"l\"/>\n"
- " <normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
+ " <transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
" <casemap rule=\"l\"/>\n"
"</icu_chain>\n"
);
ucnv_getDefaultName());
}
- for(i=0;i<count;i++){
+ for(i=0;i<count;i++)
+ {
if (p_config->xmloutput)
fprintf(config.outfile, "<converter id=\"%s\"/>\n",
ucnv_getAvailableName(i));
static void print_icu_transliterators(const struct config_t *p_config)
{
int32_t buf_cap = 128;
- char buf[buf_cap];
+ char buf[128];
int32_t i;
int32_t count = utrans_countAvailableIDs();
fprintf(config.outfile, " %s", buf);
}
- if (p_config->xmloutput){
+ if (p_config->xmloutput)
+ {
fprintf(config.outfile, "</transliterators>\n");
}
else
count = uloc_countAvailable() ;
- if (p_config->xmloutput){
-
+ if (p_config->xmloutput)
+ {
fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n",
count, uloc_getDefault(), ucol_countAvailable());
}
&status);
- if (p_config->xmloutput){
+ if (p_config->xmloutput)
+ {
fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i));
/* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
/* if (strlen(keyword_str)) */
fprintf(config.outfile, "%s", localname_str);
fprintf(config.outfile, "</locale>\n");
}
- else if (1 == p_config->xmloutput){
+ else if (1 == p_config->xmloutput)
+ {
fprintf(config.outfile, "%s", uloc_getAvailable(i));
fprintf(config.outfile, " | ");
if (strlen(name_str))
else
fprintf(config.outfile, "\n");
- if(U_FAILURE(status)) {
+ if(U_FAILURE(status))
+ {
fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
exit(status);
}
UErrorCode status = U_ZERO_ERROR;
int success = 0;
- if (! xml_node) {
+ if (! xml_node)
+ {
printf("Could not parse XML config file '%s' \n",
config.conffile);
exit (1);
}
- config.chain = icu_chain_xml_config(xml_node, 0, &status);
+ config.chain = icu_chain_xml_config(xml_node, 1, &status);
if (config.chain && U_SUCCESS(status))
success = 1;
config.conffile);
exit (1);
}
-
-
if (p_config->xmloutput)
fprintf(config.outfile,
success = icu_chain_assign_cstr(config.chain, line, &status);
line_count++;
- while (success && icu_chain_next_token(config.chain, &status)){
+ while (success && icu_chain_next_token(config.chain, &status))
+ {
+ WRBUF sw = wrbuf_alloc();
if (U_FAILURE(status))
success = 0;
else {
+ const char *sortkey = icu_chain_token_sortkey(config.chain);
+ wrbuf_rewind(sw);
+ wrbuf_puts_escaped(sw, sortkey);
token_count++;
if (p_config->xmloutput)
+ {
+ /* TODO: the values written below should be XML-encoded. Bug #1902 */
fprintf(config.outfile,
- "<token id=\%lu\" line=\"%lu\""
- " norm=\"%s\" display=\"%s\"/>\n",
+ "<token id=\"%lu\" line=\"%lu\""
+ " norm=\"%s\" display=\"%s\" sortkey=\"%s\"/>\n",
token_count,
line_count,
icu_chain_token_norm(config.chain),
- icu_chain_token_display(config.chain));
+ icu_chain_token_display(config.chain),
+ wrbuf_cstr(sw));
+ }
else
- fprintf(config.outfile, "%lu %lu '%s' '%s'\n",
+ fprintf(config.outfile, "%lu %lu '%s' '%s' '%s'\n",
token_count,
line_count,
icu_chain_token_norm(config.chain),
- icu_chain_token_display(config.chain));
+ icu_chain_token_display(config.chain),
+ wrbuf_cstr(sw));
}
+ wrbuf_destroy(sw);
}
}