2 * Copyright (C) 1995-2007, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: yaz-icu.c,v 1.6 2007-10-29 10:22:23 marc Exp $
17 #include <yaz/options.h>
22 #include <unicode/ucnv.h>
23 #include <unicode/ustring.h>
25 #include <yaz/icu_I18N.h>
27 /* command-line and config parameters */
/*
 * Run-time settings for the tool. The file-scope object `config`, which the
 * functions in this file read directly and also receive by address as
 * `p_config`, is of this type.
 * NOTE(review): only one member is visible in this view; code in this file
 * also uses the members conffile, print, xmloutput, infile and outfile.
 */
28 static struct config_t {
/* ICU chain handle: assigned from icu_chain_xml_config() and released with
 * icu_chain_destroy() in process_text_file(). */
32 struct icu_chain * chain;
/*
 * Write the usage message to stderr: the accepted options, an example
 * pipeline invocation and a sample ICU chain XML configuration.
 *
 * p_config is not referenced by any line visible in this view.
 * NOTE(review): the end of this function is not visible here; callers in
 * read_params() appear to rely on it not returning - confirm that it
 * terminates the process.
 */
39 void print_option_error(const struct config_t *p_config)
41 fprintf(stderr, "Calling error, valid options are :\n");
42 fprintf(stderr, "yaz-icu\n"
43 " [-c (path/to/config/file.xml)]\n"
44 " [-p (a|c|l|t)] print ICU info \n"
48 "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
53 "Example ICU chain XML configuration file:\n"
54 "<icu_chain id=\"en:word\" locale=\"en\">\n"
55 " <normalize rule=\"[:Control:] Any-Remove\"/>\n"
56 " <tokenize rule=\"l\"/>\n"
57 " <normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
59 " <casemap rule=\"l\"/>\n"
/*
 * Initialise *p_config with defaults and then fill it from the command line.
 *
 * argc, argv  - the program arguments, handed to options() unchanged.
 * p_config    - settings object to populate.
 *
 * Defaults: empty conffile and print strings, XML output off, input from
 * stdin and output to stdout. Calls print_option_error() when neither a
 * config file nor a print selector was given.
 */
65 void read_params(int argc, char **argv, struct config_t *p_config)
70 /* set default parameters */
71 p_config->conffile[0] = 0;
72 p_config->print[0] = 0;
73 p_config->xmloutput = 0;
75 p_config->infile = stdin;
76 p_config->outfile = stdout;
78 /* set up command line parameters */
/* "c:p:x": -c and -p take an argument, -x is a flag; the loop ends when
 * options() returns -2. */
80 while ((ret = options("c:p:x", argv, argc, &arg)) != -2)
/* NOTE(review): strcpy() copies a user-supplied argument without a length
 * check; an argument longer than the destination member overflows it. The
 * member sizes are not visible here - confirm and bound the copy. */
85 strcpy(p_config->conffile, arg);
88 strcpy(p_config->print, arg);
91 p_config->xmloutput = 1;
/* Presumably the fallback for an unrecognised option - the case labels are
 * not visible in this view. */
94 print_option_error(p_config);
/* Reject an invocation that names neither a config file nor a print
 * selector (further conditions on lines not visible here may apply). */
98 if ((!strlen(p_config->conffile)
99 && !strlen(p_config->print))
103 print_option_error(p_config);
107 /* UConverter *conv; */
108 /* conv = ucnv_open("utf-8", &status); */
109 /* assert(U_SUCCESS(status)); */
112 /* = ucnv_toUChars(conv, ustr16, 1024, */
113 /* (const char *) *xstr8, strlen((const char *) *xstr8), */
118 /* ucnv_fromUChars(conv, */
119 /* (char *) *xstr8, strlen((const char *) *xstr8), */
120 /* ustr16, *ustr16_len, */
122 /* ucnv_close(conv); */
/*
 * List the ICU converters: the number available, the default converter name
 * and then every converter name returned by ucnv_getAvailableName().
 *
 * When p_config->xmloutput is set the list is written as a <converters>
 * element holding one <converter id="..."/> per entry; the remaining
 * fprintf() calls are the plain-text form (the `else` lines are not visible
 * in this view).
 *
 * NOTE(review): output goes to the global config.outfile, not to
 * p_config->outfile; the two agree only while callers pass &config.
 * NOTE(review): converter names are written into the XML attribute without
 * escaping.
 */
125 static void print_icu_converters(const struct config_t *p_config)
130 count = ucnv_countAvailable();
131 if (p_config->xmloutput)
132 fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
133 count, ucnv_getDefaultName());
135 fprintf(config.outfile, "Available ICU converters: %d\n", count);
136 fprintf(config.outfile, "Default ICU Converter is: '%s'\n",
137 ucnv_getDefaultName());
/* One entry per available converter. */
140 for(i=0;i<count;i++){
141 if (p_config->xmloutput)
142 fprintf(config.outfile, "<converter id=\"%s\"/>\n",
143 ucnv_getAvailableName(i));
145 fprintf(config.outfile, "%s ", ucnv_getAvailableName(i));
/* Close the element in XML mode; the plain-text form ends the line. */
148 if (p_config->xmloutput)
149 fprintf(config.outfile, "</converters>\n");
151 fprintf(config.outfile, "\n");
/*
 * List the ICU transliterator IDs and, after the list, print a built-in
 * help text describing Unicode set patterns and example transform rules.
 *
 * When p_config->xmloutput is set the IDs are written as a
 * <transliterators> element with one <transliterator id="..."/> per entry.
 * The help text and its reference URLs are emitted by the fprintf() that
 * follows the XML closing tag (the `else` lines are not visible in this
 * view, so which mode prints the help text should be confirmed).
 *
 * NOTE(review): output goes to the global config.outfile, not to
 * p_config->outfile.
 */
154 static void print_icu_transliterators(const struct config_t *p_config)
/* Capacity passed to utrans_getAvailableID() for the ID buffer; the
 * declaration of `buf` itself is not visible in this view. */
156 int32_t buf_cap = 128;
159 int32_t count = utrans_countAvailableIDs();
161 if (p_config->xmloutput)
162 fprintf(config.outfile, "<transliterators count=\"%d\">\n", count);
164 fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
166 for(i = 0; i <count; i++)
/* NOTE(review): the return value (ID length) is ignored, so an ID that does
 * not fit in buf_cap bytes is not detected. utrans_getAvailableID() is also
 * marked deprecated in ICU in favour of utrans_openIDs() - confirm against
 * the ICU version in use. */
168 utrans_getAvailableID(i, buf, buf_cap);
169 if (p_config->xmloutput)
170 fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", buf);
172 fprintf(config.outfile, " %s", buf);
175 if (p_config->xmloutput){
176 fprintf(config.outfile, "</transliterators>\n");
/* Static help text: set-pattern syntax, set combinations, sample rules and
 * links to the ICU user guide. */
180 fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
181 " Pattern Description\n"
182 " Ranges [a-z] The lower case letters a through z\n"
183 " Named Chars [abc123] The six characters a,b,c,1,2 and 3\n"
184 " String [abc{def}] chars a, b and c, and string 'def'\n"
185 " Categories [\\p{Letter}] Perl General Category 'Letter'.\n"
186 " Categories [:Letter:] Posix General Category 'Letter'.\n"
188 " Combination Example\n"
189 " Union [[:Greek:] [:letter:]]\n"
190 " Intersection [[:Greek:] & [:letter:]]\n"
191 " Set Complement [[:Greek:] - [:letter:]]\n"
192 " Complement [^[:Greek:] [:letter:]]\n"
194 "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
197 " [:Punctuation:] Any-Remove\n"
198 " [:Cased-Letter:] Any-Upper\n"
199 " [:Control:] Any-Remove\n"
200 " [:Decimal_Number:] Any-Remove\n"
201 " [:Final_Punctuation:] Any-Remove\n"
202 " [:Georgian:] Any-Upper\n"
203 " [:Katakana:] Any-Remove\n"
204 " [:Arabic:] Any-Remove\n"
205 " [:Punctuation:] Remove\n"
206 " [[:Punctuation:]-[.,]] Remove\n"
207 " [:Line_Separator:] Any-Remove\n"
208 " [:Math_Symbol:] Any-Remove\n"
209 " Lower; [:^Letter:] Remove (word tokenization)\n"
210 " [:^Number:] Remove (numeric tokenization)\n"
211 " [:^Katagana:] Remove (remove everything except Katagana)\n"
212 " Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
213 " NFD; [:Nonspacing Mark:] Remove; NFC (removes accents from characters)\n"
214 " [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
215 " [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
217 "see http://icu.sourceforge.net/userguide/Transform.html\n"
218 " http://www.unicode.org/Public/UNIDATA/UCD.html\n"
219 " http://icu.sourceforge.net/userguide/Transform.html\n"
220 " http://icu.sourceforge.net/userguide/TransformRule.html\n"
224 fprintf(config.outfile, "\n\n");
/*
 * List the locales known to ICU.
 *
 * For every locale from uloc_getAvailable() the function looks up its
 * display keyword, language, script, country, variant and full name in
 * English ("en"), plus the full name rendered in the locale itself, and
 * converts each result to UTF-8 with u_strToUTF8() into a 128-byte buffer.
 *
 * When p_config->xmloutput is set the result is a <locales> element with
 * one <locale> per entry: non-empty values become attributes and the
 * locale's own name becomes the element text. Otherwise the locale IDs are
 * written separated by spaces.
 *
 * NOTE(review): output goes to the global config.outfile, not to
 * p_config->outfile.
 * NOTE(review): values are written into XML attributes and text without
 * escaping.
 */
229 static void print_icu_xml_locales(const struct config_t *p_config)
/* One status object is shared by every ICU call below and is inspected
 * only once, after the loop. NOTE(review): by ICU convention a call made
 * with an already-failed status does nothing, so the first failure would
 * silence all later lookups - confirm and consider checking per locale. */
233 UErrorCode status = U_ZERO_ERROR;
236 int32_t keyword_len = 0;
237 char keyword_str[128];
238 int32_t keyword_str_len = 0;
241 int32_t language_len = 0;
243 int32_t lang_str_len = 0;
246 int32_t script_len = 0;
247 char script_str[128];
248 int32_t script_str_len = 0;
251 int32_t location_len = 0;
252 char location_str[128];
253 int32_t location_str_len = 0;
256 int32_t variant_len = 0;
257 char variant_str[128];
258 int32_t variant_str_len = 0;
261 int32_t name_len = 0;
263 int32_t name_str_len = 0;
266 int32_t localname_len = 0;
267 char localname_str[128];
268 int32_t localname_str_len = 0;
270 count = uloc_countAvailable() ;
272 if (p_config->xmloutput){
274 fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n",
275 count, uloc_getDefault(), ucol_countAvailable());
/* Per-locale lookups: each uloc_getDisplay*() result is followed by a
 * u_strToUTF8() conversion into the matching *_str buffer. */
282 = uloc_getDisplayKeyword(uloc_getAvailable(i), "en",
286 u_strToUTF8(keyword_str, 128, &keyword_str_len,
287 keyword, keyword_len,
292 = uloc_getDisplayLanguage(uloc_getAvailable(i), "en",
296 u_strToUTF8(lang_str, 128, &lang_str_len,
297 language, language_len,
302 = uloc_getDisplayScript(uloc_getAvailable(i), "en",
306 u_strToUTF8(script_str, 128, &script_str_len,
311 = uloc_getDisplayCountry(uloc_getAvailable(i), "en",
315 u_strToUTF8(location_str, 128, &location_str_len,
316 location, location_len,
320 = uloc_getDisplayVariant(uloc_getAvailable(i), "en",
324 u_strToUTF8(variant_str, 128, &variant_str_len,
325 variant, variant_len,
329 = uloc_getDisplayName(uloc_getAvailable(i), "en",
333 u_strToUTF8(name_str, 128, &name_str_len,
/* Same lookup, but with the locale itself as the display locale, giving
 * the locale's name in its own language. */
338 = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i),
342 u_strToUTF8(localname_str, 128, &localname_str_len,
343 localname, localname_len,
/* XML form: attributes are emitted only for non-empty values. */
347 if (p_config->xmloutput){
348 fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i));
349 /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
350 /* if (strlen(keyword_str)) */
351 /* fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
352 /* if (ucol_getAvailable(i)) */
353 /* fprintf(config.outfile, " collation=\"1\""); */
354 if (strlen(lang_str))
355 fprintf(config.outfile, " language=\"%s\"", lang_str);
356 if (strlen(script_str))
357 fprintf(config.outfile, " script=\"%s\"", script_str);
358 if (strlen(location_str))
359 fprintf(config.outfile, " location=\"%s\"", location_str);
360 if (strlen(variant_str))
361 fprintf(config.outfile, " variant=\"%s\"", variant_str);
362 if (strlen(name_str))
363 fprintf(config.outfile, " name=\"%s\"", name_str);
364 if (strlen(localname_str))
365 fprintf(config.outfile, " localname=\"%s\"", localname_str);
366 fprintf(config.outfile, ">");
367 if (strlen(localname_str))
368 fprintf(config.outfile, "%s", localname_str);
369 fprintf(config.outfile, "</locale>\n");
/* NOTE(review): this `else if` tests the same flag as the `if` above for
 * the value 1, so it can never be taken - the "id | name | localname"
 * table form below appears to be dead code. Confirm the intended
 * condition. */
371 else if (1 == p_config->xmloutput){
372 fprintf(config.outfile, "%s", uloc_getAvailable(i));
373 fprintf(config.outfile, " | ");
374 if (strlen(name_str))
375 fprintf(config.outfile, "%s", name_str);
376 fprintf(config.outfile, " | ");
377 if (strlen(localname_str))
378 fprintf(config.outfile, "%s", localname_str);
379 fprintf(config.outfile, "\n");
/* Plain-text form: locale IDs separated by spaces. */
382 fprintf(config.outfile, "%s ", uloc_getAvailable(i));
384 if (p_config->xmloutput)
385 fprintf(config.outfile, "</locales>\n");
387 fprintf(config.outfile, "\n");
/* Report an ICU failure from any of the calls above on stderr. */
389 if(U_FAILURE(status)) {
390 fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
/*
 * Print the ICU information selected by the first character of
 * config.print:
 *   'c' - converters, 'l' - locales, 't' - transliterators;
 * the three calls that follow the chain print all sections (the `else`
 * line is not visible in this view).
 *
 * In XML mode the output begins with an XML declaration and ends with
 * </icu>; the opening tag is on a line not visible here.
 *
 * NOTE(review): the selector and the sub-calls use the global `config`
 * while the XML flag is read through p_config; the two agree only while
 * callers pass &config.
 */
396 static void print_info(const struct config_t *p_config)
398 if (p_config->xmloutput)
399 fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
402 if ('c' == config.print[0])
403 print_icu_converters(&config);
404 else if ('l' == config.print[0])
405 print_icu_xml_locales(&config);
406 else if ('t' == config.print[0])
407 print_icu_transliterators(&config);
409 print_icu_converters(&config);
410 print_icu_xml_locales(&config);
411 print_icu_transliterators(&config);
414 if (p_config->xmloutput)
415 fprintf(config.outfile, "</icu>\n");
/*
 * Run the configured ICU chain over the input text and print the tokens.
 *
 * Steps visible in this view:
 *  1. Parse the XML file named by config.conffile, take its root element
 *     and read that element's "locale" attribute.
 *  2. Build config.chain from the root element with icu_chain_xml_config().
 *  3. Read config.infile line by line, assign each line to the chain and
 *     print, for every token, two counters plus the token's normalised and
 *     display forms - as <token .../> elements in XML mode, otherwise as
 *     one text line per token.
 *  4. Destroy the chain.
 *
 * NOTE(review): no release of `doc` or `xml_locale` is visible in this
 * view - confirm they are freed.
 * NOTE(review): token text is written into XML attributes without
 * escaping.
 */
422 static void process_text_file(const struct config_t *p_config)
/* NOTE(review): the parse result is passed to xmlDocGetRootElement() and
 * xmlGetProp() before any check is visible; this relies on libxml2
 * accepting NULL in both calls - confirm. */
427 xmlDoc *doc = xmlParseFile(config.conffile);
428 xmlNode *xml_node = xmlDocGetRootElement(doc);
429 xmlChar *xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale");
431 long unsigned int token_count = 0;
432 long unsigned int line_count = 0;
434 UErrorCode status = U_ZERO_ERROR;
/* NOTE(review): the two failure messages below go to stdout via printf(),
 * unlike the stderr diagnostics elsewhere in this file. */
438 printf("Could not parse XML config file '%s' \n",
443 if (!xml_locale || !strlen((const char *) xml_locale))
446 config.chain = icu_chain_xml_config(xml_node, (uint8_t *) xml_locale, 0,
452 if (config.chain && U_SUCCESS(status))
455 printf("Could not set up ICU chain from config file '%s' \n",
462 if (p_config->xmloutput)
463 fprintf(config.outfile,
464 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
468 /* read input lines for processing */
/* NOTE(review): fgets() already reserves room for the terminating NUL, so
 * the "-1" only shortens the usable buffer by one byte. */
469 while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile)))
471 success = icu_chain_assign_cstr(config.chain, line, &status);
474 while (success && icu_chain_next_token(config.chain, &status)){
475 if (U_FAILURE(status))
/* NOTE(review): the XML format string below contains "id=\%lu\"" - the
 * opening quote of the id attribute is missing ("\%" is not a valid escape
 * sequence), so the emitted <token> element is malformed. The intended
 * text is presumably id=\"%lu\". */
479 if (p_config->xmloutput)
480 fprintf(config.outfile,
481 "<token id=\%lu\" line=\"%lu\""
482 " norm=\"%s\" display=\"%s\"/>\n",
485 icu_chain_token_norm(config.chain),
486 icu_chain_token_display(config.chain));
488 fprintf(config.outfile, "%lu %lu '%s' '%s'\n",
491 icu_chain_token_norm(config.chain),
492 icu_chain_token_display(config.chain));
498 if (p_config->xmloutput)
499 fprintf(config.outfile,
503 icu_chain_destroy(config.chain);
509 #endif /* HAVE_ICU */
/*
 * Program entry point: parse the command line into the global `config`,
 * then run process_text_file() when a config file was named and act on the
 * print selector when one was given (the statement guarded by the second
 * test is not visible in this view).
 *
 * The printf() below reports that ICU is not available; it sits before the
 * closing HAVE_ICU #endif, so it is presumably the non-ICU build's
 * fallback - the #else line is not visible here.
 */
512 int main(int argc, char **argv)
517 read_params(argc, argv, &config);
/* NOTE(review): read_params() indexes and strcpy()s into conffile and
 * print, which suggests they are char arrays; if so, the leading
 * `config.conffile &&` / `config.print &&` tests are always true and only
 * the strlen() checks matter. Confirm against the struct declaration. */
519 if (config.conffile && strlen(config.conffile))
520 process_text_file(&config);
522 if (config.print && strlen(config.print))
527 printf("ICU not available on your system.\n"
528 "Please install libicu36-dev and icu-doc or similar, "
529 "re-configure and re-compile\n");
532 #endif /* HAVE_ICU */
541 * indent-tabs-mode: nil
543 * vim: shiftwidth=4 tabstop=8 expandtab