2 gcc -I/usr/include/libxml2 -lxml2 -o icu-xml-convert icu-xml-convert.c
15 //#include <yaz/xmalloc.h>
16 #include <yaz/options.h>
21 #include <unicode/ucnv.h>
22 #include <unicode/ustring.h>
26 /* Command-line and configuration parameters.
    Members referenced elsewhere in this file: conffile, print, xmloutput,
    infile, outfile and chain. */
27 static struct config_t {
31 struct icu_chain * chain;
/*
 * Print a usage summary to stderr: the accepted options and a few
 * example invocations.
 *
 * NOTE(review): the usage text names the program "icu_chain_test";
 * confirm that this matches the name the binary is installed under.
 * NOTE(review): p_config is not used in the lines visible here.
 */
38 void print_option_error(const struct config_t *p_config)
40 fprintf(stderr, "Calling error, valid options are :\n");
41 fprintf(stderr, "icu_chain_test\n"
42 " [-c (path/to/config/file.xml)]\n"
43 " [-p (a|c|l|t)] print ICU info \n"
47 "cat hugetextfile.txt | ./icu_chain_test -c config.xml \n"
48 "./icu_chain_test -p c\n"
49 "./icu_chain_test -p l -x\n"
50 "./icu_chain_test -p t -x\n"
/*
 * Initialise *p_config with defaults and then apply the command-line
 * options parsed by options() with the spec "c:p:x".
 *
 * Defaults: empty conffile and print strings, xmloutput off, input from
 * stdin and output to stdout.
 */
55 void read_params(int argc, char **argv, struct config_t *p_config){
59 /* defaults: nothing selected, plain-text output, stdin -> stdout */
60 p_config->conffile[0] = 0;
61 p_config->print[0] = 0;
62 p_config->xmloutput = 0;
64 p_config->infile = stdin;
65 p_config->outfile = stdout;
67 /* parse the command line; the loop ends when options() returns -2 */
69 while ((ret = options("c:p:x", argv, argc, &arg)) != -2)
/* NOTE(review): strcpy() does not bound the copy. The capacity of
   conffile and print is not visible here; confirm that a long option
   argument cannot overflow them. */
74 strcpy(p_config->conffile, arg);
77 strcpy(p_config->print, arg);
80 p_config->xmloutput = 1;
83 print_option_error(p_config);
87 //p_config->infile = fopen("/etc/passwd", "r");
/* neither a config file nor a print selector was given */
91 if ((!strlen(p_config->conffile)
92 && !strlen(p_config->print))
96 print_option_error(p_config);
100 /* UConverter *conv; */
101 /* conv = ucnv_open("utf-8", &status); */
102 /* assert(U_SUCCESS(status)); */
105 /* = ucnv_toUChars(conv, ustr16, 1024, */
106 /* (const char *) *xstr8, strlen((const char *) *xstr8), */
111 /* ucnv_fromUChars(conv, */
112 /* (char *) *xstr8, strlen((const char *) *xstr8), */
113 /* ustr16, *ustr16_len, */
115 /* ucnv_close(conv); */
/*
 * Write the number of available ICU converters, the default converter
 * name and the name of every available converter to config.outfile.
 * When p_config->xmloutput is set the data is written as <converters> /
 * <converter> elements; plain-text lines are also present (presumably
 * in else branches that are not visible here).
 *
 * NOTE(review): the output flag is read through p_config while the
 * stream is taken from the file-scope config object; the two are only
 * equivalent when the caller passes &config.
 */
118 static void print_icu_converters(const struct config_t *p_config)
123 count = ucnv_countAvailable();
124 if (p_config->xmloutput)
125 fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
126 count, ucnv_getDefaultName());
128 fprintf(config.outfile, "Available ICU converters: %d\n", count);
129 fprintf(config.outfile, "Default ICU Converter is: '%s'\n", ucnv_getDefaultName());
/* one entry per available converter */
132 for(i=0;i<count;i++){
133 if (p_config->xmloutput)
134 fprintf(config.outfile, "<converter id=\"%s\"/>\n", ucnv_getAvailableName(i));
136 fprintf(config.outfile, "%s ", ucnv_getAvailableName(i));
139 if (p_config->xmloutput)
140 fprintf(config.outfile, "</converters>\n");
142 fprintf(config.outfile, "\n");
/*
 * Write the number of available ICU transliterators and their ids to
 * config.outfile. With p_config->xmloutput set, the ids are written as
 * <transliterator> elements inside <transliterators>. The function also
 * contains a long plain-text help block with example Unicode set
 * patterns and transliteration rules (presumably printed only in
 * non-XML mode; the branch structure is not fully visible here).
 *
 * NOTE(review): utrans_getAvailableID() appears to be deprecated in ICU
 * in favour of utrans_openIDs(); confirm against the ICU version in use.
 * NOTE(review): ids are written into XML attributes without escaping.
 */
145 static void print_icu_transliterators(const struct config_t *p_config)
150 count = utrans_countAvailableIDs();
/* capacity passed to utrans_getAvailableID() for buf */
152 int32_t buf_cap = 128;
155 if (p_config->xmloutput)
156 fprintf(config.outfile, "<transliterators count=\"%d\">\n", count);
158 fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
/* fetch the i-th transliterator id into buf */
162 utrans_getAvailableID(i, buf, buf_cap);
163 if (p_config->xmloutput)
164 fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", buf);
166 fprintf(config.outfile, " %s", buf);
169 if (p_config->xmloutput){
170 fprintf(config.outfile, "</transliterators>\n");
/* static help text: Unicode set pattern syntax and example rules */
174 fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
175 " Pattern Description\n"
176 " Ranges [a-z] The lower case letters a through z\n"
177 " Named Chars [abc123] The six characters a,b,c,1,2 and 3\n"
178 " String [abc{def}] chars a, b and c, and string 'def'\n"
179 " Categories [\\p{Letter}] Perl General Category 'Letter'.\n"
180 " Categories [:Letter:] Posix General Category 'Letter'.\n"
182 " Combination Example\n"
183 " Union [[:Greek:] [:letter:]]\n"
184 " Intersection [[:Greek:] & [:letter:]]\n"
185 " Set Complement [[:Greek:] - [:letter:]]\n"
186 " Complement [^[:Greek:] [:letter:]]\n"
188 "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
191 " [:Punctuation:] Any-Remove\n"
192 " [:Cased-Letter:] Any-Upper\n"
193 " [:Control:] Any-Remove\n"
194 " [:Decimal_Number:] Any-Remove\n"
195 " [:Final_Punctuation:] Any-Remove\n"
196 " [:Georgian:] Any-Upper\n"
197 " [:Katakana:] Any-Remove\n"
198 " [:Arabic:] Any-Remove\n"
199 " [:Punctuation:] Remove\n"
200 " [[:Punctuation:]-[.,]] Remove\n"
201 " [:Line_Separator:] Any-Remove\n"
202 " [:Math_Symbol:] Any-Remove\n"
203 " Lower; [:^Letter:] Remove (word tokenization)\n"
204 " [:^Number:] Remove (numeric tokenization)\n"
205 " [:^Katagana:] Remove (remove everything except Katagana)\n"
206 " Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
207 " NFD; [:Nonspacing Mark:] Remove; NFC (removes accents from characters)\n"
208 " [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
209 " [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
211 "see http://icu.sourceforge.net/userguide/Transform.html\n"
212 " http://www.unicode.org/Public/UNIDATA/UCD.html\n"
213 " http://icu.sourceforge.net/userguide/Transform.html\n"
214 " http://icu.sourceforge.net/userguide/TransformRule.html\n"
218 fprintf(config.outfile, "\n\n");
/*
 * Write the list of available ICU locales to config.outfile.
 *
 * For every locale returned by uloc_getAvailable() the function asks ICU
 * for the English display keyword, language, script, country and variant,
 * the English display name and the display name in the locale itself,
 * and converts each with u_strToUTF8() into a 128-byte char buffer.
 * With p_config->xmloutput set, each locale becomes a <locale> element
 * inside <locales>; otherwise the locale ids are written as plain text.
 *
 * NOTE(review): status is only inspected after all calls have been made;
 * confirm that an early ICU failure cannot leave the *_str buffers
 * unterminated before strlen() is applied to them.
 * NOTE(review): display names are written into XML attributes and
 * element content without escaping.
 */
223 static void print_icu_xml_locales(const struct config_t *p_config)
227 UErrorCode status = U_ZERO_ERROR;
/* per-field UTF-8 buffers and length bookkeeping */
230 int32_t keyword_len = 0;
231 char keyword_str[128];
232 int32_t keyword_str_len = 0;
235 int32_t language_len = 0;
237 int32_t lang_str_len = 0;
240 int32_t script_len = 0;
241 char script_str[128];
242 int32_t script_str_len = 0;
245 int32_t location_len = 0;
246 char location_str[128];
247 int32_t location_str_len = 0;
250 int32_t variant_len = 0;
251 char variant_str[128];
252 int32_t variant_str_len = 0;
255 int32_t name_len = 0;
257 int32_t name_str_len = 0;
260 int32_t localname_len = 0;
261 char localname_str[128];
262 int32_t localname_str_len = 0;
264 count = uloc_countAvailable() ;
266 if (p_config->xmloutput){
268 fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n",
269 count, uloc_getDefault(), ucol_countAvailable());
/* English display strings for locale i, each converted to UTF-8 */
276 = uloc_getDisplayKeyword(uloc_getAvailable(i), "en",
280 u_strToUTF8(keyword_str, 128, &keyword_str_len,
281 keyword, keyword_len,
286 = uloc_getDisplayLanguage(uloc_getAvailable(i), "en",
290 u_strToUTF8(lang_str, 128, &lang_str_len,
291 language, language_len,
296 = uloc_getDisplayScript(uloc_getAvailable(i), "en",
300 u_strToUTF8(script_str, 128, &script_str_len,
305 = uloc_getDisplayCountry(uloc_getAvailable(i), "en",
309 u_strToUTF8(location_str, 128, &location_str_len,
310 location, location_len,
314 = uloc_getDisplayVariant(uloc_getAvailable(i), "en",
318 u_strToUTF8(variant_str, 128, &variant_str_len,
319 variant, variant_len,
323 = uloc_getDisplayName(uloc_getAvailable(i), "en",
327 u_strToUTF8(name_str, 128, &name_str_len,
/* display name of locale i expressed in locale i itself */
332 = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i),
336 u_strToUTF8(localname_str, 128, &localname_str_len,
337 localname, localname_len,
/* XML form: one <locale> element, empty fields are omitted */
341 if (p_config->xmloutput){
342 fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i));
343 /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
344 /* if (strlen(keyword_str)) */
345 /* fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
346 /* if (ucol_getAvailable(i)) */
347 /* fprintf(config.outfile, " collation=\"1\""); */
348 if (strlen(lang_str))
349 fprintf(config.outfile, " language=\"%s\"", lang_str);
350 if (strlen(script_str))
351 fprintf(config.outfile, " script=\"%s\"", script_str);
352 if (strlen(location_str))
353 fprintf(config.outfile, " location=\"%s\"", location_str);
354 if (strlen(variant_str))
355 fprintf(config.outfile, " variant=\"%s\"", variant_str);
356 if (strlen(name_str))
357 fprintf(config.outfile, " name=\"%s\"", name_str);
358 if (strlen(localname_str))
359 fprintf(config.outfile, " localname=\"%s\"", localname_str);
360 fprintf(config.outfile, ">");
361 if (strlen(localname_str))
362 fprintf(config.outfile, "%s", localname_str);
363 fprintf(config.outfile, "</locale>\n");
/* NOTE(review): if this else pairs with the if (p_config->xmloutput)
   above (the closing brace is not visible here), the condition below can
   never be true and the "id | name | localname" output is dead code. */
365 else if (1 == p_config->xmloutput){
366 fprintf(config.outfile, "%s", uloc_getAvailable(i));
367 fprintf(config.outfile, " | ");
368 if (strlen(name_str))
369 fprintf(config.outfile, "%s", name_str);
370 fprintf(config.outfile, " | ");
371 if (strlen(localname_str))
372 fprintf(config.outfile, "%s", localname_str);
373 fprintf(config.outfile, "\n");
376 fprintf(config.outfile, "%s ", uloc_getAvailable(i));
378 if (p_config->xmloutput)
379 fprintf(config.outfile, "</locales>\n");
381 fprintf(config.outfile, "\n");
/* report any ICU error accumulated in status */
383 if(U_FAILURE(status)) {
384 fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
/*
 * Print the ICU information selected by the first character of
 * config.print: 'c' converters, 'l' locales, 't' transliterators.
 * The three printers are also called back to back (presumably in a
 * final else branch for any other selector; the else line is not
 * visible here). With p_config->xmloutput set, the output starts with
 * an XML declaration and ends with a closing </icu> tag.
 *
 * NOTE(review): the flag is read through p_config but the selector and
 * the callee argument use the file-scope config object.
 */
390 static void print_info(const struct config_t *p_config)
392 if (p_config->xmloutput)
393 fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
396 if ('c' == config.print[0])
397 print_icu_converters(&config);
398 else if ('l' == config.print[0])
399 print_icu_xml_locales(&config);
400 else if ('t' == config.print[0])
401 print_icu_transliterators(&config);
403 print_icu_converters(&config);
404 print_icu_xml_locales(&config);
405 print_icu_transliterators(&config);
408 if (p_config->xmloutput)
409 fprintf(config.outfile, "</icu>\n");
/*
 * Build an ICU chain from the XML file named by config.conffile, then
 * read config.infile line by line with getline(), feed each line to the
 * chain and write one record per token (token counter, line counter,
 * normalised form, display form) to config.outfile, as XML or as plain
 * text depending on p_config->xmloutput. The chain is destroyed at the
 * end.
 *
 * NOTE(review): no NULL check on the xmlParseFile() result and no
 * xmlFreeDoc()/free(line) are visible here; confirm they exist in the
 * lines not shown.
 * NOTE(review): config.chain is assigned through the file-scope object
 * even though p_config is const-qualified.
 */
416 static void process_text_file(const struct config_t *p_config)
422 xmlDoc *doc = xmlParseFile(config.conffile);
423 xmlNode *xml_node = xmlDocGetRootElement(doc);
425 long unsigned int token_count = 0;
426 long unsigned int line_count = 0;
428 UErrorCode status = U_ZERO_ERROR;
432 config.chain = icu_chain_xml_config(xml_node, &status);
434 if (config.chain && U_SUCCESS(status))
437 if (p_config->xmloutput)
438 fprintf(config.outfile,
439 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
443 // read input lines for processing, one getline() call per line
444 while ((line_len = getline(&line, &line_cap, config.infile)) != -1) {
445 success = icu_chain_assign_cstr(config.chain, line, &status);
/* emit every token the chain yields for this line */
448 while (success && icu_chain_next_token(config.chain, &status)){
449 if (U_FAILURE(status))
/* NOTE(review): "\%" in the format string below is not a valid C escape
   and the id attribute has no opening quote, so the emitted XML is
   malformed; the intended text is presumably id=\"%lu\". */
453 if (p_config->xmloutput)
454 fprintf(config.outfile,
455 "<token id=\%lu\" line=\"%lu\""
456 " norm=\"%s\" display=\"%s\"/>\n",
459 icu_chain_get_norm(config.chain),
460 icu_chain_get_display(config.chain));
462 fprintf(config.outfile, "%lu %lu '%s' '%s'\n",
465 icu_chain_get_norm(config.chain),
466 icu_chain_get_display(config.chain));
472 if (p_config->xmloutput)
473 fprintf(config.outfile,
477 icu_chain_destroy(config.chain);
/*
 * Program entry point: parse the command line into the file-scope
 * config object, run process_text_file() when a config file was given,
 * and test whether a print selector was given (the action taken is not
 * visible here). The "ICU not available" message is presumably compiled
 * in only when the program is built without ICU support; the
 * preprocessor guard is not visible here.
 *
 * NOTE(review): conffile and print are indexed and strcpy()'d into
 * elsewhere in this file; if they are arrays, the "config.conffile &&"
 * and "config.print &&" tests are always true and only the strlen()
 * checks matter.
 */
486 int main(int argc, char **argv)
491 read_params(argc, argv, &config);
493 if (config.conffile && strlen(config.conffile))
494 process_text_file(&config);
496 if (config.print && strlen(config.print))
501 printf("ICU not available on your system.\n"
502 "Please install libicu36-dev and icu-doc or similar, "
503 "re-configure and re-compile\n");
515 * indent-tabs-mode: nil
517 * vim: shiftwidth=4 tabstop=8 expandtab