From 22ab526ca79529370276260b37538c676b3816ee Mon Sep 17 00:00:00 2001 From: Marc Cromme Date: Thu, 10 May 2007 12:11:42 +0000 Subject: [PATCH] started ICU transliterator integration for more complex normalization rules than lowercasing --- src/test_icu_I18N.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 83 insertions(+), 3 deletions(-) diff --git a/src/test_icu_I18N.c b/src/test_icu_I18N.c index 151ae09..ce795e6 100644 --- a/src/test_icu_I18N.c +++ b/src/test_icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: test_icu_I18N.c,v 1.13 2007-05-10 11:53:47 marc Exp $ +/* $Id: test_icu_I18N.c,v 1.14 2007-05-10 12:11:42 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -314,9 +314,89 @@ void test_icu_I18N_sortmap(int argc, char **argv) // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 -void test_icu_I18N_normmap(int argc, char **argv) +void test_icu_I18N_transliterator(int argc, char **argv) { + /* setting up transliterator */ + +#if 0 + + UErrorCode status = U_ZERO_ERROR; + UParseError parse_error[256]; + + int32_t id_cap = 256; + UChar id[256]; + id[0] = 0; + + trans = utrans_openU(id, id_len, UTRANS_FORWARD, + 0, 0, parse_error, &status); + + + if(U_FAILURE(status)) { + printf("Parse Error: line %d offset %d \n", + parse_error->line, parse_error->offset); + } + icu_check_status(status); + + + int32_t ustr16_lim = *ustr16_len; + /* Transliterate a segment of a UChar* string */ + + utrans_transUChars (trans, ustr16, &*ustr16_len, + ustr16_cap, + 0, &ustr16_lim, &status); + + utrans_close (trans); + + printf("\n\nUnicode Set Patterns:\n" + " Pattern Description\n" + " Ranges [a-z] The lower case letters a through z\n" + " Named Chars [abc123] The six characters a,b,c,1,2 and 3\n" + " String [abc{def}] chars a, b and c, and string 'def'\n +" + " Categories [\\p{Letter}] Perl General Category 'Letter'.\n +" + " Categories [:Letter:] Posix General Category 'Letter'.\n" + "\n" + " Combination Example\n" + " Union [[:Greek:] [:letter:]]\n" + " Intersection [[:Greek:] & [:letter:]]\n" + " Set Complement [[:Greek:] - [:letter:]]\n" + " Complement [^[:Greek:] [:letter:]]\n" + "\n" + "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n" + "\n" + "Examples:\n" + " [:Punctuation:] Any-Remove\n" + " [:Cased-Letter:] Any-Upper\n" + " [:Control:] Any-Remove\n" + " [:Decimal_Number:] Any-Remove\n" + " [:Final_Punctuation:] Any-Remove\n" + " [:Georgian:] Any-Upper\n" + " [:Katakana:] Any-Remove\n" + " [:Arabic:] Any-Remove\n" + " [:Punctuation:] Remove\n" + " [[:Punctuation:]-[.,]] Remove\n" + " [:Line_Separator:] Any-Remove\n" + " [:Math_Symbol:] Any-Remove\n" + " Lower; [:^Letter:] Remove (word tokenization)\n" + " [:^Number:] Remove (numeric tokenization)\n" + " [:^Katagana:] Remove (remove everything except Katagana)\n" + " Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization +)\n" + " NFD; [:Nonspacing Mark:] Remove; NFC (removes accents from ch +aracters)\n" + " [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transform +s latin and katagana to hiragana)\n" + " [[:separator:][:start punctuation:][:initial punctuation:]] Rem +ove \n" + "\n" + "see http://icu.sourceforge.net/userguide/Transform.html\n" + " http://www.unicode.org/Public/UNIDATA/UCD.html\n" + " http://icu.sourceforge.net/userguide/Transform.html\n" + " http://icu.sourceforge.net/userguide/TransformRule.html\n" + ); +#endif } @@ -430,7 +510,7 @@ int main(int argc, char **argv) //test_icu_I18N_casemap_failures(argc, argv); test_icu_I18N_casemap(argc, argv); test_icu_I18N_sortmap(argc, argv); - test_icu_I18N_normmap(argc, argv); + test_icu_I18N_transliterator(argc, argv); test_icu_I18N_tokenizer(argc, argv); #else // HAVE_ICU -- 1.7.10.4