src/icu_I18N.c

   1 /* This file is part of Pazpar2.
   2    Copyright (C) 2006-2008 Index Data
   3
   4 Pazpar2 is free software; you can redistribute it and/or modify it under
   5 the terms of the GNU General Public License as published by the Free
   6 Software Foundation; either version 2, or (at your option) any later
   7 version.
   8
   9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 for more details.
  13
  14 You should have received a copy of the GNU General Public License
  15 along with this program; if not, write to the Free Software
  16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17
  18 */
  19
  20 #if HAVE_CONFIG_H
  21 #include <config.h>
  22 #endif
  23
  24 #define USE_TIMING 0
  25 #if USE_TIMING
  26 #include <yaz/timing.h>
  27 #endif
  28
  29
  30 #ifdef HAVE_ICU
  31 #include "icu_I18N.h"
  32
  33 #include <yaz/log.h>
  34
  35 #include <string.h>
  36 #include <stdlib.h>
  37 #include <stdio.h>
  38
  39 #include <unicode/ustring.h>  /* some more string fcns*/
  40 #include <unicode/uchar.h>    /* char names           */
  41
  42
  43 //#include <unicode/ustdio.h>
  44 //#include <unicode/utypes.h>   /* Basic ICU data types */
  45 #include <unicode/ucol.h>
  46 //#include <unicode/ucnv.h>     /* C   Converter API    */
  47 //#include <unicode/uloc.h>
  48 //#include <unicode/ubrk.h>
  49 /* #include <unicode/unistr.h> */
  50
  51
  52
  53
  54 int icu_check_status (UErrorCode status)
  55 {
  56     if(U_FAILURE(status)){
  57         yaz_log(YLOG_WARN,
  58                 "ICU: %d %s\n", status, u_errorName(status));
  59         return 0;
  60     }
  61     return 1;
  62
  63 }
  64
  65
  66
  67 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
  68 {
  69     struct icu_buf_utf16 * buf16
  70         = (struct icu_buf_utf16 *) malloc(sizeof(struct icu_buf_utf16));
  71
  72     buf16->utf16 = 0;
  73     buf16->utf16_len = 0;
  74     buf16->utf16_cap = 0;
  75
  76     if (capacity > 0){
  77         buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
  78         buf16->utf16[0] = (UChar) 0;
  79         buf16->utf16_cap = capacity;
  80     }
  81     return buf16;
  82 };
  83
  84 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
  85                                             size_t capacity)
  86 {
  87     if (buf16){
  88         if (capacity >  0){
  89             if (0 == buf16->utf16)
  90                 buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
  91             else
  92                 buf16->utf16
  93                     = (UChar *) realloc(buf16->utf16, sizeof(UChar) * capacity);
  94             buf16->utf16[0] = (UChar) 0;
  95             buf16->utf16_len = 0;
  96             buf16->utf16_cap = capacity;
  97         }
  98         else {
  99             if (buf16->utf16)
 100                 free(buf16->utf16);
 101             buf16->utf16 = 0;
 102             buf16->utf16_len = 0;
 103             buf16->utf16_cap = 0;
 104         }
 105     }
 106
 107     return buf16;
 108 };
 109
 110
 111 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
 112                                           struct icu_buf_utf16 * src16)
 113 {
 114     if(!dest16 || !src16
 115        || dest16 == src16)
 116         return 0;
 117
 118     if (dest16->utf16_cap < src16->utf16_len)
 119         icu_buf_utf16_resize(dest16, src16->utf16_len * 2);
 120
 121     u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len);
 122     dest16->utf16_len = src16->utf16_len;
 123
 124     return dest16;
 125 };
 126
 127
 128 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16)
 129 {
 130     if (buf16){
 131         if (buf16->utf16)
 132             free(buf16->utf16);
 133         free(buf16);
 134     }
 135 };
 136
 137
 138
 139
 140
 141
 142 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
 143 {
 144     struct icu_buf_utf8 * buf8
 145         = (struct icu_buf_utf8 *) malloc(sizeof(struct icu_buf_utf8));
 146
 147     buf8->utf8 = 0;
 148     buf8->utf8_len = 0;
 149     buf8->utf8_cap = 0;
 150
 151     if (capacity > 0){
 152         buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
 153         buf8->utf8[0] = (uint8_t) 0;
 154         buf8->utf8_cap = capacity;
 155     }
 156     return buf8;
 157 };
 158
 159
 160
 161 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
 162                                           size_t capacity)
 163 {
 164     if (buf8){
 165         if (capacity >  0){
 166             if (0 == buf8->utf8)
 167                 buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
 168             else
 169                 buf8->utf8
 170                     = (uint8_t *) realloc(buf8->utf8,
 171                                           sizeof(uint8_t) * capacity);
 172             buf8->utf8_cap = capacity;
 173         }
 174         else {
 175             if (buf8->utf8)
 176                 free(buf8->utf8);
 177             buf8->utf8 = 0;
 178             buf8->utf8_len = 0;
 179             buf8->utf8_cap = 0;
 180         }
 181     }
 182
 183     return buf8;
 184 };
 185
 186
 187 struct icu_buf_utf8 * icu_buf_utf8_copy(struct icu_buf_utf8 * dest8,
 188                                           struct icu_buf_utf8 * src8)
 189 {
 190     if(!dest8 || !src8
 191        || dest8 == src8)
 192         return 0;
 193
 194
 195     if (dest8->utf8_cap < src8->utf8_len)
 196         icu_buf_utf8_resize(dest8, src8->utf8_len * 2);
 197
 198     strncpy((char*) dest8->utf8, (char*) src8->utf8, src8->utf8_len);
 199
 200     return dest8;
 201 };
 202
 203
 204 const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8)
 205 {
 206     if (!src8 || src8->utf8_len == 0)
 207         return "";
 208     if (src8->utf8_len == src8->utf8_cap)
 209         src8 = icu_buf_utf8_resize(src8, src8->utf8_len * 2 + 1);
 210     src8->utf8[src8->utf8_len] = '\0';
 211     return (const char *) src8->utf8;
 212 }
 213
 214
 215 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8)
 216 {
 217     if (buf8){
 218         if (buf8->utf8)
 219             free(buf8->utf8);
 220         free(buf8);
 221     }
 222 };
 223
 224
 225
 226 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
 227                                struct icu_buf_utf8 * src8,
 228                                UErrorCode * status)
 229 {
 230     int32_t utf16_len = 0;
 231
 232     u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 233                   &utf16_len,
 234                   (const char *) src8->utf8, src8->utf8_len, status);
 235
 236     // check for buffer overflow, resize and retry
 237     if (*status == U_BUFFER_OVERFLOW_ERROR
 238         //|| dest16->utf16_len > dest16->utf16_cap
 239         ){
 240         icu_buf_utf16_resize(dest16, utf16_len * 2);
 241         *status = U_ZERO_ERROR;
 242         u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 243                       &utf16_len,
 244                       (const char *) src8->utf8, src8->utf8_len, status);
 245     }
 246
 247     //if (*status != U_BUFFER_OVERFLOW_ERROR
 248     if (U_SUCCESS(*status)
 249         && utf16_len <= dest16->utf16_cap)
 250         dest16->utf16_len = utf16_len;
 251     else {
 252         dest16->utf16[0] = (UChar) 0;
 253         dest16->utf16_len = 0;
 254     }
 255
 256     return *status;
 257 };
 258
 259
 260
 261 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
 262                                     const char * src8cstr,
 263                                     UErrorCode * status)
 264 {
 265     size_t src8cstr_len = 0;
 266     int32_t utf16_len = 0;
 267
 268     src8cstr_len = strlen(src8cstr);
 269
 270     u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 271                   &utf16_len,
 272                   src8cstr, src8cstr_len, status);
 273
 274     // check for buffer overflow, resize and retry
 275     if (*status == U_BUFFER_OVERFLOW_ERROR
 276         //|| dest16->utf16_len > dest16->utf16_cap
 277         ){
 278         icu_buf_utf16_resize(dest16, utf16_len * 2);
 279         *status = U_ZERO_ERROR;
 280         u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 281                       &utf16_len,
 282                       src8cstr, src8cstr_len, status);
 283     }
 284
 285     //  if (*status != U_BUFFER_OVERFLOW_ERROR
 286     if (U_SUCCESS(*status)
 287         && utf16_len <= dest16->utf16_cap)
 288         dest16->utf16_len = utf16_len;
 289     else {
 290         dest16->utf16[0] = (UChar) 0;
 291         dest16->utf16_len = 0;
 292     }
 293
 294     return *status;
 295 };
 296
 297
 298
 299
 300 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
 301                              struct icu_buf_utf16 * src16,
 302                              UErrorCode * status)
 303 {
 304     int32_t utf8_len = 0;
 305
 306     u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
 307                 &utf8_len,
 308                 src16->utf16, src16->utf16_len, status);
 309
 310     // check for buffer overflow, resize and retry
 311     if (*status == U_BUFFER_OVERFLOW_ERROR
 312         //|| dest8->utf8_len > dest8->utf8_cap
 313         ){
 314         icu_buf_utf8_resize(dest8, utf8_len * 2);
 315         *status = U_ZERO_ERROR;
 316         u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
 317                     &utf8_len,
 318                     src16->utf16, src16->utf16_len, status);
 319
 320     }
 321
 322     //if (*status != U_BUFFER_OVERFLOW_ERROR
 323     if (U_SUCCESS(*status)
 324         && utf8_len <= dest8->utf8_cap)
 325         dest8->utf8_len = utf8_len;
 326     else {
 327         dest8->utf8[0] = (uint8_t) 0;
 328         dest8->utf8_len = 0;
 329     }
 330
 331     return *status;
 332 };
 333
 334
 335
 336 struct icu_casemap * icu_casemap_create(const char *locale, char action,
 337                                         UErrorCode *status)
 338 {
 339     struct icu_casemap * casemap
 340         = (struct icu_casemap *) malloc(sizeof(struct icu_casemap));
 341     strcpy(casemap->locale, locale);
 342     casemap->action = action;
 343
 344     switch(casemap->action) {
 345     case 'l':
 346         break;
 347     case 'u':
 348         break;
 349     case 't':
 350         break;
 351     case 'f':
 352         break;
 353     default:
 354         icu_casemap_destroy(casemap);
 355         return 0;
 356     }
 357
 358     return casemap;
 359 };
 360
 361 void icu_casemap_destroy(struct icu_casemap * casemap)
 362 {
 363     if (casemap)
 364         free(casemap);
 365 };
 366
 367
 368 int icu_casemap_casemap(struct icu_casemap * casemap,
 369                         struct icu_buf_utf16 * dest16,
 370                         struct icu_buf_utf16 * src16,
 371                         UErrorCode *status)
 372 {
 373     if(!casemap)
 374         return 0;
 375
 376     return icu_utf16_casemap(dest16, src16,
 377                              casemap->locale, casemap->action, status);
 378 };
 379
 380
 381 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
 382                       struct icu_buf_utf16 * src16,
 383                       const char *locale, char action,
 384                       UErrorCode *status)
 385 {
 386     int32_t dest16_len = 0;
 387
 388     switch(action) {
 389     case 'l':
 390         dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 391                                   src16->utf16, src16->utf16_len,
 392                                   locale, status);
 393         break;
 394     case 'u':
 395         dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 396                                   src16->utf16, src16->utf16_len,
 397                                   locale, status);
 398         break;
 399     case 't':
 400         dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 401                                   src16->utf16, src16->utf16_len,
 402                                   0, locale, status);
 403         break;
 404     case 'f':
 405         dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 406                                    src16->utf16, src16->utf16_len,
 407                                    U_FOLD_CASE_DEFAULT, status);
 408         break;
 409
 410     default:
 411         return U_UNSUPPORTED_ERROR;
 412         break;
 413     }
 414
 415     // check for buffer overflow, resize and retry
 416     if (*status == U_BUFFER_OVERFLOW_ERROR
 417         && dest16 != src16        // do not resize if in-place conversion
 418         //|| dest16_len > dest16->utf16_cap
 419         ){
 420         icu_buf_utf16_resize(dest16, dest16_len * 2);
 421         *status = U_ZERO_ERROR;
 422
 423
 424         switch(action) {
 425         case 'l':
 426             dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 427                                       src16->utf16, src16->utf16_len,
 428                                       locale, status);
 429             break;
 430         case 'u':
 431             dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 432                                       src16->utf16, src16->utf16_len,
 433                                       locale, status);
 434             break;
 435         case 't':
 436             dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 437                                       src16->utf16, src16->utf16_len,
 438                                       0, locale, status);
 439             break;
 440         case 'f':
 441             dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 442                                        src16->utf16, src16->utf16_len,
 443                                        U_FOLD_CASE_DEFAULT, status);
 444             break;
 445
 446         default:
 447             return U_UNSUPPORTED_ERROR;
 448             break;
 449         }
 450     }
 451
 452     if (U_SUCCESS(*status)
 453         && dest16_len <= dest16->utf16_cap)
 454         dest16->utf16_len = dest16_len;
 455     else {
 456         dest16->utf16[0] = (UChar) 0;
 457         dest16->utf16_len = 0;
 458     }
 459
 460     return *status;
 461 };
 462
 463
 464
 465 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
 466                                    struct icu_buf_utf8 * dest8,
 467                                    struct icu_buf_utf16 * src16,
 468                                    UErrorCode * status)
 469 {
 470
 471     int32_t sortkey_len = 0;
 472
 473     sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
 474                                   dest8->utf8, dest8->utf8_cap);
 475
 476     // check for buffer overflow, resize and retry
 477     if (sortkey_len > dest8->utf8_cap) {
 478         icu_buf_utf8_resize(dest8, sortkey_len * 2);
 479         sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
 480                                       dest8->utf8, dest8->utf8_cap);
 481     }
 482
 483     if (U_SUCCESS(*status)
 484         && sortkey_len > 0)
 485         dest8->utf8_len = sortkey_len;
 486     else {
 487         dest8->utf8[0] = (UChar) 0;
 488         dest8->utf8_len = 0;
 489     }
 490
 491     return sortkey_len;
 492 };
 493
 494
 495
 496 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
 497                                             UErrorCode *status)
 498 {
 499     struct icu_tokenizer * tokenizer
 500         = (struct icu_tokenizer *) malloc(sizeof(struct icu_tokenizer));
 501
 502     strcpy(tokenizer->locale, locale);
 503     tokenizer->action = action;
 504     tokenizer->bi = 0;
 505     tokenizer->buf16 = 0;
 506     tokenizer->token_count = 0;
 507     tokenizer->token_id = 0;
 508     tokenizer->token_start = 0;
 509     tokenizer->token_end = 0;
 510
 511
 512     switch(tokenizer->action) {
 513     case 'l':
 514         tokenizer->bi
 515             = ubrk_open(UBRK_LINE, tokenizer->locale,
 516                         0, 0, status);
 517         break;
 518     case 's':
 519         tokenizer->bi
 520             = ubrk_open(UBRK_SENTENCE, tokenizer->locale,
 521                         0, 0, status);
 522         break;
 523     case 'w':
 524         tokenizer->bi
 525             = ubrk_open(UBRK_WORD, tokenizer->locale,
 526                         0, 0, status);
 527         break;
 528     case 'c':
 529         tokenizer->bi
 530             = ubrk_open(UBRK_CHARACTER, tokenizer->locale,
 531                         0, 0, status);
 532         break;
 533     case 't':
 534         tokenizer->bi
 535             = ubrk_open(UBRK_TITLE, tokenizer->locale,
 536                         0, 0, status);
 537         break;
 538     default:
 539         *status = U_UNSUPPORTED_ERROR;
 540         return 0;
 541         break;
 542     }
 543
 544     // ICU error stuff is a very  funny business
 545     if (U_SUCCESS(*status))
 546         return tokenizer;
 547
 548     // freeing if failed
 549     icu_tokenizer_destroy(tokenizer);
 550     return 0;
 551 };
 552
 553 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
 554 {
 555     if (tokenizer) {
 556         if (tokenizer->bi)
 557             ubrk_close(tokenizer->bi);
 558         free(tokenizer);
 559     }
 560 };
 561
 562 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
 563                          struct icu_buf_utf16 * src16,
 564                          UErrorCode *status)
 565 {
 566     if (!tokenizer || !tokenizer->bi || !src16)
 567         return 0;
 568
 569
 570     tokenizer->buf16 = src16;
 571     tokenizer->token_count = 0;
 572     tokenizer->token_id = 0;
 573     tokenizer->token_start = 0;
 574     tokenizer->token_end = 0;
 575
 576     ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
 577
 578
 579     if (U_FAILURE(*status))
 580         return 0;
 581
 582     return 1;
 583 };
 584
 585 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
 586                          struct icu_buf_utf16 * tkn16,
 587                          UErrorCode *status)
 588 {
 589     int32_t tkn_start = 0;
 590     int32_t tkn_end = 0;
 591     int32_t tkn_len = 0;
 592
 593
 594     if (!tokenizer || !tokenizer->bi
 595         || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
 596         return 0;
 597
 598     // never change tokenizer->buf16 and keep always invariant
 599     // 0 <= tokenizer->token_start
 600     //   <= tokenizer->token_end
 601     //   <= tokenizer->buf16->utf16_len
 602     // returns length of token
 603
 604     if (0 == tokenizer->token_end) // first call
 605         tkn_start = ubrk_first(tokenizer->bi);
 606     else //successive calls
 607         tkn_start = tokenizer->token_end;
 608
 609     // get next position
 610     tkn_end = ubrk_next(tokenizer->bi);
 611
 612     // repairing invariant at end of ubrk, which is UBRK_DONE = -1
 613     if (UBRK_DONE == tkn_end)
 614         tkn_end = tokenizer->buf16->utf16_len;
 615
 616     // copy out if everything is well
 617     if(U_FAILURE(*status))
 618         return 0;
 619
 620     // everything OK, now update internal state
 621     tkn_len = tkn_end - tkn_start;
 622
 623     if (0 < tkn_len){
 624         tokenizer->token_count++;
 625         tokenizer->token_id++;
 626     } else {
 627         tokenizer->token_id = 0;
 628     }
 629     tokenizer->token_start = tkn_start;
 630     tokenizer->token_end = tkn_end;
 631
 632
 633     // copying into token buffer if it exists
 634     if (tkn16){
 635         if (tkn16->utf16_cap < tkn_len)
 636             icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
 637
 638         u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
 639                   tkn_len);
 640
 641         tkn16->utf16_len = tkn_len;
 642     }
 643
 644     return tkn_len;
 645 }
 646
 647
 648 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer)
 649 {
 650     return tokenizer->token_id;
 651 };
 652
 653 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer)
 654 {
 655     return tokenizer->token_start;
 656 };
 657
 658 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer)
 659 {
 660     return tokenizer->token_end;
 661 };
 662
 663 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer)
 664 {
 665     return (tokenizer->token_end - tokenizer->token_start);
 666 };
 667
 668 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
 669 {
 670     return tokenizer->token_count;
 671 };
 672
 673
 674
 675 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
 676                                               UErrorCode *status)
 677 {
 678
 679     struct icu_normalizer * normalizer
 680         = (struct icu_normalizer *) malloc(sizeof(struct icu_normalizer));
 681
 682     normalizer->action = action;
 683     normalizer->trans = 0;
 684     normalizer->rules16 =  icu_buf_utf16_create(0);
 685     icu_utf16_from_utf8_cstr(normalizer->rules16, rules, status);
 686
 687     switch(normalizer->action) {
 688     case 'f':
 689         normalizer->trans
 690             = utrans_openU(normalizer->rules16->utf16,
 691                            normalizer->rules16->utf16_len,
 692                            UTRANS_FORWARD,
 693                            0, 0,
 694                            normalizer->parse_error, status);
 695         // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans);
 696         break;
 697     case 'r':
 698         normalizer->trans
 699             = utrans_openU(normalizer->rules16->utf16,
 700                            normalizer->rules16->utf16_len,
 701                            UTRANS_REVERSE ,
 702                            0, 0,
 703                            normalizer->parse_error, status);
 704         // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans);
 705         break;
 706     default:
 707         *status = U_UNSUPPORTED_ERROR;
 708         return 0;
 709         break;
 710     }
 711
 712     if (U_SUCCESS(*status))
 713         return normalizer;
 714
 715     // freeing if failed
 716     icu_normalizer_destroy(normalizer);
 717     return 0;
 718 };
 719
 720
 721 void icu_normalizer_destroy(struct icu_normalizer * normalizer){
 722     if (normalizer) {
 723         if (normalizer->rules16)
 724             icu_buf_utf16_destroy(normalizer->rules16);
 725         if (normalizer->trans)
 726         {
 727             // yaz_log(YLOG_LOG, "utrans_close %p", normalizer->trans);
 728             utrans_close(normalizer->trans);
 729         }
 730         free(normalizer);
 731     }
 732 };
 733
 734
 735
 736 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
 737                              struct icu_buf_utf16 * dest16,
 738                              struct icu_buf_utf16 * src16,
 739                              UErrorCode *status)
 740 {
 741     if (!normalizer || !normalizer->trans || !src16 || !dest16)
 742         return 0;
 743
 744     if (!icu_buf_utf16_copy(dest16, src16))
 745         return 0;
 746
 747     utrans_transUChars (normalizer->trans,
 748                         dest16->utf16, &(dest16->utf16_len),
 749                         dest16->utf16_cap,
 750                         0, &(src16->utf16_len), status);
 751
 752     if (U_FAILURE(*status)){
 753         dest16->utf16[0] = (UChar) 0;
 754         dest16->utf16_len = 0;
 755     }
 756
 757     return dest16->utf16_len;
 758 }
 759
 760
 761
 762
 763 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
 764                                               enum icu_chain_step_type type,
 765                                               const uint8_t * rule,
 766                                               struct icu_buf_utf16 * buf16,
 767                                               UErrorCode *status)
 768 {
 769     struct icu_chain_step * step = 0;
 770
 771     if(!chain || !type || !rule)
 772         return 0;
 773
 774     step = (struct icu_chain_step *) malloc(sizeof(struct icu_chain_step));
 775
 776     step->type = type;
 777
 778     step->buf16 = buf16;
 779
 780     // create auxilary objects
 781     switch(step->type) {
 782     case ICU_chain_step_type_display:
 783         break;
 784     case ICU_chain_step_type_index:
 785         break;
 786     case ICU_chain_step_type_sortkey:
 787         break;
 788     case ICU_chain_step_type_casemap:
 789         step->u.casemap = icu_casemap_create((char *) chain->locale,
 790                                              (char) rule[0], status);
 791         break;
 792     case ICU_chain_step_type_normalize:
 793         step->u.normalizer = icu_normalizer_create((char *) rule, 'f', status);
 794         break;
 795     case ICU_chain_step_type_tokenize:
 796         step->u.tokenizer = icu_tokenizer_create((char *) chain->locale,
 797                                                  (char) rule[0], status);
 798         break;
 799     default:
 800         break;
 801     }
 802
 803     return step;
 804 };
 805
 806
 807 void icu_chain_step_destroy(struct icu_chain_step * step){
 808
 809     if (!step)
 810         return;
 811
 812     icu_chain_step_destroy(step->previous);
 813
 814     switch(step->type) {
 815     case ICU_chain_step_type_display:
 816         break;
 817     case ICU_chain_step_type_index:
 818         break;
 819     case ICU_chain_step_type_sortkey:
 820         break;
 821     case ICU_chain_step_type_casemap:
 822         icu_casemap_destroy(step->u.casemap);
 823         icu_buf_utf16_destroy(step->buf16);
 824         break;
 825     case ICU_chain_step_type_normalize:
 826         icu_normalizer_destroy(step->u.normalizer);
 827         icu_buf_utf16_destroy(step->buf16);
 828         break;
 829     case ICU_chain_step_type_tokenize:
 830         icu_tokenizer_destroy(step->u.tokenizer);
 831         icu_buf_utf16_destroy(step->buf16);
 832         break;
 833     default:
 834         break;
 835     }
 836     free(step);
 837 };
 838
 839
 840
 841 struct icu_chain * icu_chain_create(const uint8_t * identifier,
 842                                     const uint8_t * locale)
 843 {
 844
 845     struct icu_chain * chain
 846         = (struct icu_chain *) malloc(sizeof(struct icu_chain));
 847
 848     strncpy((char *) chain->identifier, (const char *) identifier, 128);
 849     chain->identifier[128 - 1] = '\0';
 850     strncpy((char *) chain->locale, (const char *) locale, 16);
 851     chain->locale[16 - 1] = '\0';
 852
 853     chain->token_count = 0;
 854
 855     chain->display8 = icu_buf_utf8_create(0);
 856     chain->norm8 = icu_buf_utf8_create(0);
 857     chain->sort8 = icu_buf_utf8_create(0);
 858
 859     chain->src16 = icu_buf_utf16_create(0);
 860
 861     chain->steps = 0;
 862
 863     return chain;
 864 };
 865
 866
 867 void icu_chain_destroy(struct icu_chain * chain)
 868 {
 869     if (chain){
 870         icu_buf_utf8_destroy(chain->display8);
 871         icu_buf_utf8_destroy(chain->norm8);
 872         icu_buf_utf8_destroy(chain->sort8);
 873
 874         icu_buf_utf16_destroy(chain->src16);
 875
 876         icu_chain_step_destroy(chain->steps);
 877         free(chain);
 878     }
 879 };
 880
 881
 882
 883 struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
 884                                         UErrorCode * status){
 885
 886     xmlNode *node = 0;
 887     struct icu_chain * chain = 0;
 888     xmlChar *xml_id = 0;
 889     xmlChar *xml_locale = 0;
 890
 891     if (!xml_node
 892         ||xml_node->type != XML_ELEMENT_NODE
 893         || strcmp((const char *) xml_node->name, "icu_chain"))
 894
 895         return 0;
 896
 897     xml_id = xmlGetProp(xml_node, (xmlChar *) "id");
 898     xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale");
 899
 900     if (!xml_id || !strlen((const char *) xml_id)
 901         || !xml_locale || !strlen((const char *) xml_locale))
 902         return 0;
 903
 904     chain = icu_chain_create((const uint8_t *) xml_id,
 905                              (const uint8_t *) xml_locale);
 906
 907     xmlFree(xml_id);
 908     xmlFree(xml_locale);
 909     if (!chain)
 910         return 0;
 911
 912     for (node = xml_node->children; node; node = node->next)
 913     {
 914         xmlChar *xml_rule = 0;
 915         struct icu_chain_step * step = 0;
 916         if (node->type != XML_ELEMENT_NODE)
 917             continue;
 918
 919         xml_rule = xmlGetProp(node, (xmlChar *) "rule");
 920
 921         if (!strcmp((const char *) node->name,
 922                     (const char *) "casemap")){
 923             step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
 924                                          (const uint8_t *) xml_rule, status);
 925         }
 926         else if (!strcmp((const char *) node->name,
 927                          (const char *) "normalize")){
 928             step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
 929                                          (const uint8_t *) xml_rule, status);
 930         }
 931         else if (!strcmp((const char *) node->name,
 932                          (const char *) "tokenize")){
 933             step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
 934                                          (const uint8_t *) xml_rule, status);
 935         }
 936         else if (!strcmp((const char *) node->name,
 937                          (const char *) "display")){
 938             step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
 939                                          (const uint8_t *) "", status);
 940         }
 941         else if (!strcmp((const char *) node->name,
 942                          (const char *) "index")){
 943             step = icu_chain_insert_step(chain, ICU_chain_step_type_index,
 944                                          (const uint8_t *) "", status);
 945         }
 946         else if (!strcmp((const char *) node->name,
 947                          (const char *) "sortkey")){
 948             step = icu_chain_insert_step(chain, ICU_chain_step_type_sortkey,
 949                                          (const uint8_t *) "", status);
 950         }
 951
 952         xmlFree(xml_rule);
 953         if (!step || U_FAILURE(*status)){
 954             icu_chain_destroy(chain);
 955             return 0;
 956         }
 957
 958
 959     }
 960
 961     return chain;
 962 };
 963
 964
 965
 966 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
 967                                               enum icu_chain_step_type type,
 968                                               const uint8_t * rule,
 969                                               UErrorCode *status)
 970 {
 971     struct icu_chain_step * step = 0;
 972     struct icu_buf_utf16 * src16 = 0;
 973     struct icu_buf_utf16 * buf16 = 0;
 974
 975     if (!chain || !type || !rule)
 976         return 0;
 977
 978     // assign utf16 src buffers as needed
 979     if (chain->steps && chain->steps->buf16)
 980         src16 = chain->steps->buf16;
 981     else if (chain->src16)
 982         src16 = chain->src16;
 983     else
 984         return 0;
 985
 986
 987     // create utf16 destination buffers as needed, or
 988     switch(type) {
 989     case ICU_chain_step_type_display:
 990         buf16 = src16;
 991         break;
 992     case ICU_chain_step_type_index:
 993         buf16 = src16;
 994         break;
 995     case ICU_chain_step_type_sortkey:
 996         buf16 = src16;
 997         break;
 998     case ICU_chain_step_type_casemap:
 999         buf16 = icu_buf_utf16_create(0);
1000         break;
1001     case ICU_chain_step_type_normalize:
1002         buf16 = icu_buf_utf16_create(0);
1003         break;
1004     case ICU_chain_step_type_tokenize:
1005         buf16 = icu_buf_utf16_create(0);
1006         break;
1007     default:
1008         break;
1009     }
1010
1011     // create actual chain step with this buffer
1012     step = icu_chain_step_create(chain, type, rule, buf16, status);
1013
1014     step->previous = chain->steps;
1015     chain->steps = step;
1016
1017     return step;
1018 };
1019
1020
1021 int icu_chain_step_next_token(struct icu_chain * chain,
1022                               struct icu_chain_step * step,
1023                               UErrorCode *status)
1024 {
1025     struct icu_buf_utf16 * src16 = 0;
1026
1027     //printf("icu_chain_step_next_token %d\n", (int) step);
1028
1029     if (!chain || !chain->src16 || !step || !step->more_tokens)
1030         return 0;
1031
1032     // assign utf16 src buffers as neeed, advance in previous steps
1033     // tokens until non-zero token met, and setting stop condition
1034     if (step->previous){
1035         src16 = step->previous->buf16;
1036         if (step->need_new_token)
1037             //while (step->more_tokens &&  !src16->utf16_len)
1038                 step->more_tokens
1039                     = icu_chain_step_next_token(chain, step->previous, status);
1040     }
1041     else { // first step can only work once on chain->src16 input buffer
1042         src16 = chain->src16;
1043         step->more_tokens = 1;
1044     }
1045
1046     // stop if nothing to process
1047     // i.e new token source was not properly assigned
1048     if (!step->more_tokens || !src16) // || !src16->utf16_len
1049         return 0;
1050
1051     //printf("icu_chain_step_next_token %d working\n", (int) step);
1052
1053
1054     // perform the work, eventually put this steps output in
1055     // step->buf16 or the chains UTF8 output buffers
1056     switch(step->type) {
1057     case ICU_chain_step_type_display:
1058         icu_utf16_to_utf8(chain->display8, src16, status);
1059         break;
1060     case ICU_chain_step_type_index:
1061         icu_utf16_to_utf8(chain->norm8, src16, status);
1062         break;
1063     case ICU_chain_step_type_sortkey:
1064         icu_utf16_to_utf8(chain->sort8, src16, status);
1065         break;
1066     case ICU_chain_step_type_casemap:
1067         icu_casemap_casemap(step->u.casemap,
1068                             step->buf16, src16, status);
1069         break;
1070     case ICU_chain_step_type_normalize:
1071         icu_normalizer_normalize(step->u.normalizer,
1072                                  step->buf16, src16, status);
1073         break;
1074     case ICU_chain_step_type_tokenize:
1075         // attach to new src16 token only first time during splitting
1076         if (step->need_new_token){
1077             icu_tokenizer_attach(step->u.tokenizer, src16, status);
1078             step->need_new_token = 0;
1079         }
1080         // splitting one src16 token into multiple buf16 tokens
1081         step->more_tokens
1082             = icu_tokenizer_next_token(step->u.tokenizer,
1083                                        step->buf16, status);
1084         // make sure to get new previous token if this one had been used up
1085         if (step->previous && !step->more_tokens){
1086             if (icu_chain_step_next_token(chain, step->previous, status)){
1087                 icu_tokenizer_attach(step->u.tokenizer, src16, status);
1088                 step->need_new_token = 0;
1089                 step->more_tokens
1090                     = icu_tokenizer_next_token(step->u.tokenizer,
1091                                                step->buf16, status);
1092             }
1093         }
1094         if (0 == step->more_tokens)
1095             return 0;
1096         break;
1097     default:
1098         return 0;
1099         break;
1100     }
1101
1102
1103
1104     // stop further token processing if last step and
1105     // new tokens are needed from previous (non-existing) step
1106     if (!step->previous && step->need_new_token)
1107         step->more_tokens = 0;
1108
1109     //printf("%d %d %d\n",
1110     //       step->more_tokens, src16->utf16_len, step->buf16->utf16_len);
1111
1112
1113     if (U_FAILURE(*status))
1114         return 0;
1115
1116     return 1;
1117 };
1118
1119
1120
1121 int icu_chain_assign_cstr(struct icu_chain * chain,
1122                           const char * src8cstr,
1123                           UErrorCode *status)
1124 {
1125     struct icu_chain_step * stp = 0;
1126
1127     if (!chain || !src8cstr)
1128         return 0;
1129
1130     stp = chain->steps;
1131
1132     // clear token count
1133     chain->token_count = 0;
1134
1135     // clear all steps stop states
1136
1137     while (stp){
1138         stp->more_tokens = 1;
1139         stp->need_new_token = 1;
1140         stp = stp->previous;
1141     }
1142
1143     // finally convert UTF8 to UTF16 string
1144     icu_utf16_from_utf8_cstr(chain->src16, src8cstr, status);
1145
1146     if (U_FAILURE(*status))
1147         return 0;
1148
1149     return 1;
1150 };
1151
1152
1153
1154 int icu_chain_next_token(struct icu_chain * chain,
1155                          UErrorCode *status)
1156 {
1157     int success = 0;
1158
1159     if (!chain || !chain->steps)
1160         return 0;
1161
1162     success = icu_chain_step_next_token(chain, chain->steps, status);
1163
1164     if (success){
1165         chain->token_count++;
1166         return chain->token_count;
1167     }
1168
1169     return 0;
1170 };
1171
1172 int icu_chain_get_token_count(struct icu_chain * chain)
1173 {
1174     if (!chain)
1175         return 0;
1176
1177     return chain->token_count;
1178 };
1179
1180
1181
1182 const char * icu_chain_get_display(struct icu_chain * chain)
1183 {
1184     if (chain->display8)
1185         return icu_buf_utf8_to_cstr(chain->display8);
1186
1187     return 0;
1188 };
1189
1190 const char * icu_chain_get_norm(struct icu_chain * chain)
1191 {
1192     if (chain->norm8)
1193         return icu_buf_utf8_to_cstr(chain->norm8);
1194
1195     return 0;
1196 };
1197
1198 const char * icu_chain_get_sort(struct icu_chain * chain)
1199 {
1200     if (chain->sort8)
1201         return icu_buf_utf8_to_cstr(chain->sort8);
1202
1203     return 0;
1204 };
1205
1206
1207
1208
1209 #endif // HAVE_ICU
1210
1211
1212
1213
1214 /*
1215  * Local variables:
1216  * c-basic-offset: 4
1217  * indent-tabs-mode: nil
1218  * End:
1219  * vim: shiftwidth=4 tabstop=8 expandtab
1220  */