src/icu_tokenizer.c

   1 /* This file is part of the YAZ toolkit.
   2  * Copyright (C) 1995-2009 Index Data
   3  * See the file LICENSE for details.
   4  */
   5
   6 /**
   7  * \file
   8  * \brief ICU tokenization - using ubrk_-functions from ICU
   9  */
  10
  11 #if HAVE_CONFIG_H
  12 #include "config.h"
  13 #endif
  14
  15 #if YAZ_HAVE_ICU
  16 #include <yaz/xmalloc.h>
  17
  18 #include <yaz/icu_I18N.h>
  19
  20 #include <yaz/log.h>
  21
  22 #include <string.h>
  23 #include <stdlib.h>
  24 #include <stdio.h>
  25
  26 #include <unicode/ustring.h>  /* some more string fcns*/
  27 #include <unicode/uchar.h>    /* char names           */
  28
  29 struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
  30                                            UErrorCode *status)
  31 {
  32     struct icu_tokenizer * tokenizer
  33         = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
  34
  35     tokenizer->action = action;
  36     tokenizer->bi = 0;
  37     tokenizer->buf16 = 0;
  38     tokenizer->token_count = 0;
  39     tokenizer->token_id = 0;
  40     tokenizer->token_start = 0;
  41     tokenizer->token_end = 0;
  42
  43     switch (tokenizer->action)
  44     {
  45     case 'l':
  46     case 'L':
  47         tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
  48         break;
  49     case 's':
  50     case 'S':
  51         tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status);
  52         break;
  53     case 'w':
  54     case 'W':
  55         tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status);
  56         break;
  57     case 'c':
  58     case 'C':
  59         tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status);
  60         break;
  61     case 't':
  62     case 'T':
  63         tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status);
  64         break;
  65     default:
  66         *status = U_UNSUPPORTED_ERROR;
  67         return 0;
  68         break;
  69     }
  70
  71     /* ICU error stuff is a very  funny business */
  72     if (U_SUCCESS(*status))
  73         return tokenizer;
  74
  75     /* freeing if failed */
  76     icu_tokenizer_destroy(tokenizer);
  77     return 0;
  78 }
  79
  80 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
  81 {
  82     if (tokenizer)
  83     {
  84         if (tokenizer->bi)
  85             ubrk_close(tokenizer->bi);
  86         xfree(tokenizer);
  87     }
  88 }
  89
  90 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
  91                          struct icu_buf_utf16 * src16,
  92                          UErrorCode *status)
  93 {
  94     if (!tokenizer || !tokenizer->bi || !src16)
  95         return 0;
  96
  97     tokenizer->buf16 = src16;
  98     tokenizer->token_count = 0;
  99     tokenizer->token_id = 0;
 100     tokenizer->token_start = 0;
 101     tokenizer->token_end = 0;
 102
 103     ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
 104
 105     if (U_FAILURE(*status))
 106         return 0;
 107
 108     return 1;
 109 }
 110
 111 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
 112                                  struct icu_buf_utf16 * tkn16,
 113                                  UErrorCode *status)
 114 {
 115     int32_t tkn_start = 0;
 116     int32_t tkn_end = 0;
 117     int32_t tkn_len = 0;
 118
 119     if (!tokenizer || !tokenizer->bi
 120         || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
 121         return 0;
 122     /*
 123     never change tokenizer->buf16 and keep always invariant
 124     0 <= tokenizer->token_start
 125        <= tokenizer->token_end
 126        <= tokenizer->buf16->utf16_len
 127     returns length of token
 128     */
 129
 130     if (0 == tokenizer->token_end) /* first call */
 131         tkn_start = ubrk_first(tokenizer->bi);
 132     else /* successive calls */
 133         tkn_start = tokenizer->token_end;
 134
 135     /* get next position */
 136     tkn_end = ubrk_next(tokenizer->bi);
 137
 138     /* repairing invariant at end of ubrk, which is UBRK_DONE = -1 */
 139     if (UBRK_DONE == tkn_end)
 140         tkn_end = tokenizer->buf16->utf16_len;
 141
 142     /* copy out if everything is well */
 143     if (U_FAILURE(*status))
 144         return 0;
 145
 146     /* everything OK, now update internal state */
 147     tkn_len = tkn_end - tkn_start;
 148
 149     if (0 < tkn_len)
 150     {
 151         tokenizer->token_count++;
 152         tokenizer->token_id++;
 153     } else {
 154         tokenizer->token_id = 0;
 155     }
 156     tokenizer->token_start = tkn_start;
 157     tokenizer->token_end = tkn_end;
 158
 159     /* copying into token buffer if it exists */
 160     if (tkn16){
 161         if (tkn16->utf16_cap < tkn_len)
 162             icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
 163
 164         u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
 165                   tkn_len);
 166
 167         tkn16->utf16_len = tkn_len;
 168     }
 169
 170     return tkn_len;
 171 }
 172
 173 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
 174 {
 175     return tokenizer->token_count;
 176 }
 177
 178 #endif /* YAZ_HAVE_ICU */
 179
 180 /*
 181  * Local variables:
 182  * c-basic-offset: 4
 183  * c-file-style: "Stroustrup"
 184  * indent-tabs-mode: nil
 185  * End:
 186  * vim: shiftwidth=4 tabstop=8 expandtab
 187  */
 188