1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) 1995-2009 Index Data
3 * See the file LICENSE for details.
8 * \brief ICU tokenization - using ubrk_-functions from ICU
16 #include <yaz/xmalloc.h>
18 #include <yaz/icu_I18N.h>
26 #include <unicode/ustring.h> /* some more string fcns*/
27 #include <unicode/uchar.h> /* char names */
/*
 * icu_tokenizer_create: allocate a tokenizer with xmalloc() and open an
 * ICU break iterator for it.
 *
 * locale - passed unchanged to ubrk_open().
 * action - stored in the tokenizer and used by the switch below to pick
 *          the iterator kind (line, sentence, word, character or title).
 * status - dereferenced as an ICU error code: handed to ubrk_open(),
 *          set to U_UNSUPPORTED_ERROR on one switch path, and tested
 *          with U_SUCCESS() afterwards.
 *
 * NOTE(review): the case labels, the return statements and the
 * declaration of `status` are not visible in this listing. Presumably
 * the new tokenizer is returned when U_SUCCESS(*status) holds and a
 * null pointer otherwise -- confirm against the complete file.
 */
29 struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
32 struct icu_tokenizer * tokenizer
33 = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
35 tokenizer->action = action;
/* All token bookkeeping starts at zero. */
38 tokenizer->token_count = 0;
39 tokenizer->token_id = 0;
40 tokenizer->token_start = 0;
41 tokenizer->token_end = 0;
/* Open the break iterator matching the requested action; every variant
   passes the same locale, no initial text (0, 0) and the caller's status. */
43 switch (tokenizer->action)
47 tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
51 tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status);
55 tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status);
59 tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status);
63 tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status);
/* NOTE(review): presumably the default branch -- an action that matches
   none of the iterator kinds is reported as unsupported. */
66 *status = U_UNSUPPORTED_ERROR;
71 /* ICU error stuff is a very funny business */
72 if (U_SUCCESS(*status))
/* Failure path: the tokenizer built above is handed to
   icu_tokenizer_destroy() instead of being given to the caller. */
75 /* freeing if failed */
76 icu_tokenizer_destroy(tokenizer);
/*
 * icu_tokenizer_destroy: tear down a tokenizer.
 *
 * Closes the tokenizer's break iterator (tokenizer->bi) with ubrk_close().
 *
 * NOTE(review): any null checks and the release of the struct itself are
 * not visible in this listing -- confirm against the complete file.
 */
80 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
85 ubrk_close(tokenizer->bi);
/*
 * icu_tokenizer_attach: make src16 the text the tokenizer iterates over.
 *
 * tokenizer - tokenizer whose break iterator (bi) receives the text.
 * src16     - UTF-16 buffer; the tokenizer stores this pointer in buf16
 *             (it does not copy the buffer) and passes its utf16 /
 *             utf16_len members to ubrk_setText().
 * status    - dereferenced as an ICU error code, written by
 *             ubrk_setText() and tested with U_FAILURE() afterwards.
 *
 * NOTE(review): the return statements and the declaration of `status`
 * are not visible in this listing; the int result presumably signals
 * success/failure -- confirm the exact values against the complete file.
 */
90 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
91 struct icu_buf_utf16 * src16,
/* Guard: nothing can be attached without a tokenizer, an open break
   iterator and a source buffer. */
94 if (!tokenizer || !tokenizer->bi || !src16)
/* Attaching new text restarts tokenization: all counters and positions
   go back to zero. */
97 tokenizer->buf16 = src16;
98 tokenizer->token_count = 0;
99 tokenizer->token_id = 0;
100 tokenizer->token_start = 0;
101 tokenizer->token_end = 0;
103 ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
105 if (U_FAILURE(*status))
/*
 * icu_tokenizer_next_token: advance the break iterator by one boundary
 * and expose the text between the previous and the new boundary.
 *
 * tokenizer - tokenizer previously given a buffer (buf16) to iterate.
 * tkn16     - destination UTF-16 buffer; it is resized when its capacity
 *             is smaller than the token, then filled via u_strncpy() and
 *             its utf16_len is set to the token length.
 * status    - dereferenced as an ICU error code and tested with
 *             U_FAILURE() before any tokenizer state is updated.
 *
 * The token length is computed as tkn_end - tkn_start, and the
 * tokenizer's token_start / token_end are updated to those positions.
 *
 * NOTE(review): the return statements, the declarations of tkn_end,
 * tkn_len and `status`, and the conditions guarding the counter updates
 * and the copy are not visible in this listing -- confirm against the
 * complete file.
 */
111 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
112 struct icu_buf_utf16 * tkn16,
115 int32_t tkn_start = 0;
/* Guard: requires a tokenizer, an open break iterator and a non-empty
   attached buffer. The five plain-text lines that follow the guard read
   as the body of a comment whose delimiters are not shown here. */
119 if (!tokenizer || !tokenizer->bi
120 || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
123 never change tokenizer->buf16 and keep always invariant
124 0 <= tokenizer->token_start
125 <= tokenizer->token_end
126 <= tokenizer->buf16->utf16_len
127 returns length of token
/* A token_end of 0 is treated as "no token produced yet": the start then
   comes from ubrk_first(); otherwise the new token begins where the
   previous one ended. */
130 if (0 == tokenizer->token_end) /* first call */
131 tkn_start = ubrk_first(tokenizer->bi);
132 else /* successive calls */
133 tkn_start = tokenizer->token_end;
135 /* get next position */
136 tkn_end = ubrk_next(tokenizer->bi);
138 /* repairing invariant at end of ubrk, which is UBRK_DONE = -1 */
139 if (UBRK_DONE == tkn_end)
140 tkn_end = tokenizer->buf16->utf16_len;
/* NOTE(review): the statement executed when *status reports a failure is
   not visible here; presumably an early return -- confirm. */
142 /* copy out if everything is well */
143 if (U_FAILURE(*status))
146 /* everything OK, now update internal state */
147 tkn_len = tkn_end - tkn_start;
/* NOTE(review): the condition separating the two increments from the
   token_id reset is not visible in this listing -- confirm. */
151 tokenizer->token_count++;
152 tokenizer->token_id++;
154 tokenizer->token_id = 0;
156 tokenizer->token_start = tkn_start;
157 tokenizer->token_end = tkn_end;
159 /* copying into token buffer if it exists */
/* Grow the destination when its capacity is below the token length; the
   size requested is twice the token length. */
161 if (tkn16->utf16_cap < tkn_len)
162 icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
/* Copy starts at offset tkn_start of the attached buffer. The length
   argument of this call is on a line not shown here. */
164 u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
167 tkn16->utf16_len = tkn_len;
/*
 * icu_tokenizer_token_count: accessor for the tokenizer's token_count
 * field.
 *
 * tokenizer - dereferenced directly by the return statement, so it must
 *             not be null.
 *
 * Returns the current value of tokenizer->token_count.
 */
173 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
175 return tokenizer->token_count;
178 #endif /* YAZ_HAVE_ICU */
183 * c-file-style: "Stroustrup"
184 * indent-tabs-mode: nil
186 * vim: shiftwidth=4 tabstop=8 expandtab