-/* $Id: icu_I18N.c,v 1.7 2007-05-07 12:52:04 marc Exp $
+/* $Id: icu_I18N.c,v 1.8 2007-05-09 14:01:21 marc Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
int icu_check_status (UErrorCode status)
{
- //if(U_FAILURE(status))
- if(!U_SUCCESS(status))
+ if(U_FAILURE(status)){ // only statuses that U_FAILURE() reports are logged
yaz_log(YLOG_WARN,
"ICU: %d %s\n", status, u_errorName(status));
- return status;
+ return 0; // failure: the status has just been logged as a warning
+ }
+ return 1; // status is not a failure, nothing was logged
+
}
buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
else
buf8->utf8
- = (uint8_t *) realloc(buf8->utf8, sizeof(uint8_t) * capacity);
+ = (uint8_t *) realloc(buf8->utf8,
+ sizeof(uint8_t) * capacity);
buf8->utf8[0] = (uint8_t) 0;
buf8->utf8_len = 0;
buf8->utf8_cap = capacity;
+// Allocates a tokenizer and opens the ICU break iterator it works with.
+//
+// locale  ICU locale name. It is copied into the tokenizer and therefore
+//         has to fit, including its terminating NUL, into the fixed-size
+//         locale field.
+// action  selects the break iterator type: 'l' line, 's' sentence,
+//         'w' word, 'c' character, 't' title.
+// status  ICU error code; written by ubrk_open(), or set here to
+//         U_UNSUPPORTED_ERROR (unknown action),
+//         U_ILLEGAL_ARGUMENT_ERROR (missing or over-long locale) or
+//         U_MEMORY_ALLOCATION_ERROR (tokenizer could not be allocated).
+//
+// Returns the new tokenizer, to be released with icu_tokenizer_destroy(),
+// or 0 on failure. No text is attached yet and all token bookkeeping
+// starts at zero.
+struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
+                                            UErrorCode *status)
+{
+    struct icu_tokenizer * tokenizer = 0;
+    UBreakIteratorType type;
+
+    // resolve the action before allocating anything, so that an
+    // unsupported action cannot leak a half-built tokenizer
+    switch(action) {
+    case 'l':
+        type = UBRK_LINE;
+        break;
+    case 's':
+        type = UBRK_SENTENCE;
+        break;
+    case 'w':
+        type = UBRK_WORD;
+        break;
+    case 'c':
+        type = UBRK_CHARACTER;
+        break;
+    case 't':
+        type = UBRK_TITLE;
+        break;
+    default:
+        *status = U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    // the locale is stored in a fixed-size array: refuse what does not fit
+    // instead of writing past its end
+    if (!locale || strlen(locale) >= sizeof(tokenizer->locale)) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    tokenizer = (struct icu_tokenizer *) malloc(sizeof(struct icu_tokenizer));
+    if (!tokenizer) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return 0;
+    }
+
+    strcpy(tokenizer->locale, locale);
+    tokenizer->action = action;
+    tokenizer->bi = 0;
+    tokenizer->buf16 = 0;
+    tokenizer->token_count = 0;
+    tokenizer->token_id = 0;
+    tokenizer->token_start = 0;
+    tokenizer->token_end = 0;
+
+    tokenizer->bi = ubrk_open(type, tokenizer->locale, 0, 0, status);
+
+    // U_SUCCESS() also accepts ICU warning codes (for instance a fallback
+    // to default locale data), so those still yield a usable tokenizer
+    if (U_SUCCESS(*status))
+        return tokenizer;
+
+    // failed: release whatever has been set up so far
+    if (tokenizer->bi)
+        ubrk_close(tokenizer->bi);
+    free(tokenizer);
+    return 0;
+}
+
+
+// Releases a tokenizer: closes its break iterator when one is open and
+// frees the tokenizer itself. A null tokenizer is ignored. The attached
+// text buffer (buf16) is not touched here.
+void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
+{
+    if (!tokenizer)
+        return;
+
+    if (tokenizer->bi)
+        ubrk_close(tokenizer->bi);
+
+    free(tokenizer);
+}
+
+// Hands a UTF-16 text buffer to the tokenizer's break iterator.
+//
+// tokenizer  tokenizer with an open break iterator
+// src16      text to tokenize; it is referenced, not copied, so it has
+//            to stay valid for as long as tokens are fetched from it
+// status     ICU error code written by ubrk_setText()
+//
+// Returns 1 on success. Returns 0 when an argument is missing or ICU
+// reports a failure; after an ICU failure no text is attached.
+//
+// Every attach restarts the token bookkeeping from zero, so one tokenizer
+// can be reused for several buffers. token_count is only reset here, it
+// is not computed.
+int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
+                         struct icu_buf_utf16 * src16,
+                         UErrorCode *status)
+{
+    if (!tokenizer || !tokenizer->bi || !src16)
+        return 0;
+
+    // forget the previous text: a stale token_end would otherwise be taken
+    // as the start of the first token of the new text
+    tokenizer->buf16 = 0;
+    tokenizer->token_count = 0;
+    tokenizer->token_id = 0;
+    tokenizer->token_start = 0;
+    tokenizer->token_end = 0;
+
+    ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
+
+    if (U_FAILURE(*status))
+        return 0;
+
+    // remember the buffer only once the iterator really works on it
+    tokenizer->buf16 = src16;
+    return 1;
+}
+
+// Advances to the next token of the attached text.
+//
+// tokenizer  tokenizer with an open break iterator and an attached,
+//            non-empty text buffer
+// tkn16      optional buffer that receives a copy of the token; it is
+//            grown when its capacity is too small. May be null.
+// status     ICU error code; when it already signals a failure, nothing
+//            is done and 0 is returned
+//
+// Returns the token length in UTF-16 code units, or 0 when there is no
+// further token, an argument is missing or status signals a failure.
+//
+// The attached buffer is never modified and the invariant
+//     0 <= token_start <= token_end <= buf16->utf16_len
+// is kept at all times.
+int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
+                                 struct icu_buf_utf16 * tkn16,
+                                 UErrorCode *status)
+{
+    int32_t tkn_start = 0;
+    int32_t tkn_end = 0;
+    int32_t tkn_len = 0;
+
+    if (!tokenizer || !tokenizer->bi
+        || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
+        return 0;
+
+    // check before moving the iterator: advancing it without recording the
+    // new position would desynchronize it from token_start/token_end
+    if (U_FAILURE(*status))
+        return 0;
+
+    if (0 == tokenizer->token_end) // first call
+        tkn_start = ubrk_first(tokenizer->bi);
+    else // successive calls
+        tkn_start = tokenizer->token_end;
+
+    // get next position
+    tkn_end = ubrk_next(tokenizer->bi);
+
+    // repairing invariant at end of ubrk, which is UBRK_DONE = -1
+    if (UBRK_DONE == tkn_end)
+        tkn_end = tokenizer->buf16->utf16_len;
+
+    tkn_len = tkn_end - tkn_start;
+
+    tokenizer->token_start = tkn_start;
+    tokenizer->token_end = tkn_end;
+
+    // only real tokens are numbered, not the empty end-of-text result
+    if (0 < tkn_len)
+        tokenizer->token_id++;
+
+    // copying into token buffer if it exists
+    if (tkn16){
+        if (0 < tkn_len){
+            if (tkn16->utf16_cap < tkn_len)
+                icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
+
+            u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
+                      tkn_len);
+        }
+
+        tkn16->utf16_len = tkn_len;
+    }
+
+    return tkn_len;
+}
+
+
+// Returns the tokenizer's token_id field, which
+// icu_tokenizer_next_token() advances while handing out tokens.
+// The tokenizer must not be null.
+int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer)
+{
+    const int32_t id = tokenizer->token_id;
+
+    return id;
+}
+
+// Returns the offset, within the attached buffer, at which the current
+// token starts (the token_start field). The tokenizer must not be null.
+int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer)
+{
+    const int32_t start = tokenizer->token_start;
+
+    return start;
+}
+
+// Returns the offset, within the attached buffer, at which the current
+// token ends (the token_end field). The tokenizer must not be null.
+int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer)
+{
+    const int32_t end = tokenizer->token_end;
+
+    return end;
+}
+
+// Returns the length of the current token, i.e. token_end minus
+// token_start. The tokenizer must not be null.
+int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer)
+{
+    const int32_t start = tokenizer->token_start;
+    const int32_t end = tokenizer->token_end;
+
+    return end - start;
+}
+
+// Returns the tokenizer's token_count field.
+// The tokenizer must not be null.
+int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
+{
+    const int32_t count = tokenizer->token_count;
+
+    return count;
+}
+
+
+
#endif // HAVE_ICU
-/* $Id: icu_I18N.h,v 1.7 2007-05-07 12:52:04 marc Exp $
+/* $Id: icu_I18N.h,v 1.8 2007-05-09 14:01:21 marc Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
//#include <unicode/ucnv.h> /* C Converter API */
//#include <unicode/ustring.h> /* some more string fcns*/
//#include <unicode/uloc.h>
-//#include <unicode/ubrk.h>
+#include <unicode/ubrk.h>
//#include <unicode/unistr.h>
+// forward declarations
+//struct UBreakIterator;
+
+
+
+
+// declared structs and functions
+
+
int icu_check_status (UErrorCode status);
struct icu_buf_utf16
struct icu_buf_utf16 * src16,
UErrorCode * status);
+struct icu_tokenizer
+{
+ char locale[16]; // locale name handed to ubrk_open(), NUL-terminated
+ char action; // break type selector: 'l', 's', 'w', 'c' or 't'
+ UBreakIterator* bi; // ICU break iterator, closed by icu_tokenizer_destroy()
+ struct icu_buf_utf16 * buf16; // attached text; referenced only, not freed with the tokenizer
+ int32_t token_count; // NOTE(review): presumably the total number of tokens - confirm where it is maintained
+ int32_t token_id; // advanced by icu_tokenizer_next_token()
+ int32_t token_start; // start offset of the current token within buf16
+ int32_t token_end; // end offset of the current token within buf16
+ // the following invariant must hold at all times:
+ // 0 <= token_start
+ // <= token_end
+ // <= buf16->utf16_len
+ // as must this one:
+ // 0 <= token_id <= token_count
+};
+
+struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
+ UErrorCode *status); // action: 'l', 's', 'w', 'c' or 't'; returns 0 on failure
+
+void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer); // a null tokenizer is ignored
+
+int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
+ struct icu_buf_utf16 * src16, UErrorCode *status); // returns 1 on success, 0 otherwise
+
+int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
+ struct icu_buf_utf16 * tkn16,
+ UErrorCode *status); // returns the token length, 0 when no token was produced; tkn16 may be null
+
+int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer); // the accessors below dereference tokenizer unchecked
+int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer); // start offset of the current token
+int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer); // end offset of the current token
+int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer); // token_end - token_start
+int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer); // value of the token_count field
+
+
+
#endif // HAVE_ICU
#endif // ICU_I18NL_H
-/* $Id: test_icu_I18N.c,v 1.10 2007-05-07 12:52:04 marc Exp $
+/* $Id: test_icu_I18N.c,v 1.11 2007-05-09 14:01:21 marc Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
UCollator *coll = ucol_open(locale, &status);
icu_check_status(status);
- if(!U_SUCCESS(status))
+ if(U_FAILURE(status))
return 0;
// assigning display terms and sort keys using buf 8 and buf16
}
+// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+
+// Placeholder test: the body is empty, so no checks are run yet.
+void test_icu_I18N_normmap(int argc, char **argv)
+{
+    // arguments are accepted for symmetry with the other tests only
+    (void) argc;
+    (void) argv;
+}
+
+
+// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+
+// Splits a fixed English sentence with a sentence ('s') tokenizer for the
+// "en" locale and prints every token, followed by the tokenizer's token
+// id and token count. argc and argv are not used.
+void test_icu_I18N_tokenizer(int argc, char **argv)
+{
+    const char * src8cstr
+        = "Though I am not naturally honest, I am so sometimes by chance.";
+
+    UErrorCode status = U_ZERO_ERROR;
+    struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
+    struct icu_buf_utf16 * tkn16 = icu_buf_utf16_create(0);
+    struct icu_buf_utf8 * tkn8 = icu_buf_utf8_create(0);
+    struct icu_tokenizer * tokenizer = 0;
+
+    printf("Input: '%s'\n", src8cstr);
+
+    // transforming to UTF16
+    icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
+    icu_check_status(status);
+
+    // set up tokenizer
+    tokenizer = icu_tokenizer_create("en", 's', &status);
+    icu_check_status(status);
+    YAZ_CHECK(tokenizer);
+
+    // everything in here dereferences the tokenizer, so it is skipped
+    // when the creation failed (already reported by the check above)
+    if (tokenizer) {
+        // attach text buffer to tokenizer
+        icu_tokenizer_attach(tokenizer, src16, &status);
+        icu_check_status(status);
+        YAZ_CHECK(tokenizer->bi);
+
+        // perform work on tokens
+        printf("Tokens: ");
+        while(icu_tokenizer_next_token(tokenizer, tkn16, &status)){
+            icu_check_status(status);
+
+            // converting to UTF8
+            icu_utf16_to_utf8(tkn8, tkn16, &status);
+
+            // do not print a buffer the failed conversion may not have set
+            if (U_FAILURE(status)){
+                icu_check_status(status);
+                break;
+            }
+
+            printf("'%s' ", tkn8->utf8);
+        }
+        printf(" (%d)(%d)\n", icu_tokenizer_token_id(tokenizer),
+               icu_tokenizer_token_count(tokenizer));
+    }
+
+    icu_tokenizer_destroy(tokenizer);
+    icu_buf_utf16_destroy(src16);
+    icu_buf_utf16_destroy(tkn16);
+    icu_buf_utf8_destroy(tkn8);
+}
+
+
+
+
+
#endif // HAVE_ICU
// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
//test_icu_I18N_casemap_failures(argc, argv);
test_icu_I18N_casemap(argc, argv);
test_icu_I18N_sortmap(argc, argv);
-
+ test_icu_I18N_normmap(argc, argv);
+ test_icu_I18N_tokenizer(argc, argv);
+
#else // HAVE_ICU
printf("ICU unit tests omitted.\n"