1 /* This is a simple program which uses libstemmer to provide a command
2 * line interface for stemming using any of the algorithms provided.
6 #include <stdlib.h> /* for malloc, free */
7 #include <string.h> /* for memmove */
8 #include <ctype.h> /* for isupper, tolower */
10 #include "libstemmer.h"
12 const char * progname;
13 static int pretty = 1;
16 stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
20 sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));
31 if (ch == '\n' || ch == EOF) break;
35 realloc(b, (lim + INC) * sizeof(sb_symbol));
36 if (newb == 0) goto error;
40 /* Update count of utf-8 characters. */
41 if (ch < 0x80 || ch > 0xBF) inlen += 1;
42 /* force lower case: */
43 if (isupper(ch)) ch = tolower(ch);
51 const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
54 fprintf(stderr, "Out of memory");
60 fwrite(b, i, 1, f_out);
62 } else if (pretty == 2) {
63 fwrite(b, i, 1, f_out);
64 if (sb_stemmer_length(stemmer) > 0) {
67 for (j = 30 - inlen; j > 0; j--)
71 for (j = 30; j > 0; j--)
77 fputs((char *)stemmed, f_out);
88 /** Display the command line syntax, and then exit.
89 * @param n The value to exit with.
94 printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
96 "The input file consists of a list of words to be stemmed, one per\n"
97 "line. Words should be in lower case, but (for English) A-Z letters\n"
98 "are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
101 "If -c is given, the argument is the character encoding of the input\n"
102 "and output files. If it is omitted, the UTF-8 encoding is used.\n"
104 "If -p is given the output file consists of each word of the input\n"
105 "file followed by \"->\" followed by its stemmed equivalent.\n"
106 "If -p2 is given the output file is a two column layout containing\n"
107 "the input words in the first column and the stemmed equivalents in\n"
108 "the second column.\n"
109 "Otherwise, the output file consists of the stemmed words, one per\n"
112 "-h displays this help\n",
118 main(int argc, char * argv[])
124 struct sb_stemmer * stemmer;
126 char * language = "english";
127 char * charenc = NULL;
138 if (strcmp(s, "-o") == 0) {
140 fprintf(stderr, "%s requires an argument\n", s);
144 } else if (strcmp(s, "-i") == 0) {
146 fprintf(stderr, "%s requires an argument\n", s);
150 } else if (strcmp(s, "-l") == 0) {
152 fprintf(stderr, "%s requires an argument\n", s);
155 language = argv[i++];
156 } else if (strcmp(s, "-c") == 0) {
158 fprintf(stderr, "%s requires an argument\n", s);
162 } else if (strcmp(s, "-p2") == 0) {
164 } else if (strcmp(s, "-p") == 0) {
166 } else if (strcmp(s, "-h") == 0) {
169 fprintf(stderr, "option %s unknown\n", s);
173 fprintf(stderr, "unexpected parameter %s\n", s);
178 /* prepare the files */
179 f_in = (in == 0) ? stdin : fopen(in, "r");
181 fprintf(stderr, "file %s not found\n", in);
184 f_out = (out == 0) ? stdout : fopen(out, "w");
186 fprintf(stderr, "file %s cannot be opened\n", out);
190 /* do the stemming process: */
191 stemmer = sb_stemmer_new(language, charenc);
193 if (charenc == NULL) {
194 fprintf(stderr, "language `%s' not available for stemming\n", language);
197 fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
201 stem_file(stemmer, f_in, f_out);
202 sb_stemmer_delete(stemmer);
204 if (in != 0) (void) fclose(f_in);
205 if (out != 0) (void) fclose(f_out);