1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) 1995-2010 Index Data
3 * See the file LICENSE for details.
7 * \file solrtransform.c
8 * \brief Implements SOLR transform (SOLR to RPN conversion).
10 * Evaluation order of rules:
25 #include <yaz/rpn2solr.h>
26 #include <yaz/xmalloc.h>
27 #include <yaz/diagsrw.h>
28 #include <yaz/tokenizer.h>
29 #include <yaz/wrbuf.h>
30 #include <yaz/z-core.h>
31 #include <yaz/matchstr.h>
32 #include <yaz/oid_db.h>
/* Node of the transform's singly linked property list.  Besides the two
 * members visible here, code further down in this file also reads and
 * writes `pattern` and `value` members on these nodes. */
36 struct solr_prop_entry {
/* attribute list built by solr_transform_parse_tok_line() */
39 Z_AttributeList attr_list;
/* next node; list traversals in this file stop when it is 0 */
40 struct solr_prop_entry *next;
/* State of a transform handle.  Only two members are visible here; other
 * code in this file also uses w, nmem, error and addinfo on the handle. */
43 struct solr_transform_t_ {
/* head of the property entry list walked by the lookup functions */
44 struct solr_prop_entry *entry;
/* tokenizer configuration used when parsing property lines */
45 yaz_tok_cfg_t tok_cfg;
53 /* TODO Utility functions; possibly split out into a separate file */
/* String comparison used by the SOLR code in this file: forwards both
 * arguments to cql_strcmp() and returns its result unchanged. */
54 int solr_strcmp(const char *s1, const char *s2) {
55 return cql_strcmp(s1, s2);
/* Length-limited counterpart of solr_strcmp(): forwards s1, s2 and n to
 * cql_strncmp() and returns its result unchanged. */
58 int solr_strncmp(const char *s1, const char *s2, size_t n) {
59 return cql_strncmp(s1, s2, n);
/* Returns the URI that solr_transform_r() compares a node's index_uri
 * against.  The returned literal is still a placeholder. */
63 const char *solr_uri(void)
65 return "TODO:SOLR URI";
/* Output callback that copies the string b into the buffer described by
 * client_data, which is treated as a struct solr_buf_write_info. */
68 void solr_buf_write_handler (const char *b, void *client_data)
70 struct solr_buf_write_info *info = (struct solr_buf_write_info *)client_data;
/* Guard: the offset is already negative, or adding l more bytes would reach
 * info->max.  NOTE(review): l is presumably the length of b; its declaration
 * is not among the lines visible here -- confirm. */
72 if (info->off < 0 || (info->off + l >= info->max))
/* copy l bytes of b to the current write offset */
77 memcpy (info->buf + info->off, b, l);
82 /* Utility functions end */
/* Allocates a new transform handle with xmalloc() and creates the resources
 * it holds: a tokenizer configuration, a WRBUF work buffer and an NMEM pool.
 * solr_transform_close() destroys these three again. */
84 solr_transform_t solr_transform_create(void)
86 solr_transform_t ct = (solr_transform_t) xmalloc(sizeof(*ct));
87 ct->tok_cfg = yaz_tok_cfg_create();
88 ct->w = wrbuf_alloc();
92 ct->nmem = nmem_create();
/* Parses the value part of one property line from the token stream tp and,
 * when parsing succeeded (ret == 0), creates a solr_prop_entry for `pattern`
 * reachable from ct->entry.  The value is a sequence of "attset type=value"
 * or "type=value" items.  Each item is turned into a Z_AttributeElement
 * allocated from ct->nmem; the textual form of the items is accumulated in
 * ct->w and stored as the entry's value string. */
96 static int solr_transform_parse_tok_line(solr_transform_t ct,
/* collected elements; the loop below stops once 20 have been gathered */
101 Z_AttributeElement *ae[20];
102 int ret = 0; /* 0=OK, != 0 FAIL */
104 t = yaz_tok_move(tp);
106 while (t == YAZ_TOK_STRING && ae_num < 20)
108 WRBUF type_str = wrbuf_alloc();
110 Z_AttributeElement *elem = 0;
111 const char *value_str = 0;
112 /* attset type=value OR type=value */
114 elem = (Z_AttributeElement *) nmem_malloc(ct->nmem, sizeof(*elem));
115 elem->attributeSet = 0;
/* the first string token is recorded both in the value text and in type_str */
117 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
118 wrbuf_puts(type_str, yaz_tok_parse_string(tp));
119 t = yaz_tok_move(tp);
/* input ended after that token: release the per-item buffers */
120 if (t == YAZ_TOK_EOF)
122 wrbuf_destroy(type_str);
124 wrbuf_destroy(set_str);
/* a second string token follows: set_str is resolved to an OID of class
 * CLASS_ATTSET and a fresh type_str receives the second token */
127 if (t == YAZ_TOK_STRING)
129 wrbuf_puts(ct->w, " ");
130 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
134 yaz_string_to_oid_nmem(yaz_oid_std(), CLASS_ATTSET,
135 wrbuf_cstr(set_str), ct->nmem);
137 type_str = wrbuf_alloc();
138 wrbuf_puts(type_str, yaz_tok_parse_string(tp));
139 t = yaz_tok_move(tp);
/* the attribute type is scanned as a number into an nmem-allocated integer */
141 elem->attributeType = nmem_intdup(ct->nmem, 0);
142 if (sscanf(wrbuf_cstr(type_str), ODR_INT_PRINTF, elem->attributeType)
145 wrbuf_destroy(type_str);
147 wrbuf_destroy(set_str);
148 yaz_log(YLOG_WARN, "Expected numeric attribute type");
153 wrbuf_destroy(type_str);
155 wrbuf_destroy(set_str);
159 yaz_log(YLOG_WARN, "Expected = after after attribute type");
163 t = yaz_tok_move(tp);
164 if (t != YAZ_TOK_STRING) /* value */
166 yaz_log(YLOG_WARN, "Missing attribute value");
170 value_str = yaz_tok_parse_string(tp);
/* A value whose first character is a digit is stored as a numeric attribute;
 * the other visible branch builds a complex attribute holding the value as a
 * single string.
 * NOTE(review): isdigit() receives a plain char here; a cast to unsigned
 * char would avoid undefined behaviour for negative character values.
 * NOTE(review): atoi() reports no conversion errors -- confirm that values
 * such as "12abc" are meant to be accepted as 12. */
171 if (isdigit(*value_str))
173 elem->which = Z_AttributeValue_numeric;
174 elem->value.numeric =
175 nmem_intdup(ct->nmem, atoi(value_str));
179 Z_ComplexAttribute *ca = (Z_ComplexAttribute *)
180 nmem_malloc(ct->nmem, sizeof(*ca));
181 elem->which = Z_AttributeValue_complex;
182 elem->value.complex = ca;
184 ca->list = (Z_StringOrNumeric **)
185 nmem_malloc(ct->nmem, sizeof(Z_StringOrNumeric *));
186 ca->list[0] = (Z_StringOrNumeric *)
187 nmem_malloc(ct->nmem, sizeof(Z_StringOrNumeric));
188 ca->list[0]->which = Z_StringOrNumeric_string;
189 ca->list[0]->u.string = nmem_strdup(ct->nmem, value_str);
190 ca->num_semanticAction = 0;
191 ca->semanticAction = 0;
/* append "=<value> " to the textual form and advance to the next item */
193 wrbuf_puts(ct->w, "=");
194 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
195 t = yaz_tok_move(tp);
196 wrbuf_puts(ct->w, " ");
199 if (ret == 0) /* OK? */
201 struct solr_prop_entry **pp = &ct->entry;
/* the new entry owns xstrdup'ed copies of the pattern and of the text
 * accumulated in ct->w */
204 *pp = (struct solr_prop_entry *) xmalloc(sizeof(**pp));
205 (*pp)->pattern = xstrdup(pattern);
206 (*pp)->value = xstrdup(wrbuf_cstr(ct->w));
208 (*pp)->attr_list.num_attributes = ae_num;
/* NOTE(review): the condition selecting between the 0 assignment and the
 * copied array is not visible here; presumably it depends on ae_num. */
210 (*pp)->attr_list.attributes = 0;
213 (*pp)->attr_list.attributes = (Z_AttributeElement **)
214 nmem_malloc(ct->nmem,
215 ae_num * sizeof(Z_AttributeElement *));
216 memcpy((*pp)->attr_list.attributes, ae,
217 ae_num * sizeof(Z_AttributeElement *));
/* print the attribute list to the YAZ log file through an ODR printer */
223 ODR pr = odr_createmem(ODR_PRINT);
224 Z_AttributeList *alp = &(*pp)->attr_list;
225 odr_setprint(pr, yaz_log_file());
226 z_AttributeList(pr, &alp, 0, 0);
/* Defines one pattern on ct from an in-memory value string: tokenizes
 * `value` and hands the token stream to solr_transform_parse_tok_line(),
 * whose result is stored in r. */
234 int solr_transform_define_pattern(solr_transform_t ct, const char *pattern,
238 yaz_tok_parse_t tp = yaz_tok_parse_buf(ct->tok_cfg, value);
/* NOTE(review): "=" is registered as a single-character token only after the
 * parser was created from the same configuration; confirm that
 * yaz_tok_parse_buf() does not snapshot the configuration at creation. */
239 yaz_tok_cfg_single_tokens(ct->tok_cfg, "=");
240 r = solr_transform_parse_tok_line(ct, pattern, tp);
241 yaz_tok_parse_destroy(tp);
/* Builds a transform handle from the property lines read from f.  Each line
 * is tokenized; a line starting with a string token supplies the pattern,
 * and the rest of the line is parsed by solr_transform_parse_tok_line().
 * On the failure paths visible here the parser and the handle are both
 * destroyed. */
245 solr_transform_t solr_transform_open_FILE(FILE *f)
247 solr_transform_t ct = solr_transform_create();
/* "=" is a token of its own so that "pattern=value" splits as expected */
250 yaz_tok_cfg_single_tokens(ct->tok_cfg, "=");
/* NOTE(review): fgets() already reserves room for the terminating NUL, so
 * the "-1" only shortens the usable buffer by one byte. */
252 while (fgets(line, sizeof(line)-1, f))
254 yaz_tok_parse_t tp = yaz_tok_parse_buf(ct->tok_cfg, line);
257 t = yaz_tok_move(tp);
258 if (t == YAZ_TOK_STRING)
/* NOTE(review): pattern is an xstrdup'ed copy; its release is not among the
 * visible lines -- confirm it is freed on every exit path. */
260 char * pattern = xstrdup(yaz_tok_parse_string(tp));
261 t = yaz_tok_move(tp);
264 yaz_tok_parse_destroy(tp);
265 solr_transform_close(ct);
/* a non-zero result from the line parser aborts the whole load */
268 if (solr_transform_parse_tok_line(ct, pattern, tp))
270 yaz_tok_parse_destroy(tp);
271 solr_transform_close(ct);
/* a line whose first token is neither a string nor end-of-input */
276 else if (t != YAZ_TOK_EOF)
278 yaz_tok_parse_destroy(tp);
279 solr_transform_close(ct);
282 yaz_tok_parse_destroy(tp);
/* Releases a transform handle: steps through the entry list (remembering
 * each node's successor before moving on) and destroys the tokenizer
 * configuration, the work buffer and the memory pool.  The per-entry release
 * calls are not among the lines visible here. */
287 void solr_transform_close(solr_transform_t ct)
289 struct solr_prop_entry *pe;
/* fetch the successor first so the current node can be released */
295 struct solr_prop_entry *pe_next = pe->next;
302 yaz_tok_cfg_destroy(ct->tok_cfg);
303 wrbuf_destroy(ct->w);
304 nmem_destroy(ct->nmem);
/* Opens the file named fname for reading and builds a transform handle from
 * its contents via solr_transform_open_FILE(). */
308 solr_transform_t solr_transform_open_fname(const char *fname)
311 FILE *f = fopen(fname, "r");
314 ct = solr_transform_open_FILE(f);
/* NOTE(review): Z_AttributeElement is already used earlier in this file, so
 * this looks like a reference copy of the declaration from the YAZ headers
 * rather than a live definition -- confirm how it is guarded. */
320 struct Z_AttributeElement {
321 Z_AttributeSetId *attributeSet; /* OPT */
326 Z_ComplexAttribute *complex;
327 #define Z_AttributeValue_numeric 1
328 #define Z_AttributeValue_complex 2
/* Compares two attribute elements by encoding each one with its own ODR
 * encoder and comparing the resulting byte buffers with yaz_memcmp().  The
 * comparison result is stored in ret. */
333 static int compare_attr(Z_AttributeElement *a, Z_AttributeElement *b)
335 ODR odr_a = odr_createmem(ODR_ENCODE);
336 ODR odr_b = odr_createmem(ODR_ENCODE);
/* encode both elements */
341 z_AttributeElement(odr_a, &a, 0, 0);
342 z_AttributeElement(odr_b, &b, 0, 0);
/* fetch the encoded bytes and their lengths */
344 buf_a = odr_getbuf(odr_a, &len_a, 0);
345 buf_b = odr_getbuf(odr_b, &len_b, 0);
347 ret = yaz_memcmp(buf_a, buf_b, len_a, len_b);
/* Reverse lookup: searches the entries whose pattern starts with `category`
 * for one whose attributes are all present in `attributes`, and returns the
 * part of that entry's pattern that follows the category prefix. */
354 const char *solr_lookup_reverse(solr_transform_t ct,
355 const char *category,
356 Z_AttributeList *attributes)
358 struct solr_prop_entry *e;
359 size_t clen = strlen(category);
360 for (e = ct->entry; e; e = e->next)
/* prefix match: only the first clen characters of the pattern are compared */
362 if (!strncmp(e->pattern, category, clen))
364 /* category matches.. See if attributes in pattern value
365 are all listed in actual attributes */
367 for (i = 0; i < e->attr_list.num_attributes; i++)
369 /* entry attribute */
370 Z_AttributeElement *e_ae = e->attr_list.attributes[i];
372 for (j = 0; j < attributes->num_attributes; j++)
374 /* actual attribute */
375 Z_AttributeElement *a_ae = attributes->attributes[j];
376 int r = compare_attr(e_ae, a_ae);
380 if (j == attributes->num_attributes)
381 break; /* i was not found at all.. try next pattern */
/* the outer loop ran to completion, so every entry attribute was found */
384 if (i == e->attr_list.num_attributes)
385 return e->pattern + clen;
/* Builds a dotted pattern from the non-null parts among pat1, pat2 and pat3
 * and searches ct's entry list for an entry whose pattern compares equal
 * under solr_strcmp().  Each part contributes at most 39 characters.
 * NOTE(review): the longest form needs 3*39 + 2 separators + NUL = 120
 * bytes; the declaration of `pattern` is not visible here -- confirm its
 * size. */
391 static const char *solr_lookup_property(solr_transform_t ct,
392 const char *pat1, const char *pat2,
396 struct solr_prop_entry *e;
398 if (pat1 && pat2 && pat3)
399 sprintf(pattern, "%.39s.%.39s.%.39s", pat1, pat2, pat3);
400 else if (pat1 && pat2)
401 sprintf(pattern, "%.39s.%.39s", pat1, pat2);
402 else if (pat1 && pat3)
403 sprintf(pattern, "%.39s.%.39s", pat1, pat3);
405 sprintf(pattern, "%.39s", pat1);
409 for (e = ct->entry; e; e = e->next)
411 if (!solr_strcmp(e->pattern, pattern))
/* Looks up the property for `category` and a value, and writes the
 * attributes found through pr as "@attr ..." items.  The value looked up is
 * `val`, or `default_val` when val is null.  A prefix is derived from `uri`
 * by searching the "set.<prefix>" entries whose value equals the URI.  When
 * errcode is non-zero and ct has no error yet, the visible code records
 * error information with a copy of val as addinfo. */
417 int solr_pr_attr_uri(solr_transform_t ct, const char *category,
418 const char *uri, const char *val, const char *default_val,
419 void (*pr)(const char *buf, void *client_data),
424 const char *eval = val ? val : default_val;
425 const char *prefix = 0;
429 struct solr_prop_entry *e;
/* find the entry "set.<prefix>" whose value is this URI */
431 for (e = ct->entry; e; e = e->next)
432 if (!memcmp(e->pattern, "set.", 4) && e->value &&
433 !strcmp(e->value, uri))
435 prefix = e->pattern+4;
438 /* must have a prefix now - if not it's an error */
444 res = solr_lookup_property(ct, category, prefix, eval);
445 /* we have some aliases for some relations unfortunately.. */
/* NOTE(review): the alias tests below pass val (not eval) to strcmp();
 * eval's definition above implies val may be null, which would be undefined
 * behaviour here -- confirm callers never reach this with a null val. */
446 if (!res && !prefix && !strcmp(category, "relation"))
448 if (!strcmp(val, "=="))
449 res = solr_lookup_property(ct, category, prefix, "exact");
450 if (!strcmp(val, "="))
451 res = solr_lookup_property(ct, category, prefix, "eq");
452 if (!strcmp(val, "<="))
453 res = solr_lookup_property(ct, category, prefix, "le");
454 if (!strcmp(val, ">="))
455 res = solr_lookup_property(ct, category, prefix, "ge");
/* lookup with the wildcard value "*" */
458 res = solr_lookup_property(ct, category, prefix, "*");
/* walk the "type=value" items of the property text; cp1 is advanced to the
 * space (or end of string) that terminates the current item */
464 const char *cp0 = res, *cp1;
465 while ((cp1 = strchr(cp0, '=')))
468 while (*cp1 && *cp1 != ' ')
/* the item must fit into the local buffer before it is copied */
470 if (cp1 - cp0 >= (ptrdiff_t) sizeof(buf))
472 memcpy(buf, cp0, cp1 - cp0);
474 (*pr)("@attr ", client_data);
476 for (i = 0; buf[i]; i++)
479 (*pr)(eval, client_data);
485 (*pr)(tmp, client_data);
488 (*pr)(" ", client_data);
/* only the first error is kept
 * NOTE(review): xstrdup(val) is reached with whatever val the caller gave;
 * confirm xstrdup tolerates a null pointer or that val is guarded. */
496 if (errcode && !ct->error)
500 ct->addinfo = xstrdup(val);
/* Convenience wrapper around solr_pr_attr_uri() for lookups without a URI:
 * passes 0 as the uri argument and forwards everything else unchanged. */
507 int solr_pr_attr(solr_transform_t ct, const char *category,
508 const char *val, const char *default_val,
509 void (*pr)(const char *buf, void *client_data),
513 return solr_pr_attr_uri(ct, category, 0 /* uri */,
514 val, default_val, pr, client_data, errcode);
/* Writes val in decimal through pr, followed by a single space. */
518 static void solr_pr_int(int val,
519 void (*pr)(const char *buf, void *client_data),
522 char buf[21]; /* enough characters to 2^64 */
523 sprintf(buf, "%d", val);
524 (*pr)(buf, client_data);
525 (*pr)(" ", client_data);
/* Writes the arguments of a proximity operator through pr, in this order:
 * exclusion, distance, ordered, proxrel, the literal "k", unit.  The values
 * are taken from the modifier chain starting at mods.  The modifier names
 * handled are "distance", "ordered", "unordered" and "unit"; any other name
 * sets YAZ_SRW_UNSUPP_BOOLEAN_MODIFIER on ct. */
529 static int solr_pr_prox(solr_transform_t ct, struct solr_node *mods,
530 void (*pr)(const char *buf, void *client_data),
534 int distance; /* to be filled in later depending on unit */
535 int distance_defined = 0;
537 int proxrel = 2; /* less than or equal */
538 int unit = 2; /* word */
/* each modifier node carries a name (index), a term and a relation */
542 const char *name = mods->u.st.index;
543 const char *term = mods->u.st.term;
544 const char *relation = mods->u.st.relation;
546 if (!strcmp(name, "distance")) {
/* base 0 lets strtol accept decimal, octal and hexadecimal notation */
547 distance = strtol(term, (char**) 0, 0);
548 distance_defined = 1;
549 if (!strcmp(relation, "="))
551 else if (!strcmp(relation, ">"))
553 else if (!strcmp(relation, "<"))
555 else if (!strcmp(relation, ">="))
557 else if (!strcmp(relation, "<="))
559 else if (!strcmp(relation, "<>"))
/* any other relation is reported as unsupported */
563 ct->error = YAZ_SRW_UNSUPP_PROX_RELATION;
564 ct->addinfo = xstrdup(relation);
568 else if (!strcmp(name, "ordered"))
570 else if (!strcmp(name, "unordered"))
572 else if (!strcmp(name, "unit"))
574 if (!strcmp(term, "word"))
576 else if (!strcmp(term, "sentence"))
578 else if (!strcmp(term, "paragraph"))
580 else if (!strcmp(term, "element"))
/* any other unit is reported as unsupported */
584 ct->error = YAZ_SRW_UNSUPP_PROX_UNIT;
585 ct->addinfo = xstrdup(term);
591 ct->error = YAZ_SRW_UNSUPP_BOOLEAN_MODIFIER;
592 ct->addinfo = xstrdup(name);
/* advance to the next modifier in the chain */
595 mods = mods->u.st.modifiers;
/* default distance when no "distance" modifier was given: 1 for unit 2
 * (word), 0 for every other unit */
598 if (!distance_defined)
599 distance = (unit == 2) ? 1 : 0;
601 solr_pr_int(exclusion, pr, client_data);
602 solr_pr_int(distance, pr, client_data);
603 solr_pr_int(ordered, pr, client_data);
604 solr_pr_int(proxrel, pr, client_data);
605 (*pr)("k ", client_data);
606 solr_pr_int(unit, pr, client_data);
611 /* Returns location of first wildcard character in the `length'
612 * characters starting at `term', or a null pointer if there are
613 * none -- like memchr().
* The wildcard characters are '*' and '?'.  A character is tested when
* `start' is non-zero or when the character before it is not a backslash;
* with a non-zero `start' the preceding character is never examined.
615 static const char *wcchar(int start, const char *term, int length)
619 if (start || term[-1] != '\\')
620 if (strchr("*?", *term))
630 /* ### checks for SOLR relation-name rather than Type-1 attribute */
/* Walks the modifier chain of cn and compares each modifier's index with
 * `name` using strcmp(). */
631 static int has_modifier(struct solr_node *cn, const char *name) {
632 struct solr_node *mod;
633 for (mod = cn->u.st.modifiers; mod != 0; mod = mod->u.st.modifiers) {
634 if (!strcmp(mod->u.st.index, name))
/* Emits one search term through pr: first the position, truncation, index
 * and relation-modifier attributes (each looked up with solr_pr_attr() or
 * solr_pr_attr_uri()), then the term text enclosed in double quotes.
 * `term' need not be NUL-terminated; `length' gives the number of
 * characters to use.  Anchor and wildcard processing only happens when
 * has_modifier(cn, "regexp") returns zero. */
642 static void emit_term(solr_transform_t ct,
643 struct solr_node *cn,
644 const char *term, int length,
645 void (*pr)(const char *buf, void *client_data),
649 const char *ns = cn->u.st.index_uri;
650 int process_term = !has_modifier(cn, "regexp");
653 assert(cn->which == SOLR_NODE_ST);
/* Position attribute, derived from '^' anchors at the start and/or end of
 * the term; the last call below passes "any". */
655 if (process_term && length > 0)
657 if (length > 1 && term[0] == '^' && term[length-1] == '^')
659 solr_pr_attr(ct, "position", "firstAndLast", 0,
660 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
664 else if (term[0] == '^')
666 solr_pr_attr(ct, "position", "first", 0,
667 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
671 else if (term[length-1] == '^')
673 solr_pr_attr(ct, "position", "last", 0,
674 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
679 solr_pr_attr(ct, "position", "any", 0,
680 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
/* Truncation attribute: locate the first wildcard and, if there is one, the
 * next wildcard after it. */
684 if (process_term && length > 0)
686 const char *first_wc = wcchar(1, term, length);
687 const char *second_wc = first_wc ?
688 wcchar(0, first_wc+1, length-(first_wc-term)-1) : 0;
690 /* Check for well-known globbing patterns that represent
691 * simple truncation attributes as expected by, for example,
692 * Bath-compliant server. If we find such a pattern but
693 * there's no mapping for it, that's fine: we just use a
694 * general pattern-matching attribute.
696 if (first_wc == term && second_wc == term + length-1
697 && *first_wc == '*' && *second_wc == '*'
698 && solr_pr_attr(ct, "truncation", "both", 0, pr, client_data, 0))
703 else if (first_wc == term && second_wc == 0 && *first_wc == '*'
704 && solr_pr_attr(ct, "truncation", "left", 0,
710 else if (first_wc == term + length-1 && second_wc == 0
712 && solr_pr_attr(ct, "truncation", "right", 0,
719 /* We have one or more wildcard characters, but not in a
720 * way that can be dealt with using only the standard
721 * left-, right- and both-truncation attributes. We need
722 * to translate the pattern into a Z39.58-type pattern,
723 * which has been supported in BIB-1 since 1996. If
724 * there's no configuration element for "truncation.z3958"
725 * we indicate this as error 28 "Masking character not
729 solr_pr_attr(ct, "truncation", "z3958", 0,
730 pr, client_data, YAZ_SRW_MASKING_CHAR_UNSUPP);
731 z3958_mem = (char *) xmalloc(length+1);
732 for (i = 0; i < length; i++)
734 if (i > 0 && term[i-1] == '\\')
735 z3958_mem[i] = term[i];
736 else if (term[i] == '*')
738 else if (term[i] == '?')
741 z3958_mem[i] = term[i];
743 z3958_mem[length] = '\0';
747 /* No masking characters. Use "truncation.none" if given. */
748 solr_pr_attr(ct, "truncation", "none", 0,
/* Index attribute: cn's index within namespace ns, with "serverChoice" as
 * the default value. */
753 solr_pr_attr_uri(ct, "index", ns,
754 cn->u.st.index, "serverChoice",
755 pr, client_data, YAZ_SRW_UNSUPP_INDEX);
/* one relationModifier lookup per modifier attached to the node */
757 if (cn->u.st.modifiers)
759 struct solr_node *mod = cn->u.st.modifiers;
760 for (; mod; mod = mod->u.st.modifiers)
762 solr_pr_attr(ct, "relationModifier", mod->u.st.index, 0,
763 pr, client_data, YAZ_SRW_UNSUPP_RELATION_MODIFIER);
/* finally the term itself, wrapped in double quotes */
767 (*pr)("\"", client_data);
768 for (i = 0; i<length; i++)
770 /* pr(int) each character */
771 /* we do not need to deal with \-sequences because the
772 SOLR and PQF terms have same \-format, bug #1988 */
777 (*pr)(buf, client_data);
779 (*pr)("\" ", client_data);
/* Emits the term of cn followed by the terms of every node on its
 * extra_terms chain, each through emit_term().  "@<op> " prefixes are
 * written through pr so that the emitted terms are combined with the
 * operator named by op. */
783 static void emit_terms(solr_transform_t ct,
784 struct solr_node *cn,
785 void (*pr)(const char *buf, void *client_data),
789 struct solr_node *ne = cn->u.st.extra_terms;
792 (*pr)("@", client_data);
793 (*pr)(op, client_data);
794 (*pr)(" ", client_data);
796 emit_term(ct, cn, cn->u.st.term, strlen(cn->u.st.term),
798 for (; ne; ne = ne->u.st.extra_terms)
/* another operator prefix is needed while more extra terms follow */
800 if (ne->u.st.extra_terms)
802 (*pr)("@", client_data);
803 (*pr)(op, client_data);
804 (*pr)(" ", client_data);
/* attributes are still taken from cn; only the term text comes from ne */
806 emit_term(ct, cn, ne->u.st.term, strlen(ne->u.st.term),
/* Splits the term of cn at space characters and emits the pieces through
 * emit_term(), writing "@<op> " prefixes through pr in between.  One piece
 * is held back in last_term/last_length and emitted by the final call. */
811 static void emit_wordlist(solr_transform_t ct,
812 struct solr_node *cn,
813 void (*pr)(const char *buf, void *client_data),
817 const char *cp0 = cn->u.st.term;
819 const char *last_term = 0;
/* cp1 is the next space after cp0, or null when none is left */
825 cp1 = strchr(cp0, ' ');
828 (*pr)("@", client_data);
829 (*pr)(op, client_data);
830 (*pr)(" ", client_data);
831 emit_term(ct, cn, last_term, last_length, pr, client_data);
/* length of the current piece: up to the space, or the rest of the string */
835 last_length = cp1 - cp0;
837 last_length = strlen(cp0);
841 emit_term(ct, cn, last_term, last_length, pr, client_data);
/* Recursive worker of the transform: writes the output for node cn through
 * pr.  The visible code handles search-term nodes (attributes followed by
 * the term or word list) and boolean nodes ("@<operator> " followed by the
 * left and right subtrees), and prints a message to stderr for any other
 * node type. */
844 void solr_transform_r(solr_transform_t ct,
845 struct solr_node *cn,
846 void (*pr)(const char *buf, void *client_data),
850 struct solr_node *mods;
857 ns = cn->u.st.index_uri;
860 /* TODO If relevant fix with solr_uri */
/* an index named "resultSet" in the solr_uri() namespace is written as a
 * "@set" reference to the term */
861 if (!strcmp(ns, solr_uri())
862 && cn->u.st.index && !solr_strcmp(cn->u.st.index, "resultSet"))
864 (*pr)("@set \"", client_data);
865 (*pr)(cn->u.st.term, client_data);
866 (*pr)("\" ", client_data);
874 ct->error = YAZ_SRW_UNSUPP_CONTEXT_SET;
/* attributes that do not depend on the individual term */
878 solr_pr_attr(ct, "always", 0, 0, pr, client_data, 0);
879 solr_pr_attr(ct, "relation", cn->u.st.relation, 0, pr, client_data,
880 YAZ_SRW_UNSUPP_RELATION);
881 solr_pr_attr(ct, "structure", cn->u.st.relation, 0,
882 pr, client_data, YAZ_SRW_UNSUPP_COMBI_OF_RELATION_AND_TERM);
/* relation "all" joins the words of the term with "and", relation "any"
 * with "or"; otherwise the term and its extra terms are joined with "and" */
883 if (cn->u.st.relation && !solr_strcmp(cn->u.st.relation, "all"))
884 emit_wordlist(ct, cn, pr, client_data, "and");
885 else if (cn->u.st.relation && !solr_strcmp(cn->u.st.relation, "any"))
886 emit_wordlist(ct, cn, pr, client_data, "or");
888 emit_terms(ct, cn, pr, client_data, "and");
/* boolean node: operator first, then both operands */
891 (*pr)("@", client_data);
892 (*pr)(cn->u.boolean.value, client_data);
893 (*pr)(" ", client_data);
894 mods = cn->u.boolean.modifiers;
895 if (!strcmp(cn->u.boolean.value, "prox"))
897 if (!solr_pr_prox(ct, mods, pr, client_data))
902 /* Boolean modifiers other than on proximity not supported */
903 ct->error = YAZ_SRW_UNSUPP_BOOLEAN_MODIFIER;
904 ct->addinfo = xstrdup(mods->u.st.index);
908 solr_transform_r(ct, cn->u.boolean.left, pr, client_data);
909 solr_transform_r(ct, cn->u.boolean.right, pr, client_data);
913 fprintf(stderr, "Fatal: impossible SOLR node-type %d\n", cn->which);
/* Entry point of the transform: writes the output for the tree rooted at cn
 * through the callback pr, which receives client_data on every call.
 * NOTE(review): nmem is created here, but its only visible use is in the
 * code inside the TODO comment below -- confirm that it is still needed and
 * that it is destroyed. */
918 int solr_transform(solr_transform_t ct, struct solr_node *cn,
919 void (*pr)(const char *buf, void *client_data),
922 struct solr_prop_entry *e;
923 NMEM nmem = nmem_create();
929 for (e = ct->entry; e ; e = e->next)
931 /* TODO remove as SOLR does not support sets.
932 if (!solr_strncmp(e->pattern, "set.", 4))
933 solr_apply_prefix(nmem, cn, e->pattern+4, e->value);
934 else if (!solr_strcmp(e->pattern, "set"))
935 solr_apply_prefix(nmem, cn, 0, e->value);
938 solr_transform_r(ct, cn, pr, client_data);
944 int solr_transform_FILE(solr_transform_t ct, struct solr_node *cn, FILE *f)
946 /* Writes the transform output to f.  We can use the cql_fputs util as the
   output callback, with f passed as its client data; the result of
   solr_transform() is returned unchanged. */
947 return solr_transform(ct, cn, cql_fputs, f);
/* Runs the transform with an in-memory buffer as the output target.  When
 * the output did not fit, the visible code sets
 * YAZ_SRW_TOO_MANY_CHARS_IN_QUERY with the buffer limit as addinfo;
 * otherwise the buffer is NUL-terminated at the final write offset. */
950 int solr_transform_buf(solr_transform_t ct, struct solr_node *cn, char *out, int max)
952 struct solr_buf_write_info info;
/* NOTE(review): the callback is cql_buf_write_handler although info is a
 * struct solr_buf_write_info and this file defines solr_buf_write_handler;
 * confirm the two info structs are layout-compatible or switch callbacks. */
958 r = solr_transform(ct, cn, cql_buf_write_handler, &info);
960 /* Attempt to write past end of buffer. For some reason, this
961 SRW diagnostic is deprecated, but it's so perfect for our
962 purposes that it would be stupid not to use it. */
964 ct->error = YAZ_SRW_TOO_MANY_CHARS_IN_QUERY;
965 sprintf(numbuf, "%ld", (long) info.max);
966 ct->addinfo = xstrdup(numbuf);
970 info.buf[info.off] = '\0';
/* Reports the additional-information string of the last error: stores
 * ct->addinfo in *addinfo.  The pointer refers to storage held by ct, not a
 * copy. */
974 int solr_transform_error(solr_transform_t ct, const char **addinfo)
976 *addinfo = ct->addinfo;
/* Sets the error state of ct.  A non-null addinfo is duplicated with
 * xstrdup(), so the caller keeps ownership of its string; a null addinfo is
 * stored as 0. */
980 void solr_transform_set_error(solr_transform_t ct, int error, const char *addinfo)
983 ct->addinfo = addinfo ? xstrdup(addinfo) : 0;
990 * c-file-style: "Stroustrup"
991 * indent-tabs-mode: nil
993 * vim: shiftwidth=4 tabstop=8 expandtab