1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) Index Data
3 * See the file LICENSE for details.
6 * \file solrtransform.c
7 * \brief Implements SOLR transform (SOLR to RPN conversion).
16 #include <yaz/rpn2solr.h>
17 #include <yaz/xmalloc.h>
18 #include <yaz/diagsrw.h>
19 #include <yaz/tokenizer.h>
20 #include <yaz/wrbuf.h>
21 #include <yaz/z-core.h>
22 #include <yaz/matchstr.h>
23 #include <yaz/oid_db.h>
27 struct solr_prop_entry {
30 Z_AttributeList attr_list;
31 struct solr_prop_entry *next;
34 struct solr_transform_t_ {
35 struct solr_prop_entry *entry;
36 yaz_tok_cfg_t tok_cfg;
44 /* TODO Utility functions, split out into separate file */
/* String comparison used throughout the SOLR transform code.
 * Forwards both arguments to cql_strcmp() and hands its result back
 * unchanged. */
int solr_strcmp(const char *s1, const char *s2)
{
    const int order = cql_strcmp(s1, s2);

    return order;
}
/* Length-limited string comparison used by the SOLR transform code.
 * Forwards s1, s2 and the limit n to cql_strncmp() and hands its result
 * back unchanged. */
int solr_strncmp(const char *s1, const char *s2, size_t n)
{
    const int order = cql_strncmp(s1, s2, n);

    return order;
}
/* Returns the SOLR URI string. The value is currently only a
 * placeholder (see the TODO text in the string itself). */
const char *solr_uri(void)
{
    static const char placeholder_uri[] = "TODO:SOLR URI";

    return placeholder_uri;
}
59 void solr_buf_write_handler (const char *b, void *client_data)
61 struct solr_buf_write_info *info = (struct solr_buf_write_info *)client_data;
63 if (info->off < 0 || (info->off + l >= info->max))
68 memcpy (info->buf + info->off, b, l);
73 /* Utility functions end */
75 solr_transform_t solr_transform_create(void)
77 solr_transform_t ct = (solr_transform_t) xmalloc(sizeof(*ct));
78 ct->tok_cfg = yaz_tok_cfg_create();
79 ct->w = wrbuf_alloc();
83 ct->nmem = nmem_create();
87 static int solr_transform_parse_tok_line(solr_transform_t ct,
92 Z_AttributeElement *ae[20];
93 int ret = 0; /* 0=OK, != 0 FAIL */
97 while (t == YAZ_TOK_STRING && ae_num < 20)
99 WRBUF type_str = wrbuf_alloc();
101 Z_AttributeElement *elem = 0;
102 const char *value_str = 0;
103 /* attset type=value OR type=value */
105 elem = (Z_AttributeElement *) nmem_malloc(ct->nmem, sizeof(*elem));
106 elem->attributeSet = 0;
108 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
109 wrbuf_puts(type_str, yaz_tok_parse_string(tp));
110 t = yaz_tok_move(tp);
111 if (t == YAZ_TOK_EOF)
113 wrbuf_destroy(type_str);
115 wrbuf_destroy(set_str);
118 if (t == YAZ_TOK_STRING)
120 wrbuf_puts(ct->w, " ");
121 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
125 yaz_string_to_oid_nmem(yaz_oid_std(), CLASS_ATTSET,
126 wrbuf_cstr(set_str), ct->nmem);
128 type_str = wrbuf_alloc();
129 wrbuf_puts(type_str, yaz_tok_parse_string(tp));
130 t = yaz_tok_move(tp);
132 elem->attributeType = nmem_intdup(ct->nmem, 0);
133 if (sscanf(wrbuf_cstr(type_str), ODR_INT_PRINTF, elem->attributeType)
136 wrbuf_destroy(type_str);
138 wrbuf_destroy(set_str);
139 yaz_log(YLOG_WARN, "Expected numeric attribute type");
144 wrbuf_destroy(type_str);
146 wrbuf_destroy(set_str);
150 yaz_log(YLOG_WARN, "Expected = after after attribute type");
154 t = yaz_tok_move(tp);
155 if (t != YAZ_TOK_STRING) /* value */
157 yaz_log(YLOG_WARN, "Missing attribute value");
161 value_str = yaz_tok_parse_string(tp);
162 if (yaz_isdigit(*value_str))
164 elem->which = Z_AttributeValue_numeric;
165 elem->value.numeric =
166 nmem_intdup(ct->nmem, atoi(value_str));
170 Z_ComplexAttribute *ca = (Z_ComplexAttribute *)
171 nmem_malloc(ct->nmem, sizeof(*ca));
172 elem->which = Z_AttributeValue_complex;
173 elem->value.complex = ca;
175 ca->list = (Z_StringOrNumeric **)
176 nmem_malloc(ct->nmem, sizeof(Z_StringOrNumeric *));
177 ca->list[0] = (Z_StringOrNumeric *)
178 nmem_malloc(ct->nmem, sizeof(Z_StringOrNumeric));
179 ca->list[0]->which = Z_StringOrNumeric_string;
180 ca->list[0]->u.string = nmem_strdup(ct->nmem, value_str);
181 ca->num_semanticAction = 0;
182 ca->semanticAction = 0;
184 wrbuf_puts(ct->w, "=");
185 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
186 t = yaz_tok_move(tp);
187 wrbuf_puts(ct->w, " ");
190 if (ret == 0) /* OK? */
192 struct solr_prop_entry **pp = &ct->entry;
195 *pp = (struct solr_prop_entry *) xmalloc(sizeof(**pp));
196 (*pp)->pattern = xstrdup(pattern);
197 (*pp)->value = xstrdup(wrbuf_cstr(ct->w));
199 (*pp)->attr_list.num_attributes = ae_num;
201 (*pp)->attr_list.attributes = 0;
204 (*pp)->attr_list.attributes = (Z_AttributeElement **)
205 nmem_malloc(ct->nmem,
206 ae_num * sizeof(Z_AttributeElement *));
207 memcpy((*pp)->attr_list.attributes, ae,
208 ae_num * sizeof(Z_AttributeElement *));
214 ODR pr = odr_createmem(ODR_PRINT);
215 Z_AttributeList *alp = &(*pp)->attr_list;
216 odr_setprint(pr, yaz_log_file());
217 z_AttributeList(pr, &alp, 0, 0);
225 int solr_transform_define_pattern(solr_transform_t ct, const char *pattern,
229 yaz_tok_parse_t tp = yaz_tok_parse_buf(ct->tok_cfg, value);
230 yaz_tok_cfg_single_tokens(ct->tok_cfg, "=");
231 r = solr_transform_parse_tok_line(ct, pattern, tp);
232 yaz_tok_parse_destroy(tp);
236 solr_transform_t solr_transform_open_FILE(FILE *f)
238 solr_transform_t ct = solr_transform_create();
241 yaz_tok_cfg_single_tokens(ct->tok_cfg, "=");
243 while (fgets(line, sizeof(line)-1, f))
245 yaz_tok_parse_t tp = yaz_tok_parse_buf(ct->tok_cfg, line);
248 t = yaz_tok_move(tp);
249 if (t == YAZ_TOK_STRING)
251 char * pattern = xstrdup(yaz_tok_parse_string(tp));
252 t = yaz_tok_move(tp);
255 yaz_tok_parse_destroy(tp);
256 solr_transform_close(ct);
259 if (solr_transform_parse_tok_line(ct, pattern, tp))
261 yaz_tok_parse_destroy(tp);
262 solr_transform_close(ct);
267 else if (t != YAZ_TOK_EOF)
269 yaz_tok_parse_destroy(tp);
270 solr_transform_close(ct);
273 yaz_tok_parse_destroy(tp);
278 void solr_transform_close(solr_transform_t ct)
280 struct solr_prop_entry *pe;
286 struct solr_prop_entry *pe_next = pe->next;
293 yaz_tok_cfg_destroy(ct->tok_cfg);
294 wrbuf_destroy(ct->w);
295 nmem_destroy(ct->nmem);
299 solr_transform_t solr_transform_open_fname(const char *fname)
302 FILE *f = fopen(fname, "r");
305 ct = solr_transform_open_FILE(f);
310 static int compare_attr(Z_AttributeElement *a, Z_AttributeElement *b)
312 ODR odr_a = odr_createmem(ODR_ENCODE);
313 ODR odr_b = odr_createmem(ODR_ENCODE);
318 z_AttributeElement(odr_a, &a, 0, 0);
319 z_AttributeElement(odr_b, &b, 0, 0);
321 buf_a = odr_getbuf(odr_a, &len_a, 0);
322 buf_b = odr_getbuf(odr_b, &len_b, 0);
324 ret = yaz_memcmp(buf_a, buf_b, len_a, len_b);
331 const char *solr_lookup_reverse(solr_transform_t ct,
332 const char *category,
333 Z_AttributeList *attributes)
335 struct solr_prop_entry *e;
336 size_t clen = strlen(category);
337 for (e = ct->entry; e; e = e->next)
339 if (!strncmp(e->pattern, category, clen))
341 /* category matches.. See if attributes in pattern value
342 are all listed in actual attributes */
344 for (i = 0; i < e->attr_list.num_attributes; i++)
346 /* entry attribute */
347 Z_AttributeElement *e_ae = e->attr_list.attributes[i];
349 for (j = 0; j < attributes->num_attributes; j++)
351 /* actual attribute */
352 Z_AttributeElement *a_ae = attributes->attributes[j];
353 int r = compare_attr(e_ae, a_ae);
357 if (j == attributes->num_attributes)
358 break; /* i was not found at all.. try next pattern */
361 if (i == e->attr_list.num_attributes)
362 return e->pattern + clen;
368 static const char *solr_lookup_property(solr_transform_t ct,
369 const char *pat1, const char *pat2,
373 struct solr_prop_entry *e;
375 if (pat1 && pat2 && pat3)
376 sprintf(pattern, "%.39s.%.39s.%.39s", pat1, pat2, pat3);
377 else if (pat1 && pat2)
378 sprintf(pattern, "%.39s.%.39s", pat1, pat2);
379 else if (pat1 && pat3)
380 sprintf(pattern, "%.39s.%.39s", pat1, pat3);
382 sprintf(pattern, "%.39s", pat1);
386 for (e = ct->entry; e; e = e->next)
388 if (!solr_strcmp(e->pattern, pattern))
394 int solr_pr_attr_uri(solr_transform_t ct, const char *category,
395 const char *uri, const char *val, const char *default_val,
396 void (*pr)(const char *buf, void *client_data),
401 const char *eval = val ? val : default_val;
402 const char *prefix = 0;
406 struct solr_prop_entry *e;
408 for (e = ct->entry; e; e = e->next)
409 if (!memcmp(e->pattern, "set.", 4) && e->value &&
410 !strcmp(e->value, uri))
412 prefix = e->pattern+4;
415 /* must have a prefix now - if not it's an error */
421 res = solr_lookup_property(ct, category, prefix, eval);
422 /* we have some aliases for some relations unfortunately.. */
423 if (!res && !prefix && !strcmp(category, "relation"))
425 if (!strcmp(val, "=="))
426 res = solr_lookup_property(ct, category, prefix, "exact");
427 if (!strcmp(val, "="))
428 res = solr_lookup_property(ct, category, prefix, "eq");
429 if (!strcmp(val, "<="))
430 res = solr_lookup_property(ct, category, prefix, "le");
431 if (!strcmp(val, ">="))
432 res = solr_lookup_property(ct, category, prefix, "ge");
435 res = solr_lookup_property(ct, category, prefix, "*");
441 const char *cp0 = res, *cp1;
442 while ((cp1 = strchr(cp0, '=')))
445 while (*cp1 && *cp1 != ' ')
447 if (cp1 - cp0 >= (ptrdiff_t) sizeof(buf))
449 memcpy(buf, cp0, cp1 - cp0);
451 (*pr)("@attr ", client_data);
453 for (i = 0; buf[i]; i++)
456 (*pr)(eval, client_data);
462 (*pr)(tmp, client_data);
465 (*pr)(" ", client_data);
473 if (errcode && !ct->error)
477 ct->addinfo = xstrdup(val);
484 int solr_pr_attr(solr_transform_t ct, const char *category,
485 const char *val, const char *default_val,
486 void (*pr)(const char *buf, void *client_data),
490 return solr_pr_attr_uri(ct, category, 0 /* uri */,
491 val, default_val, pr, client_data, errcode);
/* Emits an integer followed by a single space through the output
 * callback.
 *
 * val:         number to print, formatted as decimal
 * pr:          output callback; called once with the digits and once
 *              with " "
 * client_data: opaque pointer passed through to pr on every call
 */
static void solr_pr_int(int val,
                        void (*pr)(const char *buf, void *client_data),
                        void *client_data)
{
    char digits[21]; /* enough characters to 2^64 */

    sprintf(digits, "%d", val);
    pr(digits, client_data);
    pr(" ", client_data);
}
506 static int solr_pr_prox(solr_transform_t ct, struct solr_node *mods,
507 void (*pr)(const char *buf, void *client_data),
513 int proxrel = 2; /* less than or equal */
514 int unit = 2; /* word */
518 const char *name = mods->u.st.index;
519 const char *term = mods->u.st.term;
520 const char *relation = mods->u.st.relation;
522 if (!strcmp(name, "distance")) {
523 distance = strtol(term, (char**) 0, 0);
524 if (!strcmp(relation, "="))
526 else if (!strcmp(relation, ">"))
528 else if (!strcmp(relation, "<"))
530 else if (!strcmp(relation, ">="))
532 else if (!strcmp(relation, "<="))
534 else if (!strcmp(relation, "<>"))
538 ct->error = YAZ_SRW_UNSUPP_PROX_RELATION;
539 ct->addinfo = xstrdup(relation);
543 else if (!strcmp(name, "ordered"))
545 else if (!strcmp(name, "unordered"))
547 else if (!strcmp(name, "unit"))
549 if (!strcmp(term, "word"))
551 else if (!strcmp(term, "sentence"))
553 else if (!strcmp(term, "paragraph"))
555 else if (!strcmp(term, "element"))
559 ct->error = YAZ_SRW_UNSUPP_PROX_UNIT;
560 ct->addinfo = xstrdup(term);
566 ct->error = YAZ_SRW_UNSUPP_BOOLEAN_MODIFIER;
567 ct->addinfo = xstrdup(name);
570 mods = mods->u.st.modifiers;
574 distance = (unit == 2) ? 1 : 0;
576 solr_pr_int(exclusion, pr, client_data);
577 solr_pr_int(distance, pr, client_data);
578 solr_pr_int(ordered, pr, client_data);
579 solr_pr_int(proxrel, pr, client_data);
580 (*pr)("k ", client_data);
581 solr_pr_int(unit, pr, client_data);
586 /* Returns location of first wildcard character in the `length'
587 * characters starting at `term', or a null pointer if there are
588 * none -- like memchr().
590 static const char *wcchar(int start, const char *term, int length)
594 if (start || term[-1] != '\\')
595 if (strchr("*?", *term))
605 /* ### checks for SOLR relation-name rather than Type-1 attribute */
606 static int has_modifier(struct solr_node *cn, const char *name) {
607 struct solr_node *mod;
608 for (mod = cn->u.st.modifiers; mod != 0; mod = mod->u.st.modifiers) {
609 if (!strcmp(mod->u.st.index, name))
617 static void emit_term(solr_transform_t ct,
618 struct solr_node *cn,
619 const char *term, int length,
620 void (*pr)(const char *buf, void *client_data),
624 const char *ns = cn->u.st.index_uri;
625 int process_term = !has_modifier(cn, "regexp");
628 assert(cn->which == SOLR_NODE_ST);
630 if (process_term && length > 0)
632 if (length > 1 && term[0] == '^' && term[length-1] == '^')
634 solr_pr_attr(ct, "position", "firstAndLast", 0,
635 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
639 else if (term[0] == '^')
641 solr_pr_attr(ct, "position", "first", 0,
642 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
646 else if (term[length-1] == '^')
648 solr_pr_attr(ct, "position", "last", 0,
649 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
654 solr_pr_attr(ct, "position", "any", 0,
655 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
659 if (process_term && length > 0)
661 const char *first_wc = wcchar(1, term, length);
662 const char *second_wc = first_wc ?
663 wcchar(0, first_wc+1, length-(first_wc-term)-1) : 0;
665 /* Check for well-known globbing patterns that represent
666 * simple truncation attributes as expected by, for example,
667 * Bath-compliant server. If we find such a pattern but
668 * there's no mapping for it, that's fine: we just use a
669 * general pattern-matching attribute.
671 if (first_wc == term && second_wc == term + length-1
672 && *first_wc == '*' && *second_wc == '*'
673 && solr_pr_attr(ct, "truncation", "both", 0, pr, client_data, 0))
678 else if (first_wc == term && second_wc == 0 && *first_wc == '*'
679 && solr_pr_attr(ct, "truncation", "left", 0,
685 else if (first_wc == term + length-1 && second_wc == 0
687 && solr_pr_attr(ct, "truncation", "right", 0,
694 /* We have one or more wildcard characters, but not in a
695 * way that can be dealt with using only the standard
696 * left-, right- and both-truncation attributes. We need
697 * to translate the pattern into a Z39.58-type pattern,
698 * which has been supported in BIB-1 since 1996. If
699 * there's no configuration element for "truncation.z3958"
700 * we indicate this as error 28 "Masking character not
704 solr_pr_attr(ct, "truncation", "z3958", 0,
705 pr, client_data, YAZ_SRW_MASKING_CHAR_UNSUPP);
706 z3958_mem = (char *) xmalloc(length+1);
707 for (i = 0; i < length; i++)
709 if (i > 0 && term[i-1] == '\\')
710 z3958_mem[i] = term[i];
711 else if (term[i] == '*')
713 else if (term[i] == '?')
716 z3958_mem[i] = term[i];
718 z3958_mem[length] = '\0';
722 /* No masking characters. Use "truncation.none" if given. */
723 solr_pr_attr(ct, "truncation", "none", 0,
728 solr_pr_attr_uri(ct, "index", ns,
729 cn->u.st.index, "serverChoice",
730 pr, client_data, YAZ_SRW_UNSUPP_INDEX);
732 if (cn->u.st.modifiers)
734 struct solr_node *mod = cn->u.st.modifiers;
735 for (; mod; mod = mod->u.st.modifiers)
737 solr_pr_attr(ct, "relationModifier", mod->u.st.index, 0,
738 pr, client_data, YAZ_SRW_UNSUPP_RELATION_MODIFIER);
742 (*pr)("\"", client_data);
743 for (i = 0; i<length; i++)
745 /* pr(int) each character */
746 /* we do not need to deal with \-sequences because the
747 SOLR and PQF terms have same \-format, bug #1988 */
752 (*pr)(buf, client_data);
754 (*pr)("\" ", client_data);
758 static void emit_terms(solr_transform_t ct,
759 struct solr_node *cn,
760 void (*pr)(const char *buf, void *client_data),
764 struct solr_node *ne = cn->u.st.extra_terms;
767 (*pr)("@", client_data);
768 (*pr)(op, client_data);
769 (*pr)(" ", client_data);
771 emit_term(ct, cn, cn->u.st.term, strlen(cn->u.st.term),
773 for (; ne; ne = ne->u.st.extra_terms)
775 if (ne->u.st.extra_terms)
777 (*pr)("@", client_data);
778 (*pr)(op, client_data);
779 (*pr)(" ", client_data);
781 emit_term(ct, cn, ne->u.st.term, strlen(ne->u.st.term),
786 static void emit_wordlist(solr_transform_t ct,
787 struct solr_node *cn,
788 void (*pr)(const char *buf, void *client_data),
792 const char *cp0 = cn->u.st.term;
794 const char *last_term = 0;
800 cp1 = strchr(cp0, ' ');
803 (*pr)("@", client_data);
804 (*pr)(op, client_data);
805 (*pr)(" ", client_data);
806 emit_term(ct, cn, last_term, last_length, pr, client_data);
810 last_length = cp1 - cp0;
812 last_length = strlen(cp0);
816 emit_term(ct, cn, last_term, last_length, pr, client_data);
819 void solr_transform_r(solr_transform_t ct,
820 struct solr_node *cn,
821 void (*pr)(const char *buf, void *client_data),
825 struct solr_node *mods;
832 ns = cn->u.st.index_uri;
835 /* TODO If relevant fix with solr_uri */
836 if (!strcmp(ns, solr_uri())
837 && cn->u.st.index && !solr_strcmp(cn->u.st.index, "resultSet"))
839 (*pr)("@set \"", client_data);
840 (*pr)(cn->u.st.term, client_data);
841 (*pr)("\" ", client_data);
849 ct->error = YAZ_SRW_UNSUPP_CONTEXT_SET;
853 solr_pr_attr(ct, "always", 0, 0, pr, client_data, 0);
854 solr_pr_attr(ct, "relation", cn->u.st.relation, 0, pr, client_data,
855 YAZ_SRW_UNSUPP_RELATION);
856 solr_pr_attr(ct, "structure", cn->u.st.relation, 0,
857 pr, client_data, YAZ_SRW_UNSUPP_COMBI_OF_RELATION_AND_TERM);
858 if (cn->u.st.relation && !solr_strcmp(cn->u.st.relation, "all"))
859 emit_wordlist(ct, cn, pr, client_data, "and");
860 else if (cn->u.st.relation && !solr_strcmp(cn->u.st.relation, "any"))
861 emit_wordlist(ct, cn, pr, client_data, "or");
863 emit_terms(ct, cn, pr, client_data, "and");
866 (*pr)("@", client_data);
867 (*pr)(cn->u.boolean.value, client_data);
868 (*pr)(" ", client_data);
869 mods = cn->u.boolean.modifiers;
870 if (!strcmp(cn->u.boolean.value, "prox"))
872 if (!solr_pr_prox(ct, mods, pr, client_data))
877 /* Boolean modifiers other than on proximity not supported */
878 ct->error = YAZ_SRW_UNSUPP_BOOLEAN_MODIFIER;
879 ct->addinfo = xstrdup(mods->u.st.index);
883 solr_transform_r(ct, cn->u.boolean.left, pr, client_data);
884 solr_transform_r(ct, cn->u.boolean.right, pr, client_data);
888 fprintf(stderr, "Fatal: impossible SOLR node-type %d\n", cn->which);
893 int solr_transform(solr_transform_t ct, struct solr_node *cn,
894 void (*pr)(const char *buf, void *client_data),
897 struct solr_prop_entry *e;
898 NMEM nmem = nmem_create();
904 for (e = ct->entry; e ; e = e->next)
906 /* TODO: remove, as SOLR does not support sets.
907 if (!solr_strncmp(e->pattern, "set.", 4))
908 solr_apply_prefix(nmem, cn, e->pattern+4, e->value);
909 else if (!solr_strcmp(e->pattern, "set"))
910 solr_apply_prefix(nmem, cn, 0, e->value);
913 solr_transform_r(ct, cn, pr, client_data);
919 int solr_transform_FILE(solr_transform_t ct, struct solr_node *cn, FILE *f)
921 /* We can use the cql_fputs util */
922 return solr_transform(ct, cn, cql_fputs, f);
925 int solr_transform_buf(solr_transform_t ct, struct solr_node *cn, char *out, int max)
927 struct solr_buf_write_info info;
933 r = solr_transform(ct, cn, cql_buf_write_handler, &info);
935 /* Attempt to write past end of buffer. For some reason, this
936 SRW diagnostic is deprecated, but it's so perfect for our
937 purposes that it would be stupid not to use it. */
939 ct->error = YAZ_SRW_TOO_MANY_CHARS_IN_QUERY;
940 sprintf(numbuf, "%ld", (long) info.max);
941 ct->addinfo = xstrdup(numbuf);
945 info.buf[info.off] = '\0';
949 int solr_transform_error(solr_transform_t ct, const char **addinfo)
951 *addinfo = ct->addinfo;
955 void solr_transform_set_error(solr_transform_t ct, int error, const char *addinfo)
958 ct->addinfo = addinfo ? xstrdup(addinfo) : 0;
965 * c-file-style: "Stroustrup"
966 * indent-tabs-mode: nil
968 * vim: shiftwidth=4 tabstop=8 expandtab