1 /* $Id: xslt.c,v 1.9 2005-06-07 13:10:52 adam Exp $
2 Copyright (C) 1995-2005
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
27 #include <yaz/diagbib1.h>
28 #include <libxml/xmlversion.h>
29 #include <libxml/parser.h>
30 #include <libxml/tree.h>
31 #include <libxml/xmlIO.h>
32 #include <libxml/xmlreader.h>
33 #include <libxslt/transform.h>
35 #include <idzebra/util.h>
36 #include <idzebra/recctrl.h>
/* One output schema declared by a <schema> element of the filter
   configuration file. Instances are chained through `next`, with new
   entries pushed at the head of the list held by the filter state.
   The string members are filled from the element's attributes
   ("identifier", "stylesheet", "default", "snippet"); stylesheet_xsp is
   the stylesheet compiled from the `stylesheet` file name.
   NOTE(review): this excerpt does not show every line of the definition;
   a `name` member is read elsewhere in the file but is not visible here. */
38 struct filter_schema {
40 const char *identifier;
41 const char *stylesheet;
42 struct filter_schema *next;
43 const char *default_schema;
44 const char *include_snippet;
45 xsltStylesheetPtr stylesheet_xsp;
/* NOTE(review): presumably members of struct filter_info, since they are
   accessed through `struct filter_info *` pointers elsewhere in this file;
   the opening of the struct is not visible in this excerpt. */
/* Set from the "level" and "path" attributes of a <split> configuration
   element. filter_extract() reads the whole stream as one record when both
   are null and uses the split reader otherwise. */
51 const char *split_level;
52 const char *split_path;
/* Head of the schema list built by create_schemas(). */
54 struct filter_schema *schemas;
/* Text reader that extract_split() creates over the input stream. */
55 xmlTextReaderPtr reader;
/* Namespace URI of the Zebra XSLT vocabulary. In this file it is compared
   against element namespaces while indexing, written as the xmlns of the
   generated <snippet> element, and used as the default schema name. */
58 #define ZEBRA_SCHEMA_XSLT_NS "http://indexdata.dk/zebra/xslt/1"
/* The same URI as a string object, for call sites that need a pointer. */
60 static const char *zebra_xslt_ns = ZEBRA_SCHEMA_XSLT_NS;
/* Adds the XSLT parameter `name` to the `params` array. The only visible
   caller passes a value that snippet_doc() has already produced in text
   mode.
   NOTE(review): the body is not visible in this excerpt; presumably the
   value is stored without further quoting - confirm. */
62 static void set_param_xml(const char **params, const char *name,
63 const char *value, ODR odr)
/* Adds the string-valued XSLT parameter `name` to `params`. The value is
   copied into ODR memory wrapped in single quotes. */
72 static void set_param_str(const char **params, const char *name,
73 const char *value, ODR odr)
/* 3 extra bytes: opening quote, closing quote and the terminating NUL. */
75 char *quoted = odr_malloc(odr, 3 + strlen(value));
/* NOTE(review): `value` is inserted verbatim; a value that itself contains
   a single quote would break the quoted literal - confirm callers cannot
   pass one. How `quoted` is stored in `params` is not visible here. */
76 sprintf(quoted, "'%s'", value);
/* Adds the integer-valued XSLT parameter `name` to `params`; the number is
   formatted with ZINT_FORMAT and wrapped in single quotes.
   NOTE(review): the rest of the signature and the code that stores the
   result in `params` are not visible in this excerpt. */
84 static void set_param_int(const char **params, const char *name,
87 char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */
90 sprintf(quoted, "'" ZINT_FORMAT "'", value);
/* Match callback registered with libxml2 in filter_init_xslt(). The
   visible part only logs the file name being asked about.
   NOTE(review): the return value is not visible in this excerpt. */
97 int zebra_xmlInputMatchCallback (char const *filename)
99 yaz_log(YLOG_LOG, "match %s", filename);
/* Open callback registered with libxml2 in filter_init_xslt().
   NOTE(review): body not visible in this excerpt. */
104 void * zebra_xmlInputOpenCallback (char const *filename)
/* Read callback registered with libxml2 in filter_init_xslt().
   NOTE(review): body not visible in this excerpt. */
109 int zebra_xmlInputReadCallback (void * context, char * buffer, int len)
/* Close callback registered with libxml2 in filter_init_xslt().
   NOTE(review): body not visible in this excerpt. */
114 int zebra_xmlInputCloseCallback (void * context)
/* Allocates and initialises the filter state for the XSLT record type.
   NOTE(review): initialisation of the remaining members and the return
   statement are not visible in this excerpt; filter_init_xslt1() casts the
   result to `struct filter_info *`. */
123 static void *filter_init_xslt(Res res, RecType recType)
125 struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
/* No splitting configured yet: both null makes filter_extract() read the
   whole stream as one record. */
128 tinfo->split_level = 0;
129 tinfo->split_path = 0;
/* ODR memory handle owned by the filter; reset at the start of every
   extract call and destroyed in filter_destroy(). */
130 tinfo->odr = odr_createmem(ODR_ENCODE);
/* Register libxml2's default input handlers, then this file's callbacks. */
135 xmlRegisterDefaultInputCallbacks();
136 xmlRegisterInputCallbacks(zebra_xmlInputMatchCallback,
137 zebra_xmlInputOpenCallback,
138 zebra_xmlInputReadCallback,
139 zebra_xmlInputCloseCallback);
/* Initialiser for the record-type variant that starts with split level "1":
   builds the state with filter_init_xslt() and overrides split_level.
   NOTE(review): the return statement is not visible in this excerpt. */
144 static void *filter_init_xslt1(Res res, RecType recType)
146 struct filter_info *tinfo = (struct filter_info *)
147 filter_init_xslt(res, recType);
148 tinfo->split_level = "1";
/* If `attr` is named `name` and its first child is a text node, stores a
   pointer to that text in *dst_content. The pointer refers to memory owned
   by the XML document, not to a copy.
   NOTE(review): the return values are not visible in this excerpt.
   NOTE(review): attr->name and the node content are xmlChar strings passed
   to strcmp / assigned to `const char *` without a cast - confirm this is
   acceptable for the project's warning level. */
152 static int attr_content(struct _xmlAttr *attr, const char *name,
153 const char **dst_content)
155 if (!strcmp(attr->name, name) && attr->children &&
156 attr->children->type == XML_TEXT_NODE)
158 *dst_content = attr->children->content;
/* Releases what create_schemas() built: every compiled stylesheet in the
   schema list and the parsed configuration document.
   NOTE(review): the loop header, the release of each schema node and the
   resetting of tinfo members are not visible in this excerpt. */
164 static void destroy_schemas(struct filter_info *tinfo)
166 struct filter_schema *schema = tinfo->schemas;
/* Fetch the successor before the current entry is disposed of. */
169 struct filter_schema *schema_next = schema->next;
170 if (schema->stylesheet_xsp)
171 xsltFreeStylesheet(schema->stylesheet_xsp);
173 schema = schema_next;
/* The schema attribute strings point into this document, so it is freed
   together with the schema list. */
178 xmlFreeDoc(tinfo->doc);
/* Parses the filter configuration file `fname` and fills `tinfo` from it.
   The root element must be <schemaInfo>. Each <schema> child becomes a
   filter_schema entry pushed at the head of tinfo->schemas; a <split> child
   sets the split level/path; any other element is logged as a warning.
   NOTE(review): the error returns and the final return value are not
   visible in this excerpt. */
182 static ZEBRA_RES create_schemas(struct filter_info *tinfo, const char *fname)
/* Keep a private copy of the name; filter_config() compares against it. */
185 tinfo->fname = xstrdup(fname);
186 tinfo->doc = xmlParseFile(tinfo->fname);
189 ptr = xmlDocGetRootElement(tinfo->doc);
190 if (!ptr || ptr->type != XML_ELEMENT_NODE ||
191 strcmp(ptr->name, "schemaInfo"))
193 for (ptr = ptr->children; ptr; ptr = ptr->next)
/* Non-element children (text, comments) are tested for here. */
195 if (ptr->type != XML_ELEMENT_NODE)
197 if (!strcmp(ptr->name, "schema"))
199 struct _xmlAttr *attr;
200 struct filter_schema *schema = xmalloc(sizeof(*schema));
/* NOTE(review): no initialisation of schema->name is visible in this
   excerpt although lookup_schema() reads it - confirm it is cleared. */
202 schema->identifier = 0;
203 schema->stylesheet = 0;
204 schema->default_schema = 0;
205 schema->next = tinfo->schemas;
206 schema->stylesheet_xsp = 0;
207 schema->include_snippet = 0;
208 tinfo->schemas = schema;
/* Each attribute is offered to every setter; attr_content() only stores it
   where the attribute name matches. The stored strings point into
   tinfo->doc, which must therefore stay alive as long as the schemas. */
209 for (attr = ptr->properties; attr; attr = attr->next)
211 attr_content(attr, "identifier", &schema->identifier);
212 attr_content(attr, "name", &schema->name);
213 attr_content(attr, "stylesheet", &schema->stylesheet);
214 attr_content(attr, "default", &schema->default_schema);
215 attr_content(attr, "snippet", &schema->include_snippet);
/* NOTE(review): no check of the xsltParseStylesheetFile() result is visible
   here; a null stylesheet_xsp is tested for by the users of the schema. */
217 if (schema->stylesheet)
218 schema->stylesheet_xsp =
219 xsltParseStylesheetFile(
220 (const xmlChar*) schema->stylesheet);
222 else if (!strcmp(ptr->name, "split"))
224 struct _xmlAttr *attr;
225 for (attr = ptr->properties; attr; attr = attr->next)
227 attr_content(attr, "level", &tinfo->split_level);
228 attr_content(attr, "path", &tinfo->split_path);
233 yaz_log(YLOG_WARN, "Bad element %s in %s", ptr->name, fname);
/* Searches tinfo->schemas for the entry that applies to the schema name
   `est`. An entry's identifier and name are compared with `est`, and
   entries carrying a "default" attribute are tested for as well.
   NOTE(review): the remaining parameter declaration, the conditions guarding
   these tests and all return statements are not visible in this excerpt. */
240 static struct filter_schema *lookup_schema(struct filter_info *tinfo,
243 struct filter_schema *schema;
244 for (schema = tinfo->schemas; schema; schema = schema->next)
248 if (schema->identifier && !strcmp(schema->identifier, est))
250 if (schema->name && !strcmp(schema->name, est))
253 if (schema->default_schema)
/* Configuration hook: (re)loads the schema configuration named by `args`.
   NOTE(review): the condition under which the default file name is chosen
   and the action taken when the name is unchanged are not visible in this
   excerpt; presumably the reload is skipped in that case - confirm. */
259 static void filter_config(void *clientData, Res res, const char *args)
261 struct filter_info *tinfo = clientData;
263 args = "xsltfilter.xml";
/* Same configuration file as the one already loaded? */
264 if (tinfo->fname && !strcmp(args, tinfo->fname))
/* Drop the old schemas before building the new set. */
266 destroy_schemas(tinfo);
/* NOTE(review): the ZEBRA_RES result of create_schemas() is discarded. */
267 create_schemas(tinfo, args);
/* Tears down the filter state: schemas and configuration document, the
   text reader, and the ODR memory handle.
   NOTE(review): any guard around xmlFreeTextReader() and the release of
   `tinfo` itself are not visible in this excerpt. */
270 static void filter_destroy(void *clientData)
272 struct filter_info *tinfo = clientData;
273 destroy_schemas(tinfo);
275 xmlFreeTextReader(tinfo->reader);
276 odr_destroy(tinfo->odr);
/* libxml2 read callback for extraction: `context` is the recExtractCtrl, and
   the read is forwarded to its readf function on its file handle. */
280 static int ioread_ex(void *context, char *buffer, int len)
282 struct recExtractCtrl *p = context;
283 return (*p->readf)(p->fh, buffer, len);
/* libxml2 close callback paired with ioread_ex().
   NOTE(review): body not visible in this excerpt. */
286 static int ioclose_ex(void *context)
/* Walks `ptr` and its following siblings, recursing into children, and
   hands the content of text nodes to the extract control's tokenAdd
   callback through `recWord`. */
291 static void index_field(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
292 xmlNodePtr ptr, RecWord *recWord)
294 for(; ptr; ptr = ptr->next)
/* Descendants are visited before the node itself is examined. */
296 index_field(tinfo, ctrl, ptr->children, recWord);
/* NOTE(review): the action for non-text nodes is not visible in this
   excerpt; presumably they are skipped. */
297 if (ptr->type != XML_TEXT_NODE)
299 recWord->term_buf = ptr->content;
300 recWord->term_len = strlen(ptr->content);
301 (*ctrl->tokenAdd)(recWord);
/* Walks `ptr` and its following siblings, recursing into children, looking
   for <index> elements in the Zebra XSLT namespace. For such an element the
   "field" attribute becomes the attribute string of `recWord`, and the
   element's text content is indexed with index_field(). */
305 static void index_node(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
306 xmlNodePtr ptr, RecWord *recWord)
308 for(; ptr; ptr = ptr->next)
310 index_node(tinfo, ctrl, ptr->children, recWord);
/* Only elements bound to the Zebra XSLT namespace are of interest.
   NOTE(review): the action taken for other nodes is not visible in this
   excerpt; presumably they are skipped. */
311 if (ptr->type != XML_ELEMENT_NODE || !ptr->ns ||
312 strcmp(ptr->ns->href, zebra_xslt_ns))
314 if (!strcmp(ptr->name, "index"))
/* NOTE(review): the declaration of field_str is not visible in this
   excerpt, and no use of xpath_str appears in the visible lines. */
317 const char *xpath_str = 0;
318 struct _xmlAttr *attr;
319 for (attr = ptr->properties; attr; attr = attr->next)
321 if (!strcmp(attr->name, "field")
322 && attr->children && attr->children->type == XML_TEXT_NODE)
323 field_str = attr->children->content;
324 if (!strcmp(attr->name, "xpath")
325 && attr->children && attr->children->type == XML_TEXT_NODE)
326 xpath_str = attr->children->content;
330 recWord->attrStr = field_str;
331 index_field(tinfo, ctrl, ptr->children, recWord);
/* Indexes and stores one record document. The indexing stylesheet is the
   one lookup_schema() returns for the Zebra XSLT namespace URI; its output
   is scanned for index elements. The untransformed document is serialised
   and handed to setStoreData as the stored record.
   NOTE(review): the remaining parameter declaration, the local declarations
   and any release of doc/resDoc/buf_out are not visible in this excerpt. */
337 static int extract_doc(struct filter_info *tinfo, struct recExtractCtrl *p,
/* NOTE(review): fixed capacity; the code that appends to this array is not
   visible here - confirm it cannot be overrun. */
341 const char *params[10];
345 struct filter_schema *schema = lookup_schema(tinfo, zebra_xslt_ns);
348 set_param_str(params, "schema", zebra_xslt_ns, tinfo->odr);
350 (*p->init)(p, &recWord);
351 recWord.reg_type = 'w';
/* Without a compiled stylesheet nothing is indexed; the record is still
   stored below. */
353 if (schema && schema->stylesheet_xsp)
356 xsltApplyStylesheet(schema->stylesheet_xsp,
/* Show-records mode: echo the transformed document on stdout. */
358 if (p->flagShowRecords)
360 xmlDocDumpMemory(resDoc, &buf_out, &len_out);
361 fwrite(buf_out, len_out, 1, stdout);
364 index_node(tinfo, p, xmlDocGetRootElement(resDoc), &recWord);
367 xmlDocDumpMemory(doc, &buf_out, &len_out);
368 if (p->flagShowRecords)
369 fwrite(buf_out, len_out, 1, stdout);
370 (*p->setStoreData)(p, buf_out, len_out);
374 return RECCTRL_EXTRACT_OK;
/* Extraction in split mode: reads the input with an xmlTextReader and
   passes a node matching the split criterion to extract_doc() as a document
   of its own. Returns RECCTRL_EXTRACT_EOF once the reader is exhausted.
   NOTE(review): the conditions guarding reader creation, the loop header
   and part of the split test are not visible in this excerpt. */
377 static int extract_split(struct filter_info *tinfo, struct recExtractCtrl *p)
/* Replace any previous reader with one that pulls from the extract
   control's stream through ioread_ex / ioclose_ex. */
384 xmlFreeTextReader(tinfo->reader);
385 tinfo->reader = xmlReaderForIO(ioread_ex, ioclose_ex,
392 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* NOTE(review): atoi() gives 0 for non-numeric text, and 0 is also the value
   that makes the test below accept any node - confirm intended. */
394 if (tinfo->split_level)
395 split_depth = atoi(tinfo->split_level);
396 ret = xmlTextReaderRead(tinfo->reader);
398 int type = xmlTextReaderNodeType(tinfo->reader);
399 int depth = xmlTextReaderDepth(tinfo->reader);
/* A record starts at an element whose depth equals the split depth, or at
   any node when the split depth is 0. */
400 if (split_depth == 0 ||
402 type == XML_READER_TYPE_ELEMENT && split_depth == depth))
/* Expand the current node into a subtree, deep-copy it, and make the copy
   the root of a fresh document. */
404 xmlNodePtr ptr = xmlTextReaderExpand(tinfo->reader);
405 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
406 xmlDocPtr doc = xmlNewDoc("1.0");
408 xmlDocSetRootElement(doc, ptr2);
410 return extract_doc(tinfo, p, doc);
412 ret = xmlTextReaderRead(tinfo->reader);
414 xmlFreeTextReader(tinfo->reader);
416 return RECCTRL_EXTRACT_EOF;
/* Extraction without splitting: on the first call for a stream the whole
   input is parsed as one document and passed to extract_doc(); otherwise
   RECCTRL_EXTRACT_EOF is returned.
   NOTE(review): the remaining xmlReadIO() arguments and the failure test
   before the error return are not visible in this excerpt. */
419 static int extract_full(struct filter_info *tinfo, struct recExtractCtrl *p)
421 if (p->first_record) /* only one record per stream */
423 xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, p /* I/O handler */,
429 return RECCTRL_EXTRACT_ERROR_GENERIC;
431 return extract_doc(tinfo, p, doc);
434 return RECCTRL_EXTRACT_EOF;
/* Extract entry point of the record type. Resets the filter's ODR memory
   and dispatches: whole-stream extraction when neither a split level nor a
   split path is configured, split extraction otherwise. */
437 static int filter_extract(void *clientData, struct recExtractCtrl *p)
439 struct filter_info *tinfo = clientData;
/* The parameter strings built during extraction live in this ODR. */
441 odr_reset(tinfo->odr);
443 if (tinfo->split_level == 0 && tinfo->split_path == 0)
444 return extract_full(tinfo, p);
447 return extract_split(tinfo, p);
/* libxml2 read callback for retrieval: `context` is the recRetrieveCtrl, and
   the read is forwarded to its readf function on its file handle. */
451 static int ioread_ret(void *context, char *buffer, int len)
453 struct recRetrieveCtrl *p = context;
454 return (*p->readf)(p->fh, buffer, len);
/* libxml2 close callback paired with ioread_ret().
   NOTE(review): body not visible in this excerpt. */
457 static int ioclose_ret(void *context)
/* Builds a textual rendering of the hit snippets for the record within
   `window_size` and returns it as a string allocated in p->odr. Two output
   forms appear in the code: a single-quoted text form and an XML form
   consisting of a <snippet> element in the Zebra XSLT namespace with one
   <term> child per word.
   NOTE(review): the conditions selecting between the two forms (presumably
   `text_mode`), the declaration of `ord`, several format arguments and the
   return statement are not visible in this excerpt. */
463 static const char *snippet_doc(struct recRetrieveCtrl *p, int text_mode,
466 const char *xml_doc_str;
468 WRBUF wrbuf = wrbuf_alloc();
/* Narrow the document's snippet list to a window around the hits. */
469 zebra_snippets *res =
470 zebra_snippets_window(p->doc_snippet, p->hit_snippet, window_size);
471 zebra_snippet_word *w = zebra_snippets_list(res);
474 wrbuf_printf(wrbuf, "\'");
476 wrbuf_printf(wrbuf, "<snippet xmlns='%s'>\n", zebra_xslt_ns);
477 for (; w; w = w->next)
481 else if (ord != w->ord)
/* Text form: matching words are marked with "*". */
485 wrbuf_printf(wrbuf, "%s%s%s ",
488 w->match ? "*" : "");
/* XML form: matching words carry match='1'; the term text is written
   through wrbuf_xmlputs(). */
491 wrbuf_printf(wrbuf, " <term ord='%d' seqno='" ZINT_FORMAT "' %s>",
493 (w->match ? "match='1'" : ""));
494 wrbuf_xmlputs(wrbuf, w->term);
495 wrbuf_printf(wrbuf, "</term>\n");
499 wrbuf_printf(wrbuf, "\'");
501 wrbuf_printf(wrbuf, "</snippet>\n");
/* Copy into ODR memory so the result survives the WRBUF being freed. */
503 xml_doc_str = odr_strdup(p->odr, wrbuf_buf(wrbuf));
505 zebra_snippets_destroy(res);
506 wrbuf_free(wrbuf, 1);
/* Retrieve entry point of the record type. Picks the schema for the
   requested element set name, reads the stored record as XML, optionally
   adds hit snippets, applies the schema's stylesheet and returns the result
   in p->rec_buf / p->rec_len, or sets p->diagnostic on failure.
   NOTE(review): many lines of this function (guards, returns, cleanup of
   doc/resDoc/buf_out) are not visible in this excerpt. */
510 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
/* Schema name used when the request carries no element set name. */
512 const char *esn = zebra_xslt_ns;
/* NOTE(review): fixed capacity; at least five parameters may be added
   below - confirm the append logic (not visible here) cannot overrun. */
513 const char *params[10];
514 struct filter_info *tinfo = clientData;
517 struct filter_schema *schema;
/* -1: the schema does not request snippets. */
518 int window_size = -1;
/* Only simple, generic element set names are accepted. */
522 if (p->comp->which != Z_RecordComp_simple
523 || p->comp->u.simple->which != Z_ElementSetNames_generic)
525 p->diagnostic = YAZ_BIB1_PRESENT_COMP_SPEC_PARAMETER_UNSUPP;
528 esn = p->comp->u.simple->u.generic;
530 schema = lookup_schema(tinfo, esn);
534 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
/* The schema's "snippet" attribute gives the snippet window size. */
538 if (schema->include_snippet)
539 window_size = atoi(schema->include_snippet);
/* Parameters handed to the stylesheet. */
542 set_param_str(params, "schema", esn, p->odr);
544 set_param_str(params, "filename", p->fname, p->odr);
546 set_param_int(params, "score", p->score, p->odr);
547 set_param_int(params, "size", p->recordSize, p->odr);
/* Snippets are passed as a parameter in text form (text_mode = 1). */
549 if (window_size >= 0)
550 set_param_xml(params, "snippet", snippet_doc(p, 1, window_size),
552 doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
558 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
/* Snippets are also appended in XML form as a child of the record root. */
562 if (window_size >= 0)
564 xmlNodePtr node = xmlDocGetRootElement(doc);
565 const char *snippet_str = snippet_doc(p, 0, window_size);
/* NOTE(review): this local shadows the function snippet_doc(); no null check
   of the xmlParseMemory() result or release of the parsed document is
   visible in this excerpt. */
566 xmlDocPtr snippet_doc = xmlParseMemory(snippet_str, strlen(snippet_str));
567 xmlAddChild(node, xmlDocGetRootElement(snippet_doc));
569 if (!schema->stylesheet_xsp)
573 resDoc = xsltApplyStylesheet(schema->stylesheet_xsp,
579 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
/* XML requested (or no syntax given): return the serialised result, copied
   into ODR memory. */
581 else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML)
585 xmlDocDumpMemory(resDoc, &buf_out, &len_out);
587 p->output_format = VAL_TEXT_XML;
588 p->rec_len = len_out;
589 p->rec_buf = odr_malloc(p->odr, p->rec_len);
590 memcpy(p->rec_buf, buf_out, p->rec_len);
/* NOTE(review): this branch tests p->output_format while the branch above
   tests p->input_format; looks like it was meant to be input_format -
   confirm. */
594 else if (p->output_format == VAL_SUTRS)
598 xmlDocDumpMemory(resDoc, &buf_out, &len_out);
600 p->output_format = VAL_SUTRS;
601 p->rec_len = len_out;
602 p->rec_buf = odr_malloc(p->odr, p->rec_len);
603 memcpy(p->rec_buf, buf_out, p->rec_len);
609 p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
/* Record-type descriptor for the XSLT filter.
   NOTE(review): the initialiser members are not visible in this excerpt. */
615 static struct recType filter_type_xslt = {
/* Record-type descriptor for the second XSLT filter variant.
   NOTE(review): the initialiser members are not visible in this excerpt;
   presumably it uses filter_init_xslt1() - confirm. */
625 static struct recType filter_type_xslt1 = {
636 #ifdef IDZEBRA_STATIC_XSLT
644 #ifdef LIBXML_READER_ENABLED