From: Jason Skomorowski Date: Fri, 18 Sep 2009 14:02:19 +0000 (-0400) Subject: First stab at a marc map as a fast alternative to XSLT X-Git-Tag: v1.2.1~40 X-Git-Url: http://lists.indexdata.com/cgi-bin?a=commitdiff_plain;h=2c07c3edbd2fb4c6f72b6527632fab264b6ab66b;p=pazpar2-moved-to-github.git First stab at a marc map as a fast alternative to XSLT --- diff --git a/etc/marc21.mmap b/etc/marc21.mmap new file mode 100644 index 0000000..17a5720 --- /dev/null +++ b/etc/marc21.mmap @@ -0,0 +1,83 @@ +001 $ id +010 a lccn +020 a isbn +022 a issn +027 a tech-rep-nr +035 a system-control-nr +100 a author +100 c author-title +100 d author-date +110 a corporate-name +110 c corporate-location +110 d corporate-date +111 a meeting-name +111 c meeting-location +111 d meeting-date +260 c date +245 a title +245 b title-remainder +245 c title-responsibility +245 f title-dates +245 h title-medium +245 n title-number-section +250 a edition +260 a publication-place +260 b publication-name +260 c publication-date +300 a physical-extent +300 b physical-format +300 c physical-dimensions +300 e physical-accomp +300 f physical-unittype +300 g physical-unitsize +300 3 physical-specified +440 a series-title +500 $ description +505 $ description +518 $ description +520 $ description +522 $ description +600 a subject +600 a subject +610 a subject +610 a subject +611 a subject +611 a subject +630 a subject +630 a subject +648 a subject +648 a subject +650 a subject +650 * subject-long +651 a subject +651 * subject-long +653 a subject +653 * subject-long +654 a subject +654 * subject-long +655 a subject +655 * subject-long +656 a subject +656 * subject-long +657 a subject +657 * subject-long +658 a subject +658 * subject-long +662 a subject +662 * subject-long +69X a subject +69X * subject-long +773 * citation +856 u electronic-url +856 y electronic-text +856 3 electronic-text +856 z electronic-note +852 y publicnote +852 h callnumber +900 a fulltext +900 b fulltext +901 a iii-id +907 a iii-id +926 * 
holding +948 * holding +991 * holding diff --git a/src/Makefile.am b/src/Makefile.am index 81dbe02..9df6ee1 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -27,7 +27,7 @@ libpazpar2_a_SOURCES = pazpar2_config.c pazpar2_config.h eventl.c eventl.h \ settings.h settings.c sel_thread.c sel_thread.h getaddrinfo.c \ charsets.c charsets.h \ client.c client.h connection.c connection.h host.h parameters.h \ - dirent.c direntz.h + dirent.c direntz.h marcmap.c marcmap.h marchash.c marchash.h pazpar2_SOURCES = pazpar2.c pazpar2_LDADD = libpazpar2.a $(YAZLIB) diff --git a/src/logic.c b/src/logic.c index f565706..07fac52 100644 --- a/src/logic.c +++ b/src/logic.c @@ -68,6 +68,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include "client.h" #include "settings.h" #include "normalize7bit.h" +#include "marcmap.h" #define TERMLIST_HIGH_SCORE 25 @@ -271,8 +272,17 @@ xmlDoc *normalize_record(struct session_database *sdb, struct session *se, insert_settings_parameters(sdb, se, parms); - new = xsltApplyStylesheet(m->stylesheet, rdoc, (const char **) parms); - root= xmlDocGetRootElement(new); + if (m->stylesheet) + { + new = xsltApplyStylesheet(m->stylesheet, rdoc, (const char **) parms); + } + else if (m->marcmap) + { + new = marcmap_apply(m->marcmap, rdoc); + } + + root = xmlDocGetRootElement(new); + if (!new || !root || !(root->children)) { yaz_log(YLOG_WARN, "XSLT transformation failed from %s", @@ -356,12 +366,30 @@ static int prepare_map(struct session *se, struct session_database *sdb) { (*m) = nmem_malloc(se->session_nmem, sizeof(**m)); (*m)->next = 0; - if (!((*m)->stylesheet = conf_load_stylesheet(stylesheets[i]))) + + // XSLT + if (!strcmp(&stylesheets[i][strlen(stylesheets[i])-4], ".xsl")) + { + (*m)->marcmap = NULL; + if (!((*m)->stylesheet = conf_load_stylesheet(stylesheets[i]))) + { + yaz_log(YLOG_FATAL|YLOG_ERRNO, "Unable to load stylesheet: %s", + stylesheets[i]); + return -1; + } + } + // marcmap + else if 
(!strcmp(&stylesheets[i][strlen(stylesheets[i])-5], ".mmap")) { - yaz_log(YLOG_FATAL|YLOG_ERRNO, "Unable to load stylesheet: %s", - stylesheets[i]); - return -1; + (*m)->stylesheet = NULL; + if (!((*m)->marcmap = marcmap_load(stylesheets[i], se->session_nmem))) + { + yaz_log(YLOG_FATAL|YLOG_ERRNO, "Unable to load marcmap: %s", + stylesheets[i]); + return -1; + } } + m = &(*m)->next; } } diff --git a/src/marchash.c b/src/marchash.c new file mode 100644 index 0000000..5def520 --- /dev/null +++ b/src/marchash.c @@ -0,0 +1,229 @@ +#include +#include +#include + +#include +#include +#include + +#include + +// Jenkins one-at-a-time hash (from pp2 reclists.c, wikipedia) +static unsigned int hash(const unsigned char *key) +{ + unsigned int hash = 0; + + while (*key) + { + hash += *(key++); + hash += (hash << 10); + hash ^= (hash >> 6); + } + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); + return hash; +} + +inline char *strtrimcat (char *dest, char *src) +{ + char *in; + char *out; + char *last_nonspace; + in = src; + out = dest; + // move to end of dest + while (*out) + out++; + // initialise last non-space charater + last_nonspace = out; + // skip leading whitespace + while (isspace(*in)) + in++; + while (*in) + { + *out = *in; + if (!isspace(*in)) + last_nonspace = out; + out++; + in++; + } + *(++last_nonspace) = '\0'; +} + +inline char *strtrimcpy (char *dest, char *src) +{ + *dest = '\0'; + strtrimcat(dest, src); +} + +struct marchash *marchash_create (NMEM nmem) +{ + struct marchash *new; + new = nmem_malloc(nmem, sizeof (struct marchash)); + memset(new, 0, sizeof (struct marchash)); + new->nmem = nmem; + return new; +} + +int marchash_ingest_marcxml (struct marchash *marchash, xmlNodePtr rec_node) +{ + xmlNodePtr field_node; + xmlNodePtr sub_node; + field_node = rec_node->children; + struct marcfield *field; + + while (field_node) + { + if (field_node->type == XML_ELEMENT_NODE) + { + field = NULL; + if (!strcmp(field_node->name, 
"controlfield") || !strcmp(field_node->name, "datafield"))
+            {
+                // xmlGetProp and xmlNodeGetContent hand back copies owned
+                // by the caller; marchash_add_field copies what it keeps,
+                // so release them here.
+                xmlChar *tag = xmlGetProp(field_node, BAD_CAST "tag");
+                xmlChar *content = xmlNodeGetContent(field_node);
+                if (tag && content)
+                    field = marchash_add_field(marchash, (char *) tag, (char *) content);
+                if (tag)
+                    xmlFree(tag);
+                if (content)
+                    xmlFree(content);
+            }
+            if (field)
+            {
+                sub_node = field_node->children;
+                while (sub_node)
+                {
+                    if ((sub_node->type == XML_ELEMENT_NODE) && (!strcmp(sub_node->name, "subfield")))
+                    {
+                        xmlChar *code = xmlGetProp(sub_node, BAD_CAST "code");
+                        xmlChar *content = xmlNodeGetContent(sub_node);
+                        // a subfield without a code attribute is skipped
+                        // rather than dereferencing a NULL property
+                        if (code && content)
+                            marchash_add_subfield(marchash, field, (char) code[0], (char *) content);
+                        if (code)
+                            xmlFree(code);
+                        if (content)
+                            xmlFree(content);
+                    }
+                    sub_node = sub_node->next;
+                }
+            }
+        }
+        field_node = field_node->next;
+    }
+    // the function is declared int; report success explicitly
+    return 0;
+}
+
+// Appends a new field with tag key and whitespace-trimmed value val to the
+// end of the chain for key's hash slot, so repeated tags keep their order.
+// key and val are copied into the hash's nmem.
+// Returns the new field, or 0 when key or val is NULL or key is longer
+// than the three characters of a MARC tag; nothing is added in that case.
+struct marcfield *marchash_add_field (struct marchash *marchash, char *key, char *val)
+{
+    int slot;
+    size_t keylen;
+    struct marcfield *new;
+    struct marcfield *last;
+
+    // only 3 char in a marc field name; check before touching the table
+    // so a rejected key never leaves a half-initialised node behind
+    if (!key || !val)
+        return 0;
+    keylen = strlen(key);
+    if (keylen > 3)
+        return 0;
+
+    slot = hash((const unsigned char *) key) & MARCHASH_MASK;
+    new = marchash->table[slot];
+    last = NULL;
+
+    while (new)
+    {
+        last = new;
+        new = new->next;
+    }
+
+    new = nmem_malloc(marchash->nmem, sizeof (struct marcfield));
+    new->next = NULL;
+    new->subfields = NULL;
+    // zero-fill first so the 4-byte key is always NUL-terminated
+    memset(new->key, 0, sizeof (new->key));
+    memcpy(new->key, key, keylen);
+
+    new->val = nmem_malloc(marchash->nmem, sizeof (char) * strlen(val) + 1);
+    strtrimcpy(new->val, val);
+
+    if (last)
+        last->next = new;
+    else
+        marchash->table[slot] = new;
+
+    return new;
+}
+
+// Appends a subfield with code key and a copy of val to the end of field's
+// subfield list.  Returns the new subfield, or NULL when val is NULL.
+struct marcsubfield *marchash_add_subfield (struct marchash *marchash, struct marcfield *field, char key, char *val)
+{
+    struct marcsubfield *new;
+    struct marcsubfield *last;
+
+    if (!val)
+        return NULL;
+
+    last = NULL;
+    new = field->subfields;
+
+    while (new)
+    {
+        last = new;
+        new = new->next;
+    }
+
+    new = nmem_malloc(marchash->nmem, sizeof (struct marcsubfield));
+
+    if (last)
+        last->next = new;
+    else
+        field->subfields = new;
+
+    new->next = NULL;
+    new->key = key;
+    new->val = nmem_malloc(marchash->nmem, sizeof (char) * strlen(val) + 1);
+    strcpy(new->val, val);
+    return new;
+}
+
+struct marcfield *marchash_get_field (struct marchash *marchash, char
*key, struct marcfield *last) +{ + struct marcfield *cur; + if (last) + cur = last->next; + else + cur = marchash->table[hash(key) & MARCHASH_MASK]; + while (cur) + { + if (!strcmp(cur->key, key)) + return cur; + cur = cur->next; + } + return NULL; +} + +struct marcsubfield *marchash_get_subfield (char key, struct marcfield *field, struct marcsubfield *last) +{ + struct marcsubfield *cur; + if (last) + cur = last->next; + else + cur = field->subfields; + while (cur) + { + if (cur->key == key) + return cur; + cur = cur->next; + } + return NULL; +} + +char *marchash_catenate_subfields (struct marcfield *field, char *delim, NMEM nmem) +{ + char *output; + struct marcsubfield *cur; + int delimsize = strlen(delim); + int outsize = 1-delimsize; + // maybe it would make sense to have an nmem strcpy/strcat? + cur = field -> subfields; + while (cur) + { + outsize += strlen(cur->val) + delimsize; + cur = cur->next; + } + if (outsize > 0) + output = nmem_malloc(nmem, outsize); + else + return NULL; + *output = '\0'; + cur = field -> subfields; + while (cur) + { + strtrimcat(output, cur->val); + if (cur->next) + strcat(output, delim); + cur = cur->next; + } + return output; +} diff --git a/src/marchash.h b/src/marchash.h new file mode 100644 index 0000000..74b7bbd --- /dev/null +++ b/src/marchash.h @@ -0,0 +1,33 @@ +#ifndef MARCHASH_H +#define MARCHASH_H + +#define MARCHASH_MASK 127 + +struct marchash +{ + struct marcfield *table[MARCHASH_MASK + 1]; + NMEM nmem; +}; + +struct marcfield +{ + char key[4]; + char *val; + struct marcsubfield *subfields; + struct marcfield *next; +}; + +struct marcsubfield +{ + char key; + char *val; + struct marcsubfield *next; +}; + +struct marchash *marchash_create (NMEM nmem); +int marchash_ingest_marcxml (struct marchash *marchash, xmlNodePtr rec_node); +struct marcfield *marchash_add_field (struct marchash *marchash, char *key, char *value); +struct marcsubfield *marchash_add_subfield (struct marchash *marchash, struct marcfield *field, char 
key, char *value); +struct marcfield *marchash_get_field (struct marchash *marchash, char *key, struct marcfield *last); +struct marcsubfield *marchash_get_subfield (char key, struct marcfield *field, struct marcsubfield *last); +#endif diff --git a/src/marcmap.c b/src/marcmap.c new file mode 100644 index 0000000..a468d3e --- /dev/null +++ b/src/marcmap.c @@ -0,0 +1,191 @@ +#include +#include +#include + +#include +#include + +#include + +#include +#include + +struct marcmap *marcmap_load(char *filename, NMEM nmem) { + struct marcmap *mm; + struct marcmap *mmhead; + FILE *fp; + char c; + char buf[256]; + int len; + int field; + int newrec; + + len = 0; + field = 0; + newrec = 1; + mm = NULL; + mmhead = NULL; + fp = fopen(filename, "r"); + + while ((c = getc(fp) ) != EOF) + { + // allocate some space + if (newrec) + { + if (mm != NULL) + { + mm->next = nmem_malloc(nmem, sizeof(struct marcmap)); + mm = mm->next; + } + // first one! + else + { mm = nmem_malloc(nmem, sizeof(struct marcmap)); + mmhead = mm; + } + newrec = 0; + } + // whitespace saves and moves on + if (c == ' ' || c == '\n' || c == '\t') + { + buf[len] = '\0'; + len++; + // first field, marc + if (field == 0) + { + // allow blank lines + if (!(len <3)) + { + mm->field = nmem_malloc(nmem, len * sizeof(char)); + strncpy(mm->field, buf, len); + } + } + // second, marc subfield, just a char + else if (field == 1) + { + mm->subfield = buf[len-2]; + } + // third, pz fieldname + else if (field == 2) + { + mm->pz = nmem_malloc(nmem, len * sizeof(char)); + strncpy(mm->pz, buf, len); + } + + // new line, new record + if (c == '\n') + { + field = 0; + newrec = 1; + } + else + { + field++; + } + len = 0; + } + else + { + buf[len] = c; + len++; + } + } + mm->next = NULL; + return mmhead; +} + +xmlDoc *marcmap_apply(struct marcmap *marcmap, xmlDoc *xml_in) +{ + char mergekey[1024]; + char medium[32]; + char *s; + NMEM nmem; + xmlNsPtr ns_pz; + xmlDocPtr xml_out; + xmlNodePtr xml_out_root; + xmlNodePtr rec_node; + 
xmlNodePtr meta_node; + struct marchash *marchash; + struct marcfield *field; + struct marcsubfield *subfield; + struct marcmap *mmcur; + + xml_out = xmlNewDoc(BAD_CAST "1.0"); + xml_out_root = xmlNewNode(NULL, BAD_CAST "record"); + xmlDocSetRootElement(xml_out, xml_out_root); + ns_pz = xmlNewNs(xml_out_root, BAD_CAST "http://www.indexdata.com/pazpar2/1.0", BAD_CAST "pz"); + nmem = nmem_create(); + rec_node = xmlDocGetRootElement(xml_in); + marchash = marchash_create(nmem); + marchash_ingest_marcxml(marchash, rec_node); + + mmcur = marcmap; + while (mmcur != NULL) + { + if (field = marchash_get_field(marchash, mmcur->field, NULL)) + do + { + // field value + if ((mmcur->subfield == '$') && (s = field->val)) + { + meta_node = xmlNewChild(xml_out_root, ns_pz, BAD_CAST "metadata", s); + xmlSetProp(meta_node, BAD_CAST "type", mmcur->pz); + } + // catenate all subfields + else if ((mmcur->subfield == '*') && (s = marchash_catenate_subfields(field, " ", nmem))) + { + meta_node = xmlNewChild(xml_out_root, ns_pz, BAD_CAST "metadata", s); + xmlSetProp(meta_node, BAD_CAST "type", mmcur->pz); + } + // subfield value + else if (mmcur->subfield) + { + if (subfield = marchash_get_subfield(mmcur->subfield, field, NULL)) + do + if (s = subfield->val) + { + meta_node = xmlNewChild(xml_out_root, ns_pz, BAD_CAST "metadata", s); + xmlSetProp(meta_node, BAD_CAST "type", mmcur->pz); + } + while (subfield = marchash_get_subfield(mmcur->subfield, field, subfield)); + } + + } + while (field = marchash_get_field(marchash, mmcur->field, field)); + mmcur = mmcur->next; + } + + // hard coded mappings + + // medium + if ((field = marchash_get_field(marchash, "245", NULL)) && (subfield = marchash_get_subfield('h', field, NULL))) + { + strncpy(medium, subfield->val, 32); + } + else if ((field = marchash_get_field(marchash, "900", NULL)) && (subfield = marchash_get_subfield('a', field, NULL))) + strcpy(medium, "electronic resource"); + else if ((field = marchash_get_field(marchash, "900", NULL)) 
&& (subfield = marchash_get_subfield('b', field, NULL))) + strcpy(medium, "electronic resource"); + else if ((field = marchash_get_field(marchash, "773", NULL)) && (subfield = marchash_get_subfield('t', field, NULL))) + strcpy(medium, "article"); + else + strcpy(medium, "book"); + + meta_node = xmlNewChild(xml_out_root, ns_pz, BAD_CAST "metadata", BAD_CAST medium); + xmlSetProp(meta_node, BAD_CAST "type", BAD_CAST "medium"); + + // merge key + memset(mergekey, 0, 1024); + strcpy(mergekey, "title "); + if ((field = marchash_get_field(marchash, "245", NULL)) && (subfield = marchash_get_subfield('a', field, NULL))) + strncat(mergekey, subfield->val, 1023 - strlen(mergekey)); + strncat(mergekey, " author ", 1023 - strlen(mergekey)); + if ((field = marchash_get_field(marchash, "245", NULL)) && (subfield = marchash_get_subfield('a', field, NULL))) + strncat(mergekey, subfield->val, 1023 - strlen(mergekey)); + strncat(mergekey, " medium ", 1023 - strlen(mergekey)); + strncat(mergekey, medium, 1023 - strlen(mergekey)); + + xmlSetProp(xml_out_root, BAD_CAST "mergekey", BAD_CAST mergekey); + + nmem_destroy(nmem); + return xml_out; +} diff --git a/src/marcmap.h b/src/marcmap.h new file mode 100644 index 0000000..11eb34e --- /dev/null +++ b/src/marcmap.h @@ -0,0 +1,15 @@ +#ifndef MARCMAP_H +#define MARCMAP_H + +struct marcmap +{ + char *field; + char subfield; + char *pz; + struct marcmap *next; +}; + +struct marcmap *marcmap_load(char *filename, NMEM nmem); +xmlDoc *marcmap_apply(struct marcmap *marcmap, xmlDoc *xml_in); + +#endif diff --git a/src/pazpar2.h b/src/pazpar2.h index 2a1f65c..39f513d 100644 --- a/src/pazpar2.h +++ b/src/pazpar2.h @@ -81,6 +81,7 @@ struct database_criterion { // Simple sequence of stylesheets run in series. struct database_retrievalmap { xsltStylesheet *stylesheet; + struct marcmap *marcmap; struct database_retrievalmap *next; };