From: Adam Dickmeiss Date: Tue, 11 Oct 2011 13:25:40 +0000 (+0200) Subject: More work on sorting X-Git-Tag: v1.6.4~5^2~13 X-Git-Url: http://lists.indexdata.com/cgi-bin?a=commitdiff_plain;h=9d9593c47d0ab73b331d85c64e04b516ba9c6a7f;p=pazpar2-moved-to-github.git More work on sorting Session only searches once for each sort criteria. Each record is matched against all records in cluster, to avoid duplicates. --- diff --git a/src/pazpar2_config.h b/src/pazpar2_config.h index cc072be..f3c346b 100644 --- a/src/pazpar2_config.h +++ b/src/pazpar2_config.h @@ -30,7 +30,6 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA enum conf_metadata_type { Metadata_type_generic, // Generic text field - Metadata_type_number, // A number Metadata_type_year, // A number Metadata_type_date // A number }; diff --git a/src/reclists.c b/src/reclists.c index 99c9c87..d063c17 100644 --- a/src/reclists.c +++ b/src/reclists.c @@ -314,6 +314,16 @@ struct record_cluster *reclist_insert(struct reclist *l, if (!strcmp(merge_key, (*p)->record->merge_key)) { struct record_cluster *existing = (*p)->record; + struct record *re = existing->records; + + for (; re; re = re->next) + { + if (record_compare(record, re, service)) + { + yaz_mutex_leave(l->mutex); + return 0; + } + } record->next = existing->records; existing->records = record; cluster = existing; diff --git a/src/record.c b/src/record.c index c43af42..874a0a2 100644 --- a/src/record.c +++ b/src/record.c @@ -90,98 +90,41 @@ struct record_metadata * record_metadata_create(NMEM nmem) } -struct record_metadata * record_metadata_insert(NMEM nmem, - struct record_metadata ** rmd, - union data_types data) +int record_compare(struct record *r1, struct record *r2, + struct conf_service *service) { - struct record_metadata * tmp_rmd = 0; - // assert(nmem); - - if(!rmd) - return 0; - - // construct new record_metadata - tmp_rmd = nmem_malloc(nmem, sizeof(struct record_metadata)); - tmp_rmd->data = data; - - - // insert in *rmd's place, moving *rmd one down the list - tmp_rmd->next = *rmd; - *rmd = tmp_rmd; - - return *rmd; -} - -struct record_metadata * record_add_metadata_field_id(NMEM nmem, - struct record * record, - int field_id, - union data_types data) -{ - if (field_id < 0 || !record || !record->metadata) - return 0; - - return record_metadata_insert(nmem, &(record->metadata[field_id]), data); -} - - -struct record_metadata * record_add_metadata(NMEM nmem, - struct record * record, - struct conf_service * service, - const char * name, - union data_types data) -{ - int field_id = 0; - - if (!record || !record->metadata || !service || !name) - return 0; - - field_id = conf_service_metadata_field_id(service, name); - - if (-1 == field_id) - return 0; - - return record_metadata_insert(nmem, &(record->metadata[field_id]), data); -} - - - - - - -union data_types * record_assign_sortkey_field_id(NMEM nmem, - struct record * record, - int field_id, - union data_types data) -{ - if (field_id < 0 || !record || !record->sortkeys) - return 0; - - return data_types_assign(nmem, &(record->sortkeys[field_id]), data); -} - - - -union data_types * record_assign_sortkey(NMEM nmem, - struct record * record, - struct conf_service * service, - const char * name, - union data_types data) -{ - int field_id = 0; - - if (!record || !service || !name) - return 0; - - field_id = conf_service_sortkey_field_id(service, name); - - if (!(-1 < field_id) || !(field_id < service->num_sortkeys)) - return 0; - - return record_assign_sortkey_field_id(nmem, record, field_id, data); + int i; + for (i = 0; i < service->num_metadata; i++) + { + struct conf_metadata *ser_md = &service->metadata[i]; + enum conf_metadata_type type = ser_md->type; + + struct record_metadata *m1 = r1->metadata[i]; + struct record_metadata *m2 = r2->metadata[i]; + while (m1 && m2) + { + switch (type) + { + case Metadata_type_generic: + if (strcmp(m1->data.text.disp, m2->data.text.disp)) + return 0; + break; + case Metadata_type_date: + case Metadata_type_year: + if (m1->data.number.min != m2->data.number.min || + m1->data.number.max != m2->data.number.max) + return 0; + break; + } + m1 = m1->next; + m2 = m2->next; + } + if (m1 || m2) + return 0; + } + return 1; } - - /* * Local variables: * c-basic-offset: 4 diff --git a/src/record.h b/src/record.h index 3960f72..b7efc2e 100644 --- a/src/record.h +++ b/src/record.h @@ -72,38 +72,7 @@ struct record * record_create(NMEM nmem, int num_metadata, int num_sortkeys, struct record_metadata * record_metadata_create(NMEM nmem); -struct record_metadata * record_metadata_insert(NMEM nmem, - struct record_metadata ** rmd, - union data_types data); - - -struct record_metadata * record_add_metadata_field_id(NMEM nmem, - struct record * record, - int field_id, - union data_types data); - - -struct record_metadata * record_add_metadata(NMEM nmem, - struct record * record, - struct conf_service * service, - const char * name, - union data_types data); - - -union data_types * record_assign_sortkey_field_id(NMEM nmem, - struct record * record, - int field_id, - union data_types data); - - -union data_types * record_assign_sortkey(NMEM nmem, - struct record * record, - struct conf_service * service, - const char * name, - union data_types data); - - - +int record_compare(struct record *r1, struct record *r2, struct conf_service *service); struct record_cluster { @@ -120,9 +89,6 @@ struct record_cluster struct record *records; }; - - - #endif // RECORD_H /* diff --git a/src/session.c b/src/session.c index d18d6aa..c9628e4 100644 --- a/src/session.c +++ b/src/session.c @@ -95,6 +95,12 @@ struct client_list { struct client_list *next; }; +struct session_sorted_results { + const char *field; + int increasing; + struct session_sorted_results *next; +}; + /* session counting (1) , disable client counting (0) */ static YAZ_MUTEX g_session_mutex = 0; static int no_sessions = 0; @@ -593,10 +599,32 @@ int session_is_preferred_clients_ready(struct session *s) void search_sort(struct session *se, const char *field, int increasing) { + struct session_sorted_results *sr; struct client_list *l; - struct timeval tval; session_enter(se); + + /* see if we already have sorted for this critieria */ + for (sr = se->sorted_results; sr; sr = sr->next) + { + if (!strcmp(field, sr->field) && increasing == sr->increasing) + break; + } + if (sr) + { + yaz_log(YLOG_LOG, "search_sort: field=%s increasing=%d already fetched", + field, increasing); + session_leave(se); + return; + } + yaz_log(YLOG_LOG, "search_sort: field=%s increasing=%d must fetch", + field, increasing); + sr = nmem_malloc(se->nmem, sizeof(*sr)); + sr->field = nmem_strdup(se->nmem, field); + sr->increasing = increasing; + sr->next = se->sorted_results; + se->sorted_results = sr; + for (l = se->clients; l; l = l->next) { struct client *cl = l->client; @@ -619,9 +647,10 @@ void search_sort(struct session *se, const char *field, int increasing) break; } } - + if (strategy_plus_sort) { + struct timeval tval; if (client_prep_connection(cl, se->service->z3950_operation_timeout, se->service->z3950_session_timeout, se->service->server->iochan_man, @@ -630,7 +659,7 @@ void search_sort(struct session *se, const char *field, int increasing) char **array; int num; nmem_strsplit(se->nmem, ":", strategy_plus_sort, &array, &num); - + if (num == 2) { const char *sort_spec = array[1]; @@ -671,6 +700,13 @@ enum pazpar2_error_code search(struct session *se, nmem_reset(se->nmem); se->total_records = se->total_merged = 0; se->num_termlists = 0; + + /* reset list of sorted results and clear to relevance search */ + se->sorted_results = nmem_malloc(se->nmem, sizeof(*se->sorted_results)); + se->sorted_results->field = nmem_strdup(se->nmem, "relevance"); + se->sorted_results->increasing = 0; + se->sorted_results->next = 0; + live_channels = select_targets(se, filter); if (!live_channels) { @@ -1481,7 +1517,7 @@ int ingest_record(struct client *cl, const char *rec, } session_enter(se); if (client_get_session(cl) == se) - ret = ingest_to_cluster(cl, xdoc, root, record_no, mergekey_norm); + ingest_to_cluster(cl, xdoc, root, record_no, mergekey_norm); session_leave(se); xmlFreeDoc(xdoc); @@ -1497,25 +1533,81 @@ static int ingest_to_cluster(struct client *cl, xmlNode *n; xmlChar *type = 0; xmlChar *value = 0; - struct session_database *sdb = client_get_database(cl); struct session *se = client_get_session(cl); struct conf_service *service = se->service; struct record *record = record_create(se->nmem, service->num_metadata, service->num_sortkeys, cl, record_no); + + for (n = root->children; n; n = n->next) + { + if (type) + xmlFree(type); + if (value) + xmlFree(value); + type = value = 0; + + if (n->type != XML_ELEMENT_NODE) + continue; + if (!strcmp((const char *) n->name, "metadata")) + { + struct conf_metadata *ser_md = 0; + struct record_metadata **wheretoput = 0; + struct record_metadata *rec_md = 0; + int md_field_id = -1; + + type = xmlGetProp(n, (xmlChar *) "type"); + value = xmlNodeListGetString(xdoc, n->children, 1); + + if (!type || !value || !*value) + continue; + + md_field_id + = conf_service_metadata_field_id(service, (const char *) type); + if (md_field_id < 0) + { + if (se->number_of_warnings_unknown_metadata == 0) + { + session_log(se, YLOG_WARN, + "Ignoring unknown metadata element: %s", type); + } + se->number_of_warnings_unknown_metadata++; + continue; + } + + ser_md = &service->metadata[md_field_id]; + + // non-merged metadata + rec_md = record_metadata_init(se->nmem, (const char *) value, + ser_md->type, n->properties); + if (!rec_md) + { + session_log(se, YLOG_WARN, "bad metadata data '%s' " + "for element '%s'", value, type); + continue; + } + wheretoput = &record->metadata[md_field_id]; + while (*wheretoput) + wheretoput = &(*wheretoput)->next; + *wheretoput = rec_md; + } + } + struct record_cluster *cluster = reclist_insert(se->reclist, service, record, mergekey_norm, &se->total_merged); + if (!cluster) + return -1; - const char *use_term_factor_str = session_setting_oneval(sdb, PZ_TERMLIST_TERM_FACTOR); - int use_term_factor = 0; - int term_factor = 1; - if (use_term_factor_str && use_term_factor_str[0] != 0) - use_term_factor = atoi(use_term_factor_str); - if (use_term_factor) { + struct session_database *sdb = client_get_database(cl); + int term_factor = 1; + const char *use_term_factor_str = + session_setting_oneval(sdb, PZ_TERMLIST_TERM_FACTOR); + if (use_term_factor_str && use_term_factor_str[0] == '1') + { int maxrecs = client_get_maxrecs(cl); int hits = (int) client_get_hits(cl); term_factor = MAX(hits, maxrecs) / MAX(1, maxrecs); @@ -1523,11 +1615,11 @@ static int ingest_to_cluster(struct client *cl, yaz_log(YLOG_DEBUG, "Using term factor: %d (%d / %d)", term_factor, MAX(hits, maxrecs), MAX(1, maxrecs)); } - if (!cluster) - return -1; if (global_parameters.dump_records) session_log(se, YLOG_LOG, "Cluster id %s from %s (#%d)", cluster->recid, sdb->database->id, record_no); + + relevance_newrec(se->relevance, cluster); // now parsing XML record and adding data to cluster or record metadata @@ -1560,37 +1652,16 @@ static int ingest_to_cluster(struct client *cl, md_field_id = conf_service_metadata_field_id(service, (const char *) type); if (md_field_id < 0) - { - if (se->number_of_warnings_unknown_metadata == 0) - { - session_log(se, YLOG_WARN, - "Ignoring unknown metadata element: %s", type); - } - se->number_of_warnings_unknown_metadata++; continue; - } ser_md = &service->metadata[md_field_id]; - if (ser_md->sortkey_offset >= 0){ + if (ser_md->sortkey_offset >= 0) + { sk_field_id = ser_md->sortkey_offset; ser_sk = &service->sortkeys[sk_field_id]; } - // non-merged metadata - rec_md = record_metadata_init(se->nmem, (const char *) value, - ser_md->type, n->properties); - if (!rec_md) - { - session_log(se, YLOG_WARN, "bad metadata data '%s' " - "for element '%s'", value, type); - continue; - } - wheretoput = &record->metadata[md_field_id]; - while (*wheretoput) - wheretoput = &(*wheretoput)->next; - *wheretoput = rec_md; - // merged metadata rec_md = record_metadata_init(se->nmem, (const char *) value, ser_md->type, 0); diff --git a/src/session.h b/src/session.h index d958eaf..b839ef2 100644 --- a/src/session.h +++ b/src/session.h @@ -114,6 +114,7 @@ struct session { normalize_cache_t normalize_cache; YAZ_MUTEX session_mutex; unsigned session_id; + struct session_sorted_results *sorted_results; }; struct statistics { diff --git a/test/test_solr.urls b/test/test_solr.urls index 3daef61..4518018 100644 --- a/test/test_solr.urls +++ b/test/test_solr.urls @@ -1,5 +1,5 @@ http://localhost:9763/search.pz2?command=init&clear=1 http://localhost:9763/search.pz2?session=1&command=settings&pz%3Atermlist_term_count%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=10&use_url_proxy%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=0&pz%3Apiggyback%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1&pz%3Apreferred%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1&pz%3Acclmap%3Asu%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1%3Dsubject&pz%3Asru%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=solr&use_thumbnails%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=0&pz%3Acclmap%3Adate%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1%3Ddate&medium%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=web&pz%3Aname%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=ocs_test&pz%3Acclmap%3Aissn%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=u%3D8&pz%3Acclmap%3Ati%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1%3Dtitle&pz%3Acclmap%3Aau%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1%3Dauthor&pz%3Axslt%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=solr-pz2.xsl&pz%3Acclmap%3Aterm%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1%3Dtext+s%3Dal&pz%3Acclmap%3Aisbn%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1%3Disbn&pz%3Aqueryencoding%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=UTF-8 http://localhost:9763/search.pz2?session=1&command=search&query=water -1 http://localhost:9763/search.pz2?session=1&command=show&block=preferred +2 http://localhost:9763/search.pz2?session=1&command=show&block=preferred http://localhost:9763/search.pz2?session=1&command=termlist&name=xtargets%2Csubject%2Cauthor%2Cdate%2Cmedium diff --git a/test/test_url.urls b/test/test_url.urls index eab827f..b497907 100644 --- a/test/test_url.urls +++ b/test/test_url.urls @@ -5,3 +5,4 @@ http://localhost:9763/search.pz2?session=1&command=settings&pz:url%5Bmy%5D=z3950 http://localhost:9763/search.pz2?session=1&command=search&query=computer 2 http://localhost:9763/search.pz2?session=1&command=show&block=1 2 http://localhost:9763/search.pz2?session=1&command=show&block=1&sort=title:1 +1 http://localhost:9763/search.pz2?session=1&command=show&block=1&sort=title:0 diff --git a/test/test_url_8.res b/test/test_url_8.res new file mode 100644 index 0000000..11e41c6 --- /dev/null +++ b/test/test_url_8.res @@ -0,0 +1,35 @@ + +OK +0 +3 +3 +0 +3 + + +OIL/GAS DRILLING +This database contains information on oil and gas drilling such as well name, operator, driller, location, depth, copies of logs run, permits, samples (cuttings, core), completion records +OIL/GAS DRILLING +This database contains information on oil and gas drilling such as well name, operator, driller, location, depth, copies of logs run, permits, samples (cuttings, core), completion records +1907-PRESENT +title oil gas drilling author medium book + + + +GROUNDWATER RESOURCE MAPS - COUNTY SERIES +A series of 1:250,000 scale maps showing well yield, well depth, and depth to bedrock for a large number of bedrock wells inventoried by the Maine Geological Survey in the mid-to late 1970's comprises this data set. Some series also show bedrock topography and potentiometric surface. Geographic coverage is restricted to Southern Maine +GROUNDWATER RESOURCE MAPS - COUNTY SERIES +A series of 1:250,000 scale maps showing well yield, well depth, and depth to bedrock for a large number of bedrock wells inventoried by the Maine Geological Survey in the mid-to late 1970's comprises this data set. Some series also show bedrock topography and potentiometric surface. Geographic coverage is restricted to Southern Maine +1972-1978 +title groundwater resource maps county series author medium book + + + +BIBLIOGRAPHY OF MAINE GEOLOGY +This data base is a computer based bibliography of marine geology. It allows searching by topic and geographic location, similar to GEOREF. It is currently under development to replace the printed Bibliography of Marine Geology +BIBLIOGRAPHY OF MAINE GEOLOGY +This data base is a computer based bibliography of marine geology. It allows searching by topic and geographic location, similar to GEOREF. It is currently under development to replace the printed Bibliography of Marine Geology +1692-PRESENT +title bibliography of maine geology author medium book + + \ No newline at end of file