--- /dev/null
+# CCL field mappings
+# $Id: default.bib,v 1.1 2006-11-26 05:15:43 quinn Exp $
+#
+# The rule below is used when no fields are specified
+term u=1016 t=l,r s=al
+
+# Rules for some BIB-1 fields
+au u=1 s=al
+ti u=4 s=al
+su u=21 s=al
+isbn u=7
+issn u=8
+date u=30 r=r
/*
- * $Id: http.c,v 1.2 2006-11-24 20:29:07 quinn Exp $
+ * $Id: http.c,v 1.3 2006-11-26 05:15:43 quinn Exp $
*/
#include <stdio.h>
{
struct http_channel *s = iochan_getdata(i);
- yaz_log(YLOG_DEBUG, "Destroying http channel");
if (s->proxy)
{
- yaz_log(YLOG_DEBUG, "Destroying Proxy channel");
if (s->proxy->iochan)
{
close(iochan_getfd(s->proxy->iochan));
struct http_header *hp;
struct http_buf *requestbuf;
- yaz_log(YLOG_DEBUG, "Proxy request");
-
if (!p) // This is a new connection. Create a proxy channel
{
int sock;
int one = 1;
int flags;
- yaz_log(YLOG_DEBUG, "Creating a new proxy channel");
if (!(pe = getprotobyname("tcp"))) {
abort();
}
struct http_buf *htbuf;
case EVENT_INPUT:
- yaz_log(YLOG_DEBUG, "HTTP Input event");
-
htbuf = http_buf_create();
res = read(iochan_getfd(i), htbuf->buf, HTTP_BUF_SIZE -1);
if (res <= 0 && errno != EAGAIN)
{
- yaz_log(YLOG_WARN|YLOG_ERRNO, "HTTP read");
http_buf_destroy(htbuf);
http_destroy(i);
return;
}
if ((reqlen = request_check(hc->iqueue)) <= 2)
- {
- yaz_log(YLOG_DEBUG, "We don't have a complete HTTP request yet");
return;
- }
- yaz_log(YLOG_DEBUG, "We think we have a complete HTTP request (len %d)", reqlen);
nmem_reset(hc->nmem);
if (!(request = http_parse_request(hc, &hc->iqueue, reqlen)))
return;
}
http_buf_enqueue(&hc->oqueue, hb);
- yaz_log(YLOG_DEBUG, "Response ready");
iochan_setflags(i, EVENT_OUTPUT); // Turns off input selecting
}
if (hc->iqueue)
break;
case EVENT_OUTPUT:
- yaz_log(YLOG_DEBUG, "HTTP output event");
if (hc->oqueue)
{
struct http_buf *wb = hc->oqueue;
http_destroy(i);
return;
}
- yaz_log(YLOG_DEBUG, "HTTP Wrote %d octets", res);
if (res == wb->len)
{
hc->oqueue = hc->oqueue->next;
wb->offset += res;
}
if (!hc->oqueue) {
- yaz_log(YLOG_DEBUG, "Writing finished");
if (!strcmp(hc->version, "1.0"))
{
- yaz_log(YLOG_DEBUG, "Closing 1.0 connection");
http_destroy(i);
return;
}
struct http_buf *htbuf;
case EVENT_INPUT:
- yaz_log(YLOG_DEBUG, "Proxy input event");
htbuf = http_buf_create();
res = read(iochan_getfd(pi), htbuf->buf, HTTP_BUF_SIZE -1);
- yaz_log(YLOG_DEBUG, "Proxy read %d bytes.", res);
if (res == 0 || (res < 0 && errno != EINPROGRESS))
{
if (hc->oqueue)
iochan_setflag(hc->iochan, EVENT_OUTPUT);
break;
case EVENT_OUTPUT:
- yaz_log(YLOG_DEBUG, "Proxy output event");
if (!(htbuf = pc->oqueue))
{
iochan_clearflag(pi, EVENT_OUTPUT);
-/* $Id: pazpar2.c,v 1.4 2006-11-24 20:29:07 quinn Exp $ */
+/* $Id: pazpar2.c,v 1.5 2006-11-26 05:15:43 quinn Exp $ */
#include <stdlib.h>
#include <stdio.h>
#include <yaz/readconf.h>
#include <yaz/pquery.h>
#include <yaz/yaz-util.h>
+#include <yaz/ccl.h>
#include "pazpar2.h"
#include "eventl.h"
struct timeval base_time;
int toget;
int chunk;
- void *ccl_filter;
+ CCL_bibset ccl_filter;
} global_parameters =
{
30,
}
}
+// Run the three relevance steps for a freshly ingested record: make sure the
+// cluster head has a term vector, count the words of rec's merge key into
+// that vector, and then register the head in the document frequencies.
+static void pull_relevance_keys(struct session *s, struct record *head, struct record *rec)
+{
+    struct relevance *rel = s->relevance;
+    const char *key = rec->merge_key;
+
+    relevance_newrec(rel, head);
+    relevance_countwords(rel, head, key, strlen(key));
+    relevance_donerecord(rel, head);
+}
+
struct record *ingest_record(struct target *t, char *buf, int len)
{
struct session *s = t->session;
struct record *res;
+ struct record *head;
const char *recbuf;
wrbuf_rewind(s->wrbuf);
res->target = t;
res->next_cluster = 0;
res->target_offset = -1;
+ res->term_frequency_vec = 0;
- yaz_log(YLOG_DEBUG, "Key: %s", res->merge_key);
+ head = reclist_insert(s->reclist, res);
- reclist_insert(s->reclist, res);
+ pull_relevance_keys(s, head, res);
return res;
}
rec = ingest_record(t, buf, len);
if (!rec)
continue;
- yaz_log(YLOG_DEBUG, "Ingested a fooking record");
}
}
t->diagnostic = *recs->u.nonSurrogateDiagnostic->condition;
t->state = Error;
}
- else
- {
- yaz_log(YLOG_DEBUG, "Got Records!");
- }
}
if (!*r->presentStatus && t->state != Error)
t->state = Failed;
return;
}
- yaz_log(YLOG_DEBUG, "Successfully decoded %d oct PDU", len);
switch (a->which)
{
case Z_APDU_initResponse:
}
if (live_channels)
{
- const char *t[] = { "aa", "ab", 0 };
+ const char *p[] = { query, 0 };
int maxrecs = live_channels * global_parameters.toget;
s->termlist = termlist_create(s->nmem, maxrecs, 15);
s->reclist = reclist_create(s->nmem, maxrecs);
- relevance_create(s->nmem, t, 1000);
+ s->relevance = relevance_create(s->nmem, p, maxrecs);
}
}
// FIXME -- skip initial records
- reclist_rewind(s->reclist);
+ relevance_prepare_read(s->relevance, s->reclist);
for (i = 0; i < *num; i++)
{
- recs[i] = reclist_read_record(s->reclist);
- if (!recs[i])
+ struct record *r = reclist_read_record(s->reclist);
+ if (!r)
{
*num = i;
break;
}
+ recs[i] = r;
+ yaz_log(YLOG_DEBUG, "%d: %s%s", r->relevance, r->merge_key, r->next_cluster ? " (cluster)": "");
}
return recs;
}
stat->num_connections = i;
}
-static void *load_cclfile(const char *fn)
+// Create a new CCL bibset and fill it with the qualifier definitions read
+// from the file 'fn'. If ccl_qual_fname() reports failure (negative return),
+// the file name is logged as a fatal error (with errno) and the process exits.
+static CCL_bibset load_cclfile(const char *fn)
 {
- return 0;
+    CCL_bibset bibset = ccl_qual_mk();
+
+    if (ccl_qual_fname(bibset, fn) >= 0)
+        return bibset;
+
+    yaz_log(YLOG_FATAL|YLOG_ERRNO, "%s", fn);
+    exit(1);
 }
int main(int argc, char **argv)
}
+    // No CCL bibset has been set: load the default field mappings and store
+    // the resulting bibset in the global parameters.
+    if (!global_parameters.ccl_filter)
+        global_parameters.ccl_filter = load_cclfile("default.bib");
+
event_loop(&channel_list);
return 0;
#ifndef PAZPAR2_H
#define PAZPAR2_H
+struct record;
+
#include <yaz/pquery.h>
#include "termlists.h"
+#include "relevance.h"
+// One ingested record. Records with the same merge key are chained through
+// next_cluster behind the first record seen (see reclist_insert()).
struct record {
struct target *target;
int target_offset;
char *buf;
char *merge_key;
+ // Score assigned by relevance_prepare_read(): the summed
+ // term-frequency * idf value, scaled by 100000 and truncated to int.
+ int relevance;
+ // Word counters for this record, allocated on demand by relevance_newrec();
+ // 0 until then. Slot 0 is bumped for every word scanned by
+ // relevance_countwords(), slot t for every occurrence of term number t.
+ int *term_frequency_vec;
struct record *next_cluster;
};
NMEM nmem;
WRBUF wrbuf;
struct termlist *termlist;
+ struct relevance *relevance;
struct reclist *reclist;
yaz_marc_t yaz_marc;
};
/*
- * $Id: reclists.c,v 1.1 2006-11-24 20:29:07 quinn Exp $
+ * $Id: reclists.c,v 1.2 2006-11-26 05:15:43 quinn Exp $
*/
#include <assert.h>
struct reclist_bucket *next;
};
-struct reclist
-{
- struct reclist_bucket **hashtable;
- int hashtable_size;
- int hashmask;
-
- struct record **flatlist;
- int flatlist_size;
- int num_records;
- int pointer;
-
- NMEM nmem;
-};
-
struct record *reclist_read_record(struct reclist *l)
{
if (l->pointer < l->num_records)
return res;
}
-void reclist_insert(struct reclist *l, struct record *record)
+struct record *reclist_insert(struct reclist *l, struct record *record)
{
unsigned int bucket;
struct reclist_bucket **p;
+ struct record *head;
bucket = hash(record->merge_key) & l->hashmask;
for (p = &l->hashtable[bucket]; *p; p = &(*p)->next)
yaz_log(YLOG_LOG, "Found a matching record: %s", record->merge_key);
record->next_cluster = existing->next_cluster;
existing->next_cluster = record;
+ head = existing;
break;
}
}
new->next = 0;
*p = new;
l->flatlist[l->num_records++] = record;
+ head = record;
}
+ return head;
}
#ifndef RECLISTS_H
#define RECLISTS_H
-struct reclist;
+// Record list: a hash table keyed on merge_key plus a flat array of the
+// records that were inserted as new entries. The definition lives in the
+// header; relevance_prepare_read() accesses flatlist, num_records and
+// pointer directly.
+struct reclist
+{
+ // Bucket chains; reclist_insert() picks hashtable[hash(merge_key) & hashmask]
+ struct reclist_bucket **hashtable;
+ int hashtable_size;
+ int hashmask;
+
+ // Flat array of records; reclist_insert() appends at index num_records
+ struct record **flatlist;
+ // presumably the allocated capacity of flatlist -- TODO confirm in reclist_create()
+ int flatlist_size;
+ int num_records;
+ // Read cursor: compared against num_records by reclist_read_record(),
+ // reset to 0 by relevance_prepare_read()
+ int pointer;
+
+ NMEM nmem;
+};
struct reclist *reclist_create(NMEM, int numrecs);
-void reclist_insert(struct reclist *tl, struct record *record);
+struct record * reclist_insert(struct reclist *tl, struct record *record);
struct record *reclist_read_record(struct reclist *l);
void reclist_rewind(struct reclist *l);
/*
- * $Id: relevance.c,v 1.1 2006-11-24 20:29:07 quinn Exp $
+ * $Id: relevance.c,v 1.2 2006-11-26 05:15:43 quinn Exp $
*/
#include <ctype.h>
+#include <math.h>
+#include <stdlib.h>
#include "relevance.h"
#include "pazpar2.h"
+// Relevance-ranking state: document frequencies for the query terms and the
+// trie used to recognise those terms in record text.
struct relevance
{
- struct relevance_record *records;
- int num_records;
+// Zero-initialised by relevance_create(). Slot 0 is incremented on every
+// relevance_donerecord() call; slot i is incremented when the record passed
+// to that call has a non-zero count for term i.
int *doc_frequency_vec;
+// Number of int slots in doc_frequency_vec and in each record's
+// term_frequency_vec
int vec_len;
+// Trie produced by build_word_trie() from the terms given to relevance_create()
struct word_trie *wt;
+// Memory handle used for the frequency vectors allocated by this module
NMEM nmem;
};
-struct relevance_record
-{
- struct record *record;
- int *term_frequency_vec;
-};
-
// We use this data structure to recognize terms in input records,
// and map them to record term vectors for counting.
struct word_trie
else
{
c -= 'a';
- if (!n->list[c].child)
- {
- struct word_trie *new = create_word_trie_node(nmem);
- n->list[c].child = new;
- }
if (!*(++term))
n->list[c].termno = num;
else
+ {
+ if (!n->list[c].child)
+ {
+ struct word_trie *new = create_word_trie_node(nmem);
+ n->list[c].child = new;
+ }
word_trie_addterm(nmem, n->list[c].child, term, num);
+ }
break;
}
}
+}
+
+#define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1)
+
+// Try to match the word that starts at 'word' (at most 'len' bytes long)
+// against the trie rooted at 't'. Letters are compared case-insensitively;
+// a word ends at the first byte that is not an ASCII letter, or when 'len'
+// is exhausted.
+//
+// Returns the term number stored for the word, or 0 if the word is not a
+// term. '*skipped' is incremented once for every byte consumed, also when
+// the match eventually fails.
+static int word_trie_match(struct word_trie *t, const char *word, int len, int *skipped)
+{
+    int c;
+
+    // Check the length before touching *word
+    if (len <= 0)
+        return 0;
+
+    // tolower() is only defined for unsigned char values (and EOF)
+    c = tolower((unsigned char) *word);
+    c = raw_char(c);
+    if (c < 0)
+        return 0;   // not a letter: never index list[] with -1
+
+    word++; len--;
+    (*skipped)++;
+
+    // End of word? Fold case on the look-ahead as well, so an upper-case
+    // letter in the middle of a word does not end the word.
+    if (!len || raw_char(tolower((unsigned char) *word)) < 0)
+        return t->list[c].termno > 0 ? t->list[c].termno : 0;
+
+    if (!t->list[c].child)
+        return 0;
+    return word_trie_match(t->list[c].child, word, len, skipped);
}
+
static struct word_trie *build_word_trie(NMEM nmem, const char **terms)
{
struct word_trie *res = create_word_trie_node(nmem);
res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
bzero(res->doc_frequency_vec, res->vec_len * sizeof(int));
res->nmem = nmem;
- res->num_records = 0;
- res->records = nmem_malloc(nmem, numrecs * sizeof(struct relevance_record *));
res->wt = build_word_trie(nmem, terms);
return res;
}
-struct relevance_record *relevance_newrec(struct relevance *r, struct record *rec)
+// Give 'rec' a zeroed term frequency vector of r->vec_len ints, allocated
+// from r->nmem. A record that already has a vector is left untouched.
+void relevance_newrec(struct relevance *r, struct record *rec)
{
- struct relevance_record *res = nmem_malloc(r->nmem,
- sizeof(struct relevance_record));
- res->record = rec;
- res->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
- bzero(res->term_frequency_vec, r->vec_len * sizeof(int));
- return res;
+    if (rec->term_frequency_vec)
+        return;
+
+    rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
+    bzero(rec->term_frequency_vec, r->vec_len * sizeof(int));
}
-void relevance_countwords(struct relevance_record *rec, const char *words, int len)
+
+// Scan 'len' bytes of 'words' and update head->term_frequency_vec: slot 0 is
+// incremented once per word, and slot N once for every word that
+// word_trie_match() recognises as term number N. A word is a run of ASCII
+// letters; every other byte is a separator. 'head' must already have a term
+// vector (see relevance_newrec()).
+//
+// FIXME. The definition of a word is crude here.. should support
+// some form of localization mechanism?
+void relevance_countwords(struct relevance *r, struct record *head,
+        const char *words, int len)
{
+    while (len > 0)
+    {
+        int res;
+        int skipped = 0;
+
+        // Skip separators. The raw_char() result is tested directly as an
+        // int: storing -1 in a plain char breaks where char is unsigned.
+        while (len > 0 && raw_char(tolower((unsigned char) *words)) < 0)
+        {
+            words++;
+            len--;
+        }
+        if (len <= 0)
+            return;
+
+        if ((res = word_trie_match(r->wt, words, len, &skipped)))
+        {
+            words += skipped;
+            len -= skipped;
+            head->term_frequency_vec[res]++;
+        }
+        else
+        {
+            // Not a query term: consume the rest of this word
+            while (len > 0 && raw_char(tolower((unsigned char) *words)) >= 0)
+            {
+                words++;
+                len--;
+            }
+        }
+        // Slot 0 counts every word seen, matched or not
+        head->term_frequency_vec[0]++;
+    }
}
-void relevance_donerecord(struct relevance_record *rec)
+// Fold the term counts of 'head' into the document frequencies: every term
+// slot (1 .. vec_len-1) with a positive count in the record bumps the
+// matching document frequency slot, and slot 0 is bumped once per call.
+void relevance_donerecord(struct relevance *r, struct record *head)
{
+    int term = 1;
+
+    while (term < r->vec_len)
+    {
+        if (head->term_frequency_vec[term] > 0)
+            r->doc_frequency_vec[term]++;
+        term++;
+    }
+
+    r->doc_frequency_vec[0]++;
}
-// Prepare for a relevance-sorted read of up to num entries
-void relevance_prepare_read(struct relevance *r, int num)
+#ifdef FLOAT_REL
+// qsort() comparator over an array of struct record pointers: orders by
+// descending relevance, reporting the result as 1, -1 or 0.
+static int comp(const void *p1, const void *p2)
+{
+    struct record *first = *(struct record **) p1;
+    struct record *second = *(struct record **) p2;
+    float delta = second->relevance - first->relevance;
+
+    if (delta > 0)
+        return 1;
+    if (delta < 0)
+        return -1;
+    return 0;
+}
+#else
+// qsort() comparator over an array of struct record pointers: orders by
+// descending relevance by returning the difference of the two scores.
+static int comp(const void *p1, const void *p2)
{
+    struct record *first = *(struct record **) p1;
+    struct record *second = *(struct record **) p2;
+
+    return second->relevance - first->relevance;
}
+#endif
-struct record *relevance_read(struct relevance *r)
+// Prepare 'reclist' for a relevance-sorted read.
+//
+// For every term i an inverse document frequency log(N / df[i]) is computed
+// from rel->doc_frequency_vec (N is slot 0; terms with df 0 get idf 0). Each
+// record in reclist->flatlist is then scored as the sum over all terms of
+// (term count / word count) * idf, stored in rec->relevance scaled by 100000
+// and truncated to int. Finally the flat list is sorted by descending
+// relevance and the read pointer is reset to the first record.
+void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
{
- return 0;
+    int i;
+    float *idfvec = xmalloc(rel->vec_len * sizeof(float));
+
+    // Inverse document frequency per term; slot 0 of idfvec is never used
+    for (i = 1; i < rel->vec_len; i++)
+    {
+        if (!rel->doc_frequency_vec[i])
+            idfvec[i] = 0;
+        else
+            idfvec[i] = log((float) rel->doc_frequency_vec[0] / rel->doc_frequency_vec[i]);
+    }
+    // Calculate relevance for each document
+    for (i = 0; i < reclist->num_records; i++)
+    {
+        struct record *rec = reclist->flatlist[i];
+        const int *tf = rec->term_frequency_vec;
+        float relevance = 0;
+        int t;
+
+        // A record without a term vector, or with no words counted, scores 0
+        // (this also avoids dividing by a zero word count)
+        if (tf && tf[0])
+        {
+            for (t = 1; t < rel->vec_len; t++)
+            {
+                float termfreq = (float) tf[t] / tf[0];
+                relevance += termfreq * idfvec[t];
+            }
+        }
+        rec->relevance = (int) (relevance * 100000);
+    }
+    // The idf table is scratch space only: release it
+    xfree(idfvec);
+
+    qsort(reclist->flatlist, reclist->num_records, sizeof(struct record*), comp);
+    reclist->pointer = 0;
}
/*
#include <yaz/yaz-util.h>
#include "pazpar2.h"
+#include "reclists.h"
struct relevance;
-struct relevance_record;
struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs);
-struct relevance_record *relevance_newrec(struct relevance *r, struct record *rec);
-void relevance_countwords(struct relevance_record *rec, const char *words, int len);
-void relevance_donerecord(struct relevance_record *rec);
+void relevance_newrec(struct relevance *r, struct record *rec);
+void relevance_countwords(struct relevance *r, struct record *rec,
+ const char *words, int len);
+void relevance_donerecord(struct relevance *r, struct record *rec);
-void relevance_prepare_read(struct relevance *r, int num);
-struct record *relevance_read(struct relevance *r);
+void relevance_prepare_read(struct relevance *rel, struct reclist *rec);
#endif