dnl Zebra, Index Data Aps, 1995-2004
-dnl $Id: configure.in,v 1.91.2.2 2004-08-20 11:07:32 adam Exp $
+dnl $Id: configure.in,v 1.91.2.3 2004-09-16 14:07:48 adam Exp $
dnl
AC_INIT(include/zebraver.h)
AM_INIT_AUTOMAKE(idzebra,1.3.18)
doc/tkl.xsl
test/Makefile test/gils/Makefile test/usmarc/Makefile test/api/Makefile
test/rusmarc/Makefile test/cddb/Makefile test/malxml/Makefile
- test/config/Makefile
+ test/config/Makefile test/sort2/Makefile
perl/Makefile.PL test/xelm/Makefile
test/dmoz/Makefile test/xpath/Makefile test/sort/Makefile test/zsh/Makefile
test/marcxml/Makefile test/charmap/Makefile test/codec/Makefile
<chapter id="record-model">
- <!-- $Id: recordmodel.xml,v 1.18 2004-08-04 08:26:43 adam Exp $ -->
+ <!-- $Id: recordmodel.xml,v 1.17.2.1 2004-09-16 14:07:49 adam Exp $ -->
<title>The Record Model</title>
<para>
special-purpose fields such as WWW-style linkages (URx).
</para>
- <para>
- The field types, and hence character sets, are associated with data
- elements by the .abs files (see above).
- The file <literal>default.idx</literal>
- provides the association between field type codes (as used in the .abs
- files) and the character map files (with the .chr suffix). The format
- of the .idx file is as follows
- </para>
-
- <para>
- <variablelist>
-
- <varlistentry>
- <term>index <emphasis>field type code</emphasis></term>
- <listitem>
- <para>
- This directive introduces a new search index code.
- The argument is a one-character code to be used in the
- .abs files to select this particular index type. An index, roughly,
- corresponds to a particular structure attribute during search. Refer
- to <xref linkend="search"/>.
- </para>
- </listitem></varlistentry>
- <varlistentry>
- <term>sort <emphasis>field code type</emphasis></term>
- <listitem>
- <para>
- This directive introduces a
- sort index. The argument is a one-character code to be used in the
- .abs fie to select this particular index type. The corresponding
- use attribute must be used in the sort request to refer to this
- particular sort index. The corresponding character map (see below)
- is used in the sort process.
- </para>
- </listitem></varlistentry>
- <varlistentry>
- <term>completeness <emphasis>boolean</emphasis></term>
- <listitem>
- <para>
- This directive enables or disables complete field indexing.
- The value of the <emphasis>boolean</emphasis> should be 0
- (disable) or 1. If completeness is enabled, the index entry will
- contain the complete contents of the field (up to a limit), with words
- (non-space characters) separated by single space characters
- (normalized to " " on display). When completeness is
- disabled, each word is indexed as a separate entry. Complete subfield
- indexing is most useful for fields which are typically browsed (eg.
- titles, authors, or subjects), or instances where a match on a
- complete subfield is essential (eg. exact title searching). For fields
- where completeness is disabled, the search engine will interpret a
- search containing space characters as a word proximity search.
- </para>
- </listitem></varlistentry>
- <varlistentry>
- <term>charmap <emphasis>filename</emphasis></term>
- <listitem>
- <para>
- This is the filename of the character
- map to be used for this index for field type.
- </para>
- </listitem></varlistentry>
- </variablelist>
- </para>
-
- <para>
- The contents of the character map files are structured as follows:
- </para>
-
- <para>
- <variablelist>
-
- <varlistentry>
- <term>lowercase <emphasis>value-set</emphasis></term>
- <listitem>
- <para>
- This directive introduces the basic value set of the field type.
- The format is an ordered list (without spaces) of the
- characters which may occur in "words" of the given type.
- The order of the entries in the list determines the
- sort order of the index. In addition to single characters, the
- following combinations are legal:
- </para>
-
- <para>
-
- <itemizedlist>
- <listitem>
- <para>
- Backslashes may be used to introduce three-digit octal, or
- two-digit hex representations of single characters
- (preceded by <literal>x</literal>).
- In addition, the combinations
- \\, \\r, \\n, \\t, \\s (space — remember that real
- space-characters may not occur in the value definition), and
- \\ are recognized, with their usual interpretation.
- </para>
- </listitem>
-
- <listitem>
- <para>
- Curly braces {} may be used to enclose ranges of single
- characters (possibly using the escape convention described in the
- preceding point), eg. {a-z} to introduce the
- standard range of ASCII characters.
- Note that the interpretation of such a range depends on
- the concrete representation in your local, physical character set.
- </para>
- </listitem>
-
- <listitem>
- <para>
- paranthesises () may be used to enclose multi-byte characters -
- eg. diacritics or special national combinations (eg. Spanish
- "ll"). When found in the input stream (or a search term),
- these characters are viewed and sorted as a single character, with a
- sorting value depending on the position of the group in the value
- statement.
- </para>
- </listitem>
+ <sect3 id="default-idx-file">
+ <title>The default.idx file</title>
+ <para>
+ The field types, and hence character sets, are associated with data
+ elements by the .abs files (see above).
+ The file <literal>default.idx</literal>
+ provides the association between field type codes (as used in the .abs
+ files) and the character map files (with the .chr suffix). The format
+ of the .idx file is as follows:
+ </para>
- </itemizedlist>
+ <para>
+ <variablelist>
+
+ <varlistentry>
+ <term>index <emphasis>field type code</emphasis></term>
+ <listitem>
+ <para>
+ This directive introduces a new search index code.
+ The argument is a one-character code to be used in the
+ .abs files to select this particular index type. An index, roughly,
+ corresponds to a particular structure attribute during search. Refer
+ to <xref linkend="search"/>.
+ </para>
+ </listitem></varlistentry>
+ <varlistentry>
+ <term>sort <emphasis>field type code</emphasis></term>
+ <listitem>
+ <para>
+ This directive introduces a
+ sort index. The argument is a one-character code to be used in the
+ .abs file to select this particular index type. The corresponding
+ use attribute must be used in the sort request to refer to this
+ particular sort index. The corresponding character map (see below)
+ is used in the sort process.
+ </para>
+ </listitem></varlistentry>
+ <varlistentry>
+ <term>completeness <emphasis>boolean</emphasis></term>
+ <listitem>
+ <para>
+ This directive enables or disables complete field indexing.
+ The value of the <emphasis>boolean</emphasis> should be 0
+ (disable) or 1. If completeness is enabled, the index entry will
+ contain the complete contents of the field (up to a limit), with words
+ (non-space characters) separated by single space characters
+ (normalized to " " on display). When completeness is
+ disabled, each word is indexed as a separate entry. Complete subfield
+ indexing is most useful for fields which are typically browsed (eg.
+ titles, authors, or subjects), or instances where a match on a
+ complete subfield is essential (eg. exact title searching). For fields
+ where completeness is disabled, the search engine will interpret a
+ search containing space characters as a word proximity search.
+ </para>
+ </listitem></varlistentry>
+ <varlistentry>
+ <term>charmap <emphasis>filename</emphasis></term>
+ <listitem>
+ <para>
+ This is the filename of the character
+ map to be used for this index for field type.
+ </para>
+ </listitem></varlistentry>
+ </variablelist>
+ </para>
+ </sect3>
- </para>
- </listitem></varlistentry>
- <varlistentry>
- <term>uppercase <emphasis>value-set</emphasis></term>
- <listitem>
- <para>
- This directive introduces the
- upper-case equivalencis to the value set (if any). The number and
- order of the entries in the list should be the same as in the
- <literal>lowercase</literal> directive.
- </para>
- </listitem></varlistentry>
- <varlistentry>
- <term>space <emphasis>value-set</emphasis></term>
- <listitem>
- <para>
- This directive introduces the character
- which separate words in the input stream. Depending on the
- completeness mode of the field in question, these characters either
- terminate an index entry, or delimit individual "words" in
- the input stream. The order of the elements is not significant —
- otherwise the representation is the same as for the
- <literal>uppercase</literal> and <literal>lowercase</literal>
- directives.
- </para>
- </listitem></varlistentry>
- <varlistentry>
- <term>map <emphasis>value-set</emphasis>
- <emphasis>target</emphasis></term>
- <listitem>
- <para>
- This directive introduces a
- mapping between each of the members of the value-set on the left to
- the character on the right. The character on the right must occur in
- the value set (the <literal>lowercase</literal> directive) of
- the character set, but
- it may be a paranthesis-enclosed multi-octet character. This directive
- may be used to map diacritics to their base characters, or to map
- HTML-style character-representations to their natural form, etc.
- </para>
- </listitem></varlistentry>
- </variablelist>
- </para>
+ <sect3 id="character-map-files">
+ <title>The character map file format</title>
+ <para>
+ The contents of the character map files are structured as follows:
+ </para>
+ <para>
+ <variablelist>
+
+ <varlistentry>
+ <term>lowercase <emphasis>value-set</emphasis></term>
+ <listitem>
+ <para>
+ This directive introduces the basic value set of the field type.
+ The format is an ordered list (without spaces) of the
+ characters which may occur in "words" of the given type.
+ The order of the entries in the list determines the
+ sort order of the index. In addition to single characters, the
+ following combinations are legal:
+ </para>
+
+ <para>
+
+ <itemizedlist>
+ <listitem>
+ <para>
+ Backslashes may be used to introduce three-digit octal, or
+ two-digit hex representations of single characters
+ (preceded by <literal>x</literal>).
+ In addition, the combinations
+ \\, \\r, \\n, \\t, \\s (space — remember that real
+ space-characters may not occur in the value definition), and
+ \\ are recognized, with their usual interpretation.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Curly braces {} may be used to enclose ranges of single
+ characters (possibly using the escape convention described in the
+ preceding point), eg. {a-z} to introduce the
+ standard range of ASCII characters.
+ Note that the interpretation of such a range depends on
+ the concrete representation in your local, physical character set.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Parentheses () may be used to enclose multi-byte characters -
+ eg. diacritics or special national combinations (eg. Spanish
+ "ll"). When found in the input stream (or a search term),
+ these characters are viewed and sorted as a single character, with a
+ sorting value depending on the position of the group in the value
+ statement.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ </para>
+ </listitem></varlistentry>
+ <varlistentry>
+ <term>uppercase <emphasis>value-set</emphasis></term>
+ <listitem>
+ <para>
+ This directive introduces the
+ upper-case equivalents of the value set (if any). The number and
+ order of the entries in the list should be the same as in the
+ <literal>lowercase</literal> directive.
+ </para>
+ </listitem></varlistentry>
+ <varlistentry>
+ <term>space <emphasis>value-set</emphasis></term>
+ <listitem>
+ <para>
+ This directive introduces the characters
+ which separate words in the input stream. Depending on the
+ completeness mode of the field in question, these characters either
+ terminate an index entry, or delimit individual "words" in
+ the input stream. The order of the elements is not significant —
+ otherwise the representation is the same as for the
+ <literal>uppercase</literal> and <literal>lowercase</literal>
+ directives.
+ </para>
+ </listitem></varlistentry>
+ <varlistentry>
+ <term>map <emphasis>value-set</emphasis>
+ <emphasis>target</emphasis></term>
+ <listitem>
+ <para>
+ This directive introduces a
+ mapping between each of the members of the value-set on the left to
+ the character on the right. The character on the right must occur in
+ the value set (the <literal>lowercase</literal> directive) of
+ the character set, but
+ it may be a parenthesis-enclosed multi-octet character. This directive
+ may be used to map diacritics to their base characters, or to map
+ HTML-style character-representations to their natural form, etc. The map directive
+ can also be used to ignore leading articles in searching and/or sorting, and to perform
+ other special transformations. See section <xref linkend="leading-articles"/>.
+ </para>
+ </listitem></varlistentry>
+ </variablelist>
+ </para>
+ </sect3>
+ <sect3 id="leading-articles">
+ <title>Ignoring leading articles</title>
+ <para>
+ In addition to specifying sort orders, space (blank) handling, and upper/lowercase folding,
+ you can also use the character map files to make Zebra ignore leading articles in sorting
+ records, or when doing complete field searching.
+ </para>
+ <para>
+ This is done using the <literal>map</literal> directive in the character map file. In a
+ nutshell, what you do is map certain sequences of characters, when they occur <emphasis>
+ at the beginning of a field</emphasis>, to a space. Assuming that the character "@" is
+ defined as a space character in your file, you can do:
+ <screen>
+ map (^The\s) @
+ map (^the\s) @
+ </screen>
+ The effect of these directives is to map either 'the' or 'The', followed by a space
+ character, to a space. The hat ^ character denotes beginning-of-field only when
+ complete-subfield indexing or sort indexing is taking place; otherwise, it is treated just
+ as any other character.
+ </para>
+ <para>
+ Because the <literal>default.idx</literal> file can be used to associate different
+ character maps with different indexing types -- and you can create additional indexing
+ types, should the need arise -- it is possible to specify that leading articles should be
+ ignored either in sorting, in complete-field searching, or both.
+ </para>
+ <para>
+ If you ignore certain prefixes in sorting, then these will be eliminated from the index,
+ and sorting will take place as if they weren't there. However, if you set the system up
+ to ignore certain prefixes in <emphasis>searching</emphasis>, then these are deleted both
+ from the indexes and from query terms, when the client specifies complete-field
+ searching. This has the effect that a search for 'the science journal' and 'science
+ journal' would both produce the same results.
+ </para>
+ </sect3>
</sect2>
-
</sect1>
<sect1 id="formats">
-/* $Id: charmap.h,v 1.9 2004-07-28 09:47:41 adam Exp $
+/* $Id: charmap.h,v 1.9.2.1 2004-09-16 14:07:49 adam Exp $
Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
Index Data Aps
int map_only, const char *tabroot);
YAZ_EXPORT void chrmaptab_destroy (chrmaptab tab);
-YAZ_EXPORT const char **chr_map_input(chrmaptab t, const char **from, int len);
+YAZ_EXPORT const char **chr_map_input(chrmaptab t, const char **from, int len, int first);
YAZ_EXPORT const char **chr_map_input_x(chrmaptab t,
- const char **from, int *len);
+ const char **from, int *len, int first);
YAZ_EXPORT const char **chr_map_input_q(chrmaptab maptab,
const char **from, int len,
const char **qmap);
-/* $Id: zebramap.h,v 1.15 2004-07-28 09:47:41 adam Exp $
+/* $Id: zebramap.h,v 1.15.2.1 2004-09-16 14:07:49 adam Exp $
Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
Index Data Aps
void zebra_maps_close (ZebraMaps zm);
const char **zebra_maps_input (ZebraMaps zms, unsigned reg_id,
- const char **from, int len);
+ const char **from, int len, int first);
const char *zebra_maps_output(ZebraMaps, unsigned reg_id, const char **from);
int zebra_maps_attr (ZebraMaps zms, Z_AttributesPlusTerm *zapt,
-/* $Id: extract.c,v 1.158 2004-08-04 08:35:23 adam Exp $
+/* $Id: extract.c,v 1.157.2.1 2004-09-16 14:07:50 adam Exp $
Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
Index Data Aps
02111-1307, USA.
*/
+
#include <stdio.h>
#include <assert.h>
#include <ctype.h>
struct recKeys *reckeys,
int attrSetS, int attrUseS)
{
-#if IT_KEY_NEW
-/* #error searchRecordKey does not work yet in this mode.. */
- static const char *ws[32];
- void *decode_handle = iscz1_start();
- int off = 0;
- int startSeq = -1;
- int seqno = 0;
- int i;
-
- for (i = 0; i<32; i++)
- ws[i] = NULL;
-
- while (off < reckeys->buf_used)
- {
- const char *src = reckeys->buf + off;
- struct it_key key;
- char *dst = (char*) &key;
- int attrSet, attrUse;
-
- iscz1_decode(decode_handle, &dst, &src);
- assert(key.len < 4 && key.len > 2);
-
- attrSet = key.mem[0];
- attrUse = key.mem[1];
- seqno = key.mem[2];
-
- if (attrUseS == attrUse && attrSetS == attrSet)
- {
- int woff;
-
- if (startSeq == -1)
- startSeq = seqno;
- woff = seqno - startSeq;
- if (woff >= 0 && woff < 31)
- ws[woff] = src;
- }
-
- while (*src++)
- ;
- off = src - reckeys->buf;
- }
- iscz1_stop(decode_handle);
- assert (off == reckeys->buf_used);
- return ws;
-#else
static const char *ws[32];
int off = 0;
int startSeq = -1;
}
assert (off == reckeys->buf_used);
return ws;
-#endif
}
struct file_read_info {
int recordOffset;
struct recordGroup *rGroup;
};
-
-void create_rec_keys_codec(struct recKeys *keys)
-{
- keys->buf_used = 0;
-#if IT_KEY_NEW
- iscz1_reset(keys->codec_handle);
-#else
- keys->prevAttrUse = -1;
- keys->prevAttrSet = -1;
- keys->prevSeqNo = 0;
-#endif
-}
static int file_extract_record(ZebraHandle zh,
SYSNO *sysno, const char *fname,
/* we are going to read from a file, so prepare the extraction */
int i;
- create_rec_keys_codec(&zh->reg->keys);
-
+ zh->reg->keys.buf_used = 0;
+ zh->reg->keys.prevAttrUse = -1;
+ zh->reg->keys.prevAttrSet = -1;
+ zh->reg->keys.prevSeqNo = 0;
zh->reg->sortKeys.buf_used = 0;
recordOffset = fi->file_moffset;
{
rinfo = dict_lookup (zh->reg->matchDict, matchStr);
if (rinfo)
- {
- assert(*rinfo == sizeof(*sysno));
memcpy (sysno, rinfo+1, sizeof(*sysno));
- }
}
else
{
int delete_flag,
int test_mode,
const char *recordType,
- SYSNO *sysno,
+ int *sysno,
const char *match_criteria,
const char *fname,
int force_update,
extractCtrl.endf = zebra_record_int_end;
extractCtrl.fh = &fc;
- create_rec_keys_codec(&zh->reg->keys);
-
+ zh->reg->keys.buf_used = 0;
+ zh->reg->keys.prevAttrUse = -1;
+ zh->reg->keys.prevAttrSet = -1;
+ zh->reg->keys.prevSeqNo = 0;
zh->reg->sortKeys.buf_used = 0;
if (zebraExplain_curDatabase (zh->reg->zei, zh->basenames[0]))
if (matchStr) {
rinfo = dict_lookup (zh->reg->matchDict, matchStr);
if (rinfo)
- {
- assert(*rinfo == sizeof(*sysno));
memcpy (sysno, rinfo+1, sizeof(*sysno));
- }
}
}
abort ();
}
- create_rec_keys_codec(&zh->reg->keys);
-
+ zh->reg->keys.buf_used = 0;
+ zh->reg->keys.prevAttrUse = -1;
+ zh->reg->keys.prevAttrSet = -1;
+ zh->reg->keys.prevSeqNo = 0;
zh->reg->sortKeys.buf_used = 0;
extractCtrl.init = extract_init;
void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno,
int cmd, struct recKeys *reckeys)
{
-#if IT_KEY_NEW
- void *decode_handle = iscz1_start();
-#else
- int seqno = 0;
#if SU_SCHEME
#else
unsigned char attrSet = (unsigned char) -1;
unsigned short attrUse = (unsigned short) -1;
#endif
-#endif
+ int seqno = 0;
int off = 0;
int ch = 0;
ZebraExplainInfo zei = zh->reg->zei;
zh->reg->key_file_no = 0;
}
zebraExplain_recordCountIncrement (zei, cmd ? 1 : -1);
-#if IT_KEY_NEW
- while (off < reckeys->buf_used)
- {
- const char *src = reckeys->buf + off;
- struct it_key key;
- char *dst = (char*) &key;
- int attrSet, attrUse;
-
- iscz1_decode(decode_handle, &dst, &src);
- assert(key.len < 4 && key.len > 2);
-
- attrSet = key.mem[0];
- attrUse = key.mem[1]; /* sequence in mem[2] */
-
- if (zh->reg->key_buf_used + 1024 >
- (zh->reg->ptr_top -zh->reg->ptr_i)*sizeof(char*))
- extract_flushWriteKeys (zh,0);
- assert(zh->reg->ptr_i >= 0);
- ++(zh->reg->ptr_i);
- assert(zh->reg->ptr_i > 0);
- (zh->reg->key_buf)[zh->reg->ptr_top - zh->reg->ptr_i] =
- (char*)zh->reg->key_buf + zh->reg->key_buf_used;
-
- ch = zebraExplain_lookupSU (zei, attrSet, attrUse);
- if (ch < 0)
- ch = zebraExplain_addSU (zei, attrSet, attrUse);
-
- assert (ch > 0);
- zh->reg->key_buf_used +=
- key_SU_encode (ch,((char*)zh->reg->key_buf) +
- zh->reg->key_buf_used);
- while (*src)
- ((char*)zh->reg->key_buf) [(zh->reg->key_buf_used)++] = *src++;
- src++;
- ((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = '\0';
- ((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = cmd;
-
- key.len = 2;
- key.mem[0] = sysno;
- key.mem[1] = key.mem[2]; /* sequence .. */
-
- memcpy ((char*)zh->reg->key_buf + zh->reg->key_buf_used,
- &key, sizeof(key));
- (zh->reg->key_buf_used) += sizeof(key);
- off = src - reckeys->buf;
- }
-#else
while (off < reckeys->buf_used)
{
const char *src = reckeys->buf + off;
(zh->reg->key_buf_used) += sizeof(key);
off = src - reckeys->buf;
}
-#endif
assert (off == reckeys->buf_used);
-#if IT_KEY_NEW
- iscz1_stop(decode_handle);
-#endif
}
void extract_flushWriteKeys (ZebraHandle zh, int final)
zh->reg->key_buf_used = 0;
}
-void extract_add_index_string (RecWord *p, const char *str, int length)
+void extract_add_index_string (RecWord *p, const char *string,
+ int length)
{
char *dst;
- ZebraHandle zh = p->extractCtrl->handle;
- struct recKeys *keys = &zh->reg->keys;
-#if IT_KEY_NEW
- struct it_key key;
- const char *src = (char*) &key;
-#else
unsigned char attrSet;
unsigned short attrUse;
int lead = 0;
int diff = 0;
int *pseqno = &p->seqno;
+ ZebraHandle zh = p->extractCtrl->handle;
ZebraExplainInfo zei = zh->reg->zei;
-#endif
+ struct recKeys *keys = &zh->reg->keys;
if (keys->buf_used+1024 > keys->buf_max)
{
}
dst = keys->buf + keys->buf_used;
-#if IT_KEY_NEW
- key.len = 3;
- key.mem[0] = p->attrSet;
- key.mem[1] = p->attrUse;
- key.mem[2] = p->seqno;
-
-#if 0
- /* just for debugging .. */
- yaz_log(LOG_LOG, "set=%d use=%d seqno=%d", p->attrSet, p->attrUse,
- p->seqno);
-#endif
-
- iscz1_encode(keys->codec_handle, &dst, &src);
-
- *dst++ = p->reg_type;
- memcpy (dst, str, length);
- dst += length;
- *dst++ = '\0';
-#else
/* leader byte is encoded as follows:
bit 0 : 1 if attrset is unchanged; 0 if attrset is changed
bit 1 : 1 if attruse is unchanged; 0 if attruse is changed
lead |= 2;
else
keys->prevAttrUse = attrUse;
-
+#if 1
diff = 1 + *pseqno - keys->prevSeqNo;
if (diff >= 1 && diff <= 15)
lead |= (diff << 2);
else
diff = 0;
-
+#endif
keys->prevSeqNo = *pseqno;
*dst++ = lead;
}
#endif
*dst++ = p->reg_type;
- memcpy (dst, str, length);
+ memcpy (dst, string, length);
dst += length;
*dst++ = '\0';
memcpy (dst, pseqno, sizeof(*pseqno));
dst += sizeof(*pseqno);
}
-#endif
keys->buf_used = dst - keys->buf;
}
-static void extract_add_sort_string (RecWord *p, const char *str,
+static void extract_add_sort_string (RecWord *p, const char *string,
int length)
{
ZebraHandle zh = p->extractCtrl->handle;
off += key_SU_encode(p->attrSet, sk->buf + off);
off += key_SU_encode(p->attrUse, sk->buf + off);
off += key_SU_encode(length, sk->buf + off);
- memcpy (sk->buf + off, str, length);
+ memcpy (sk->buf + off, string, length);
sk->buf_used = off + length;
}
const char **map = 0;
if (remain > 0)
- map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+ map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, 0);
while (map)
{
{
remain = p->length - (b - p->string);
if (remain > 0)
- map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+ map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, 0);
else
map = 0;
}
buf[i++] = *(cp++);
remain = p->length - (b - p->string);
if (remain > 0)
- map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+ map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, 0);
else
map = 0;
}
char buf[IT_MAX_WORD+1];
const char **map = 0;
int i = 0, remain = p->length;
+ int first; /* first position */
+
+yaz_log(LOG_DEBUG, "Complete field, w='%s'", p->string);
if (remain > 0)
- map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, remain);
+ map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, remain, 1);
while (remain > 0 && i < IT_MAX_WORD)
{
remain = p->length - (b - p->string);
if (remain > 0)
- map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+ {
+ first = i ? 0 : 1;
+ map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, first);
+ }
else
map = 0;
}
{
if (i >= IT_MAX_WORD)
break;
+yaz_log(LOG_DEBUG, "Adding string to index '%d'", *map);
while (i < IT_MAX_WORD && *cp)
buf[i++] = *(cp++);
}
remain = p->length - (b - p->string);
if (remain > 0)
+ {
map = zebra_maps_input (p->zebra_maps, p->reg_type, &b,
- remain);
+ remain, 0);
+ }
else
map = 0;
}
i->prevseq=0;
i->prevcmd=-1;
i->keylen=0;
-#if IT_KEY_NEW
- i->encode_handle = iscz1_start();
-#endif
}
-#if IT_KEY_NEW
-#else
char *encode_key_int (int d, char *bp)
{
if (d <= 63)
}
return bp;
}
-#endif
-
#define OLDENCODE 1
#ifdef OLDENCODE
void encode_key_write (char *k, struct encode_info *i, FILE *outf)
{
struct it_key key;
- char *bp = i->buf, *bp0;
- const char *src = (char *) &key;
+ char *bp = i->buf;
- /* copy term to output buf */
while ((*bp++ = *k++))
;
- /* and copy & align key so we can mangle */
- memcpy (&key, k+1, sizeof(struct it_key)); /* *k is insert/delete */
-#if IT_KEY_NEW
- bp0 = bp++;
- iscz1_encode(i->encode_handle, &bp, &src);
- *bp0 = (*k * 128) + bp - bp0 - 1; /* length and insert/delete combined */
-#else
+ memcpy (&key, k+1, sizeof(struct it_key));
bp = encode_key_int ( (key.sysno - i->sysno) * 2 + *k, bp);
if (i->sysno != key.sysno)
{
bp = encode_key_int (key.seqno - i->seqno, bp);
i->seqno = key.seqno;
i->cmd = *k;
-#endif
if (fwrite (i->buf, bp - i->buf, 1, outf) != 1)
{
logf (LOG_FATAL|LOG_ERRNO, "fwrite");
void encode_key_flush (struct encode_info *i, FILE *outf)
{ /* dummy routine */
-#if IT_KEY_NEW
- iscz1_stop(i->encode_handle);
-#endif
}
#else
-/* $Id: zrpn.c,v 1.142 2004-08-04 08:35:23 adam Exp $
+/* $Id: zrpn.c,v 1.141.2.1 2004-09-16 14:07:50 adam Exp $
Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
Index Data Aps
static const char **rpn_char_map_handler (void *vp, const char **from, int len)
{
struct rpn_char_map_info *p = (struct rpn_char_map_info *) vp;
- const char **out = zebra_maps_input (p->zm, p->reg_type, from, len);
+ const char **out = zebra_maps_input (p->zm, p->reg_type, from, len, 0);
#if 0
if (out && *out)
{
#ifdef TERM_COUNT
int *term_no;
#endif
- ISAMC_P *isam_p_buf;
+ ISAMS_P *isam_p_buf;
int isam_p_size;
int isam_p_indx;
ZebraHandle zh;
{
if (p->isam_p_indx == p->isam_p_size)
{
- ISAMC_P *new_isam_p_buf;
+ ISAMS_P *new_isam_p_buf;
#ifdef TERM_COUNT
int *new_term_no;
#endif
p->isam_p_size = 2*p->isam_p_size + 100;
- new_isam_p_buf = (ISAMC_P *) xmalloc (sizeof(*new_isam_p_buf) *
+ new_isam_p_buf = (ISAMS_P *) xmalloc (sizeof(*new_isam_p_buf) *
p->isam_p_size);
if (p->isam_p_buf)
{
}
static int term_pre (ZebraMaps zebra_maps, int reg_type, const char **src,
- const char *ct1, const char *ct2)
+ const char *ct1, const char *ct2, int first)
{
const char *s1, *s0 = *src;
const char **map;
if (ct2 && strchr (ct2, *s0))
break;
s1 = s0;
- map = zebra_maps_input (zebra_maps, reg_type, &s1, strlen(s1));
+ map = zebra_maps_input (zebra_maps, reg_type, &s1, strlen(s1), first);
if (**map != *CHR_SPACE)
break;
s0 = s1;
const char *space_start = 0;
const char *space_end = 0;
- if (!term_pre (zebra_maps, reg_type, src, NULL, NULL))
+ if (!term_pre (zebra_maps, reg_type, src, NULL, NULL, !space_split))
return 0;
s0 = *src;
while (*s0)
{
s1 = s0;
- map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));
+ map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0);
if (space_split)
{
if (**map == *CHR_SPACE)
int i = 0;
int j = 0;
- if (!term_pre (zebra_maps, reg_type, src, "#", "#"))
+ if (!term_pre (zebra_maps, reg_type, src, "#", "#", !space_split))
return 0;
s0 = *src;
while (*s0)
else
{
s1 = s0;
- map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));
+ map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0);
if (space_split && **map == *CHR_SPACE)
break;
while (s1 < s0)
const char *s0, *s1;
const char **map;
- if (!term_pre (zebra_maps, reg_type, src, "^\\()[].*+?|", "("))
+ if (!term_pre (zebra_maps, reg_type, src, "^\\()[].*+?|", "(", !space_split))
return 0;
s0 = *src;
if (errors && *s0 == '+' && s0[1] && s0[2] == '+' && s0[3] &&
else
{
s1 = s0;
- map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));
+ map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0);
if (**map == *CHR_SPACE)
break;
while (s1 < s0)
int i = 0;
int j = 0;
- if (!term_pre (zebra_maps, reg_type, src, "?*#", "?*#"))
+ if (!term_pre (zebra_maps, reg_type, src, "?*#", "?*#", !space_split))
return 0;
s0 = *src;
while (*s0)
}
{
s1 = s0;
- map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));
+ map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0);
if (space_split && **map == *CHR_SPACE)
break;
while (s1 < s0)
int i = 0;
int j = 0;
- if (!term_pre (zebra_maps, reg_type, src, "*!", "*!"))
+ if (!term_pre (zebra_maps, reg_type, src, "*!", "*!", !space_split))
return 0;
s0 = *src;
while (*s0)
}
{
s1 = s0;
- map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));
+ map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0);
if (space_split && **map == *CHR_SPACE)
break;
while (s1 < s0)
while ((len = (cp_end - cp)) > 0)
{
- map = zebra_maps_input (zh->reg->zebra_maps, reg_type, &cp, len);
+ map = zebra_maps_input (zh->reg->zebra_maps, reg_type, &cp, len, 0);
if (**map == *CHR_SPACE)
space_map = *map;
else
{
char term_dst[IT_MAX_WORD+1];
RSET rset[60], result;
- int rset_no = 0;
+ int rset_no = 0;
struct grep_info grep_info;
char *termz = normalize_term(zh, zapt, termz_org, stream, reg_type);
const char *termp = termz;
RSET result;
RSFD rsfd;
struct it_key key;
- int sys;
rset_temp_parms parms;
parms.rset_term = rset_term_create (termz, -1, rank_type,
result = rset_create (rset_kind_temp, &parms);
rsfd = rset_open (result, RSETF_WRITE);
- sys = atoi(termz);
- if (sys <= 0)
- sys = 1;
-#if IT_KEY_NEW
- key.mem[0] = sys;
- key.mem[1] = 1;
- key.len = 2;
-#else
- key.sysno = sys;
+ key.sysno = atoi (termz);
key.seqno = 1;
if (key.sysno <= 0)
key.sysno = 1;
-#endif
rset_write (result, rsfd, &key);
rset_close (result, rsfd);
return result;
struct scan_info_entry {
char *term;
- ISAMC_P isam_p;
+ ISAMS_P isam_p;
};
struct scan_info {
scan_info->list[idx].term = (char *)
odr_malloc (scan_info->odr, strlen(name + len_prefix)+1);
strcpy (scan_info->list[idx].term, name + len_prefix);
- assert (*info == sizeof(ISAMC_P));
- memcpy (&scan_info->list[idx].isam_p, info+1, sizeof(ISAMC_P));
+ assert (*info == sizeof(ISAMS_P));
+ memcpy (&scan_info->list[idx].isam_p, info+1, sizeof(ISAMS_P));
return 0;
}
rfd = rset_open (r, RSETF_READ);
while (rset_read (r, rfd, &key, &term_index))
{
-#if IT_KEY_NEW
- if (key.mem[0] != psysno)
- {
- psysno = key.mem[0];
- (*count)++;
- }
-#else
if (key.sysno != psysno)
{
psysno = key.sysno;
(*count)++;
}
-#endif
kno++;
}
rset_close (r, rfd);
# Zebra indexes as referred to from the *.abs-files.
-# $Id: default.idx,v 1.10 2004-07-28 09:40:46 adam Exp $
+# $Id: default.idx,v 1.10.2.1 2004-09-16 14:07:50 adam Exp $
#
# Traditional word index
# Danish/Swedish character map.
#
-# $Id: scan.chr,v 1.1 1999-09-07 07:19:21 adam Exp $
+# $Id: scan.chr,v 1.1.6.1 2004-09-16 14:07:50 adam Exp $
# Define the basic value-set. *Beware* of changing this without re-indexing
# your databases.
map (Å) Å
map (Ö) Ö
+map (^the ) #
+map (^The ) #
+map (^a ) #
+map (^A ) #
+
map éÉ e
map á a
map ó o
-SUBDIRS=codec api gils malxml config usmarc dmoz xpath sort xelm cddb \
+SUBDIRS=codec api gils malxml config usmarc dmoz xpath sort sort2 xelm cddb \
rusmarc zsh marcxml charmap
--- /dev/null
+# $Id: Makefile.am,v 1.2.2.1 2004-09-16 14:07:51 adam Exp $
+
+check_SCRIPTS = test1.sh
+
+TESTS = $(check_SCRIPTS)
+
+EXTRA_DIST = zebra.cfg default.idx \
+ rec1.xml rec2.xml rec3.xml rec4.xml zebra.cfg my.abs sort.chr \
+ $(check_SCRIPTS)
+
--- /dev/null
+# Zebra indexes as referred to from the *.abs-files.
+# $Id: default.idx,v 1.1.2.1 2004-09-16 14:07:51 adam Exp $
+#
+
+# Traditional word index
+# Used if completeness is 'incomplete field' (@attr 6=1) and
+# structure is word/phrase/word-list/free-form-text/document-text
+index w
+completeness 0
+position 1
+charmap sort.chr
+
+# Phrase index
+# Used if completeness is 'complete {sub}field' (@attr 6=2, @attr 6=1)
+# and structure is word/phrase/word-list/free-form-text/document-text
+index p
+completeness 1
+charmap sort.chr
+
+# URX (URL) index
+# Used if structure=urx (@attr 4=104)
+index u
+completeness 0
+charmap urx.chr
+
+# Numeric index
+# Used if structure=numeric (@attr 4=109)
+index n
+completeness 0
+charmap numeric.chr
+
+# Null map index (no mapping at all)
+# Used if structure=key (@attr 4=3)
+index 0
+completeness 0
+position 1
+charmap @
+
+# Year
+# Used if structure=year (@attr 4=4)
+index y
+completeness 0
+charmap @
+
+# Date
+# Used if structure=date (@attr 4=5)
+index d
+completeness 0
+charmap @
+
+# Sort, with prefixes to ignore
+sort s
+completeness 1
+charmap sort.chr
+
--- /dev/null
+# $Id: my.abs,v 1.1.2.1 2004-09-16 14:07:51 adam Exp $
+
+name my
+reference WAIS-schema
+attset bib1.att
+tagset generic.tag
+xpath enable
+
+varset var1.var
+
+esetname B @
+esetname F @
+
+elm title Title !:p,!:w,!:s
--- /dev/null
+<my>
+ <title>first computer</title>
+</my>
--- /dev/null
+<my>
+ <title>second computer</title>
+</my>
--- /dev/null
+<my>
+ <title>A third computer</title>
+</my>
--- /dev/null
+<my>
+ <title>the fourth computer</title>
+</my>
--- /dev/null
+# character map that removes some leading prefixes
+#
+# $Id: sort.chr,v 1.2.2.1 2004-09-16 14:07:51 adam Exp $
+
+# Define the basic value-set. *Beware* of changing this without re-indexing
+# your databases.
+
+lowercase {0-9}{a-y}üzæäøöå
+uppercase {0-9}{A-Y}ÜZÆÄØÖÅ
+
+# Breaking characters
+
+space {\001-\040}!"#$%&'\()*+,-./:;<=>?@\[\\]^_`\{|}~
+
+# Characters to be considered equivalent for searching purposes.
+
+# equivalent æä(ae)
+# equivalent øö(oe)
+# equivalent å(aa)
+# equivalent uü
+
+map (^The\s) @
+map (^the\s) @
+map (^a\s) @
+map (^A\s) @
+
+#map éÉ e
+#map á a
+#map ó o
+#map í i
+
+#map (Aa) (AA)
+
+#map (aa) a
--- /dev/null
+#!/bin/sh
+# $Id: test1.sh,v 1.1.2.1 2004-09-16 14:07:51 adam Exp $
+
+pp=${srcdir:-"."}
+
+ulimit -c 10000
+LOG=test1.log
+rm -f $LOG
+rm -fr lock
+mkdir lock
+rm -fr reg
+mkdir reg
+rm -fr recs
+mkdir recs
+cp $pp/rec*.xml recs
+../../index/zebraidx -c $pp/zebra.cfg -l $LOG update recs || exit 1
+../../index/zebrasrv -c $pp/zebra.cfg -l $LOG unix:socket &
+sleep 1
+test -f lock/zebrasrv.pid || exit 2
+../api/testclient -n4 unix:socket '@or computer @attr 7=1 @attr 1=4 0' >tmp1
+
+kill `cat lock/zebrasrv.pid`
+
+echo 'Result count: 4
+my:
+ title: first computer
+my:
+ title: the fourth computer
+my:
+ title: second computer
+my:
+ title: A third computer' >tmp2
+
+diff tmp1 tmp2
--- /dev/null
+# Simple Zebra configuration file
+# $Id: zebra.cfg,v 1.1.2.1 2004-09-16 14:07:51 adam Exp $
+#
+# Where the schema files, attribute files, etc are located.
+profilePath: ${srcdir:-.}:${srcdir:-.}/../../tab
+
+# Files that describe the attribute sets supported.
+attset: bib1.att
+attset: explain.att
+
+recordtype.xml: grs.sgml
+lockdir: lock
+register: reg:20M
+isam: b
-/* $Id: charmap.c,v 1.29.2.1 2004-08-06 10:08:19 adam Exp $
+/* $Id: charmap.c,v 1.29.2.2 2004-09-16 14:07:51 adam Exp $
Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
Index Data Aps
#define CHR_MAXSTR 1024
#define CHR_MAXEQUIV 32
+const unsigned char CHR_FIELD_BEGIN = '^';
+
const char *CHR_UNKNOWN = "\001";
const char *CHR_SPACE = "\002";
const char *CHR_BASE = "\003";
return t->target ? t : 0;
}
-static chr_t_entry *find_entry_x(chr_t_entry *t, const char **from, int *len)
+static chr_t_entry *find_entry_x(chr_t_entry *t, const char **from, int *len, int first)
{
chr_t_entry *res;
from++;
len++;
}
- if (*len > 0 && t->children && t->children[(unsigned char) **from])
+ if (*len > 0 && t->children)
{
const char *old_from = *from;
int old_len = *len;
+
+ res = 0;
+
+ if (first && t->children[CHR_FIELD_BEGIN])
+ {
+ if ((res = find_entry_x(t->children[CHR_FIELD_BEGIN], from, len, 0)) && res != t->children[CHR_FIELD_BEGIN])
+ return res;
+ else
+ res = 0;
+ /* otherwise there was no match on beginning of field, move on */
+ }
- (*len)--;
- (*from)++;
- if ((res = find_entry_x(t->children[(unsigned char) *old_from],
- from, len)))
- return res;
- /* no match */
- *len = old_len;
- *from = old_from;
+ if (!res && t->children[(unsigned char) **from])
+ {
+ (*len)--;
+ (*from)++;
+ if ((res = find_entry_x(t->children[(unsigned char) *old_from],
+ from, len, 0)))
+ return res;
+ /* no match */
+ *len = old_len;
+ *from = old_from;
+ }
}
/* no children match. use ourselves, if we have a target */
return t->target ? t : 0;
}
-const char **chr_map_input_x(chrmaptab maptab, const char **from, int *len)
+const char **chr_map_input_x(chrmaptab maptab, const char **from, int *len, int first)
{
chr_t_entry *t = maptab->input;
chr_t_entry *res;
- if (!(res = find_entry_x(t, from, len)))
+ if (!(res = find_entry_x(t, from, len, first)))
abort();
return (const char **) (res->target);
}
-const char **chr_map_input(chrmaptab maptab, const char **from, int len)
+const char **chr_map_input(chrmaptab maptab, const char **from, int len, int first)
{
chr_t_entry *t = maptab->input;
chr_t_entry *res;
len_tmp[0] = len;
len_tmp[1] = -1;
- if (!(res = find_entry_x(t, from, len_tmp)))
+ if (!(res = find_entry_x(t, from, len_tmp, first)))
abort();
return (const char **) (res->target);
}
ucs4_t i = 0;
char fmtstr[8];
- yaz_log (LOG_DEBUG, "prim %.3s", (char *) *s);
+ yaz_log (LOG_DEBUG, "prim_w %.3s", (char *) *s);
if (**s == '\\')
{
(*s)++;
chrwork *arg = (chrwork *) data;
const char **res, *p = s;
- res = chr_map_input(arg->map, &s, strlen(s));
+ res = chr_map_input(arg->map, &s, strlen(s), 0);
if (*res == (char*) CHR_UNKNOWN)
logf(LOG_WARN, "Map: '%s' has no mapping", p);
strncat(arg->string, *res, CHR_MAXSTR - strlen(arg->string));
char str[1024];
ucs4_t arg[512];
+ ucs4_t arg_prim[512];
ucs4_t *s0, *s = arg;
ucs4_t c, begin, end;
size_t i;
case '[': s++; abort(); break;
case '(':
++s;
- s0 = s;
- while (*s != ')' || s[-1] == '\\')
- s++;
- *s = 0;
- if (scan_to_utf8 (t_utf8, s0, s - s0, str, sizeof(str)-1))
+ s0 = s; i = 0;
+ while (*s != ')' || s[-1] == '\\')
+ arg_prim[i++] = zebra_prim_w(&s);
+ arg_prim[i] = 0;
+ if (scan_to_utf8 (t_utf8, arg_prim, zebra_ucs4_strlen(arg_prim), str, sizeof(str)-1))
return -1;
(*fun)(str, data, num ? (*num)++ : 0);
s++;
-/* $Id: zebramap.c,v 1.32 2004-06-16 20:30:47 adam Exp $
+/* $Id: zebramap.c,v 1.32.2.1 2004-09-16 14:07:51 adam Exp $
Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
Index Data Aps
}
const char **zebra_maps_input (ZebraMaps zms, unsigned reg_id,
- const char **from, int len)
+ const char **from, int len, int first)
{
chrmaptab maptab;
maptab = zebra_charmap_get (zms, reg_id);
if (maptab)
- return chr_map_input(maptab, from, len);
+ return chr_map_input(maptab, from, len, first);
zms->temp_map_str[0] = **from;