Merge from head the facility that removes leading articles.

author Adam Dickmeiss <adam@indexdata.dk>

Thu, 16 Sep 2004 14:07:48 +0000 (14:07 +0000)

committer Adam Dickmeiss <adam@indexdata.dk>

Thu, 16 Sep 2004 14:07:48 +0000 (14:07 +0000)
author Adam Dickmeiss <adam@indexdata.dk>
Thu, 16 Sep 2004 14:07:48 +0000 (14:07 +0000)
committer Adam Dickmeiss <adam@indexdata.dk>
Thu, 16 Sep 2004 14:07:48 +0000 (14:07 +0000)
diff --git a/configure.in b/configure.in

index 5dc565f..345c77a 100644 (file)
--- a/configure.in
+++ b/configure.in
@@ -1,5 +1,5 @@
  dnl Zebra, Index Data Aps, 1995-2004
-dnl $Id: configure.in,v 1.91.2.2 2004-08-20 11:07:32 adam Exp $
+dnl $Id: configure.in,v 1.91.2.3 2004-09-16 14:07:48 adam Exp $
  dnl
  AC_INIT(include/zebraver.h)
  AM_INIT_AUTOMAKE(idzebra,1.3.18)
@@ -387,7 +387,7 @@ AC_OUTPUT([
    doc/tkl.xsl
    test/Makefile test/gils/Makefile test/usmarc/Makefile test/api/Makefile
    test/rusmarc/Makefile test/cddb/Makefile test/malxml/Makefile 
-  test/config/Makefile
+  test/config/Makefile test/sort2/Makefile
    perl/Makefile.PL test/xelm/Makefile
    test/dmoz/Makefile test/xpath/Makefile test/sort/Makefile test/zsh/Makefile
    test/marcxml/Makefile test/charmap/Makefile test/codec/Makefile
diff --git a/doc/recordmodel.xml b/doc/recordmodel.xml

index e052f12..b83fa67 100644 (file)
--- a/doc/recordmodel.xml
+++ b/doc/recordmodel.xml
@@ -1,5 +1,5 @@
   <chapter id="record-model">
-  <!-- $Id: recordmodel.xml,v 1.18 2004-08-04 08:26:43 adam Exp $ -->
+  <!-- $Id: recordmodel.xml,v 1.17.2.1 2004-09-16 14:07:49 adam Exp $ -->
    <title>The Record Model</title>
    
    <para>
@@ -1786,174 +1786,216 @@
       special-purpose fields such as WWW-style linkages (URx).
      </para>
  
-    <para>
-     The field types, and hence character sets, are associated with data
-     elements by the .abs files (see above).
-     The file <literal>default.idx</literal>
-     provides the association between field type codes (as used in the .abs
-     files) and the character map files (with the .chr suffix). The format
-     of the .idx file is as follows
-    </para>
-
-    <para>
-     <variablelist>
-
-      <varlistentry>
-       <term>index <emphasis>field type code</emphasis></term>
-       <listitem>
-        <para>
-         This directive introduces a new search index code.
-         The argument is a one-character code to be used in the
-         .abs files to select this particular index type. An index, roughly,
-         corresponds to a particular structure attribute during search. Refer
-         to <xref linkend="search"/>.
-        </para>
-       </listitem></varlistentry>
-      <varlistentry>
-       <term>sort <emphasis>field code type</emphasis></term>
-       <listitem>
-        <para>
-         This directive introduces a 
-         sort index. The argument is a one-character code to be used in the
-         .abs fie to select this particular index type. The corresponding
-         use attribute must be used in the sort request to refer to this
-         particular sort index. The corresponding character map (see below)
-         is used in the sort process.
-        </para>
-       </listitem></varlistentry>
-      <varlistentry>
-       <term>completeness <emphasis>boolean</emphasis></term>
-       <listitem>
-        <para>
-         This directive enables or disables complete field indexing.
-         The value of the <emphasis>boolean</emphasis> should be 0
-         (disable) or 1. If completeness is enabled, the index entry will
-         contain the complete contents of the field (up to a limit), with words
-         (non-space characters) separated by single space characters
-         (normalized to " " on display). When completeness is
-         disabled, each word is indexed as a separate entry. Complete subfield
-         indexing is most useful for fields which are typically browsed (eg.
-         titles, authors, or subjects), or instances where a match on a
-         complete subfield is essential (eg. exact title searching). For fields
-         where completeness is disabled, the search engine will interpret a
-         search containing space characters as a word proximity search.
-        </para>
-       </listitem></varlistentry>
-      <varlistentry>
-       <term>charmap <emphasis>filename</emphasis></term>
-       <listitem>
-        <para>
-         This is the filename of the character
-         map to be used for this index for field type.
-        </para>
-       </listitem></varlistentry>
-     </variablelist>
-    </para>
-
-    <para>
-     The contents of the character map files are structured as follows:
-    </para>
-
-    <para>
-     <variablelist>
-
-      <varlistentry>
-       <term>lowercase <emphasis>value-set</emphasis></term>
-       <listitem>
-        <para>
-         This directive introduces the basic value set of the field type.
-         The format is an ordered list (without spaces) of the
-         characters which may occur in "words" of the given type.
-         The order of the entries in the list determines the
-         sort order of the index. In addition to single characters, the
-         following combinations are legal:
-        </para>
-
-        <para>
-
-         <itemizedlist>
-          <listitem>
-           <para>
-            Backslashes may be used to introduce three-digit octal, or
-            two-digit hex representations of single characters
-            (preceded by <literal>x</literal>).
-            In addition, the combinations
-            \\, \\r, \\n, \\t, \\s (space &mdash; remember that real
-            space-characters may not occur in the value definition), and
-            \\ are recognized, with their usual interpretation.
-           </para>
-          </listitem>
-
-          <listitem>
-           <para>
-            Curly braces {} may be used to enclose ranges of single
-            characters (possibly using the escape convention described in the
-            preceding point), eg. {a-z} to introduce the
-            standard range of ASCII characters.
-            Note that the interpretation of such a range depends on
-            the concrete representation in your local, physical character set.
-           </para>
-          </listitem>
-
-          <listitem>
-           <para>
-            paranthesises () may be used to enclose multi-byte characters -
-            eg. diacritics or special national combinations (eg. Spanish
-            "ll"). When found in the input stream (or a search term),
-            these characters are viewed and sorted as a single character, with a
-            sorting value depending on the position of the group in the value
-            statement.
-           </para>
-          </listitem>
+    <sect3 id="default-idx-file">
+     <title>The default.idx file</title>
+     <para>
+      The field types, and hence character sets, are associated with data
+      elements by the .abs files (see above).
+      The file <literal>default.idx</literal>
+      provides the association between field type codes (as used in the .abs
+      files) and the character map files (with the .chr suffix). The format
+      of the .idx file is as follows
+     </para>
  
-         </itemizedlist>
+     <para>
+      <variablelist>
+
+       <varlistentry>
+       <term>index <emphasis>field type code</emphasis></term>
+       <listitem>
+        <para>
+         This directive introduces a new search index code.
+         The argument is a one-character code to be used in the
+         .abs files to select this particular index type. An index, roughly,
+         corresponds to a particular structure attribute during search. Refer
+         to <xref linkend="search"/>.
+        </para>
+       </listitem></varlistentry>
+       <varlistentry>
+       <term>sort <emphasis>field code type</emphasis></term>
+       <listitem>
+        <para>
+         This directive introduces a 
+         sort index. The argument is a one-character code to be used in the
+         .abs file to select this particular index type. The corresponding
+         use attribute must be used in the sort request to refer to this
+         particular sort index. The corresponding character map (see below)
+         is used in the sort process.
+        </para>
+       </listitem></varlistentry>
+       <varlistentry>
+       <term>completeness <emphasis>boolean</emphasis></term>
+       <listitem>
+        <para>
+         This directive enables or disables complete field indexing.
+         The value of the <emphasis>boolean</emphasis> should be 0
+         (disable) or 1. If completeness is enabled, the index entry will
+         contain the complete contents of the field (up to a limit), with words
+         (non-space characters) separated by single space characters
+         (normalized to " " on display). When completeness is
+         disabled, each word is indexed as a separate entry. Complete subfield
+         indexing is most useful for fields which are typically browsed (eg.
+         titles, authors, or subjects), or instances where a match on a
+         complete subfield is essential (eg. exact title searching). For fields
+         where completeness is disabled, the search engine will interpret a
+         search containing space characters as a word proximity search.
+        </para>
+       </listitem></varlistentry>
+       <varlistentry>
+       <term>charmap <emphasis>filename</emphasis></term>
+       <listitem>
+        <para>
+         This is the filename of the character
+         map to be used for this index for field type.
+        </para>
+       </listitem></varlistentry>
+      </variablelist>
+     </para>
+    </sect3>
  
-        </para>
-       </listitem></varlistentry>
-      <varlistentry>
-       <term>uppercase <emphasis>value-set</emphasis></term>
-       <listitem>
-        <para>
-         This directive introduces the
-         upper-case equivalencis to the value set (if any). The number and
-         order of the entries in the list should be the same as in the
-         <literal>lowercase</literal> directive.
-        </para>
-       </listitem></varlistentry>
-      <varlistentry>
-       <term>space <emphasis>value-set</emphasis></term>
-       <listitem>
-        <para>
-         This directive introduces the character
-         which separate words in the input stream. Depending on the
-         completeness mode of the field in question, these characters either
-         terminate an index entry, or delimit individual "words" in
-         the input stream. The order of the elements is not significant &mdash;
-         otherwise the representation is the same as for the
-         <literal>uppercase</literal> and <literal>lowercase</literal>
-         directives.
-        </para>
-       </listitem></varlistentry>
-      <varlistentry>
-       <term>map <emphasis>value-set</emphasis>
-        <emphasis>target</emphasis></term>
-       <listitem>
-        <para>
-         This directive introduces a
-         mapping between each of the members of the value-set on the left to
-         the character on the right. The character on the right must occur in
-         the value set (the <literal>lowercase</literal> directive) of
-         the character set, but
-         it may be a paranthesis-enclosed multi-octet character. This directive
-         may be used to map diacritics to their base characters, or to map
-         HTML-style character-representations to their natural form, etc.
-        </para>
-       </listitem></varlistentry>
-     </variablelist>
-    </para>
+    <sect3 id="character-map-files">
+     <title>The character map file format</title>
+     <para>
+      The contents of the character map files are structured as follows:
+     </para>
  
+     <para>
+      <variablelist>
+
+       <varlistentry>
+       <term>lowercase <emphasis>value-set</emphasis></term>
+       <listitem>
+        <para>
+         This directive introduces the basic value set of the field type.
+         The format is an ordered list (without spaces) of the
+         characters which may occur in "words" of the given type.
+         The order of the entries in the list determines the
+         sort order of the index. In addition to single characters, the
+         following combinations are legal:
+        </para>
+
+        <para>
+
+         <itemizedlist>
+          <listitem>
+           <para>
+            Backslashes may be used to introduce three-digit octal, or
+            two-digit hex representations of single characters
+            (preceded by <literal>x</literal>).
+            In addition, the combinations
+            \\, \\r, \\n, \\t, \\s (space &mdash; remember that real
+            space-characters may not occur in the value definition), and
+            \\ are recognized, with their usual interpretation.
+           </para>
+          </listitem>
+
+          <listitem>
+           <para>
+            Curly braces {} may be used to enclose ranges of single
+            characters (possibly using the escape convention described in the
+            preceding point), eg. {a-z} to introduce the
+            standard range of ASCII characters.
+            Note that the interpretation of such a range depends on
+            the concrete representation in your local, physical character set.
+           </para>
+          </listitem>
+
+          <listitem>
+           <para>
+            Parentheses () may be used to enclose multi-byte characters -
+            eg. diacritics or special national combinations (eg. Spanish
+            "ll"). When found in the input stream (or a search term),
+            these characters are viewed and sorted as a single character, with a
+            sorting value depending on the position of the group in the value
+            statement.
+           </para>
+          </listitem>
+
+         </itemizedlist>
+
+        </para>
+       </listitem></varlistentry>
+       <varlistentry>
+       <term>uppercase <emphasis>value-set</emphasis></term>
+       <listitem>
+        <para>
+         This directive introduces the
+         upper-case equivalents of the value set (if any). The number and
+         order of the entries in the list should be the same as in the
+         <literal>lowercase</literal> directive.
+        </para>
+       </listitem></varlistentry>
+       <varlistentry>
+       <term>space <emphasis>value-set</emphasis></term>
+       <listitem>
+        <para>
+         This directive introduces the characters
+         which separate words in the input stream. Depending on the
+         completeness mode of the field in question, these characters either
+         terminate an index entry, or delimit individual "words" in
+         the input stream. The order of the elements is not significant &mdash;
+         otherwise the representation is the same as for the
+         <literal>uppercase</literal> and <literal>lowercase</literal>
+         directives.
+        </para>
+       </listitem></varlistentry>
+       <varlistentry>
+       <term>map <emphasis>value-set</emphasis>
+        <emphasis>target</emphasis></term>
+       <listitem>
+        <para>
+         This directive introduces a
+         mapping between each of the members of the value-set on the left to
+         the character on the right. The character on the right must occur in
+         the value set (the <literal>lowercase</literal> directive) of
+         the character set, but
+         it may be a parenthesis-enclosed multi-octet character. This directive
+         may be used to map diacritics to their base characters, or to map
+         HTML-style character-representations to their natural form, etc. The map directive
+         can also be used to ignore leading articles in searching and/or sorting, and to perform
+         other special transformations. See section <xref linkend="leading-articles"/>.
+        </para>
+       </listitem></varlistentry>
+      </variablelist>
+     </para>
+    </sect3>
+    <sect3 id="leading-articles">
+     <title>Ignoring leading articles</title>
+     <para>
+      In addition to specifying sort orders, space (blank) handling, and upper/lowercase folding,
+      you can also use the character map files to make Zebra ignore leading articles in sorting
+      records, or when doing complete field searching.
+     </para>
+     <para>
+      This is done using the <literal>map</literal> directive in the character map file. In a
+      nutshell, what you do is map certain sequences of characters, when they occur <emphasis>
+      at the beginning of a field</emphasis>, to a space. Assuming that the character "@" is
+      defined as a space character in your file, you can do:
+      <screen>
+       map (^The\s) @
+       map (^the\s) @
+      </screen>
+      The effect of these directives is to map either 'the' or 'The', followed by a space
+      character, to a space. The hat ^ character denotes beginning-of-field only when
+      complete-subfield indexing or sort indexing is taking place; otherwise, it is treated just
+      as any other character.
+     </para>
+     <para>
+      Because the <literal>default.idx</literal> file can be used to associate different
+      character maps with different indexing types -- and you can create additional indexing
+      types, should the need arise -- it is possible to specify that leading articles should be
+      ignored either in sorting, in complete-field searching, or both.
+     </para>
+     <para>
+      If you ignore certain prefixes in sorting, then these will be eliminated from the index,
+      and sorting will take place as if they weren't there. However, if you set the system up
+      to ignore certain prefixes in <emphasis>searching</emphasis>, then these are deleted both
+      from the indexes and from query terms, when the client specifies complete-field
+      searching. This has the effect that a search for 'the science journal' and 'science
+      journal' would both produce the same results.
+     </para>
+    </sect3>
     </sect2>
-
    </sect1>
  
    <sect1 id="formats">
diff --git a/include/charmap.h b/include/charmap.h

index 365b6ab..facf776 100644 (file)
--- a/include/charmap.h
+++ b/include/charmap.h
@@ -1,4 +1,4 @@
-/* $Id: charmap.h,v 1.9 2004-07-28 09:47:41 adam Exp $
+/* $Id: charmap.h,v 1.9.2.1 2004-09-16 14:07:49 adam Exp $
     Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
     Index Data Aps
  
@@ -43,9 +43,9 @@ YAZ_EXPORT chrmaptab chrmaptab_create(const char *tabpath, const char *name,
                                       int map_only, const char *tabroot);
  YAZ_EXPORT void chrmaptab_destroy (chrmaptab tab);
  
-YAZ_EXPORT const char **chr_map_input(chrmaptab t, const char **from, int len);
+YAZ_EXPORT const char **chr_map_input(chrmaptab t, const char **from, int len, int first);
  YAZ_EXPORT const char **chr_map_input_x(chrmaptab t,
-                                       const char **from, int *len);
+                                       const char **from, int *len, int first);
  YAZ_EXPORT const char **chr_map_input_q(chrmaptab maptab,
                                         const char **from, int len,
                                         const char **qmap);
diff --git a/include/zebramap.h b/include/zebramap.h

index 62845c0..60e8ebc 100644 (file)
--- a/include/zebramap.h
+++ b/include/zebramap.h
@@ -1,4 +1,4 @@
-/* $Id: zebramap.h,v 1.15 2004-07-28 09:47:41 adam Exp $
+/* $Id: zebramap.h,v 1.15.2.1 2004-09-16 14:07:49 adam Exp $
     Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
     Index Data Aps
  
@@ -34,7 +34,7 @@ ZebraMaps zebra_maps_open (Res res, const char *base);
  void zebra_maps_close (ZebraMaps zm);
  
  const char **zebra_maps_input (ZebraMaps zms, unsigned reg_id,
-                              const char **from, int len);
+                              const char **from, int len, int first);
  const char *zebra_maps_output(ZebraMaps, unsigned reg_id, const char **from);
  
  int zebra_maps_attr (ZebraMaps zms, Z_AttributesPlusTerm *zapt,
diff --git a/index/extract.c b/index/extract.c

index 9561c6e..1b8fdb7 100644 (file)
--- a/index/extract.c
+++ b/index/extract.c
@@ -1,4 +1,4 @@
-/* $Id: extract.c,v 1.158 2004-08-04 08:35:23 adam Exp $
+/* $Id: extract.c,v 1.157.2.1 2004-09-16 14:07:50 adam Exp $
     Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
     Index Data Aps
  
@@ -20,6 +20,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  02111-1307, USA.
  */
  
+
  #include <stdio.h>
  #include <assert.h>
  #include <ctype.h>
@@ -92,51 +93,6 @@ static const char **searchRecordKey (ZebraHandle zh,
                                       struct recKeys *reckeys,
                                      int attrSetS, int attrUseS)
  {
-#if IT_KEY_NEW
-/* #error searchRecordKey does not work yet in this mode.. */
-    static const char *ws[32];
-    void *decode_handle = iscz1_start();
-    int off = 0;
-    int startSeq = -1;
-    int seqno = 0;
-    int i;
-
-    for (i = 0; i<32; i++)
-        ws[i] = NULL;
-
-    while (off < reckeys->buf_used)
-    {
-        const char *src = reckeys->buf + off;
-        struct it_key key;
-       char *dst = (char*) &key;
-       int attrSet, attrUse;
-
-       iscz1_decode(decode_handle, &dst, &src);
-       assert(key.len < 4 && key.len > 2);
-
-       attrSet = key.mem[0];
-       attrUse = key.mem[1];
-       seqno = key.mem[2];
-
-       if (attrUseS == attrUse && attrSetS == attrSet)
-        {
-            int woff;
-
-            if (startSeq == -1)
-                startSeq = seqno;
-            woff = seqno - startSeq;
-            if (woff >= 0 && woff < 31)
-                ws[woff] = src;
-        }
-
-        while (*src++)
-           ;
-        off = src - reckeys->buf;
-    }
-    iscz1_stop(decode_handle);
-    assert (off == reckeys->buf_used);
-    return ws;
-#else
      static const char *ws[32];
      int off = 0;
      int startSeq = -1;
@@ -215,7 +171,6 @@ static const char **searchRecordKey (ZebraHandle zh,
      }
      assert (off == reckeys->buf_used);
      return ws;
-#endif
  }
  
  struct file_read_info {
@@ -460,18 +415,6 @@ struct recordLogInfo {
      int recordOffset;
      struct recordGroup *rGroup;
  };
-
-void create_rec_keys_codec(struct recKeys *keys)
-{
-    keys->buf_used = 0;
-#if IT_KEY_NEW
-    iscz1_reset(keys->codec_handle);
-#else
-    keys->prevAttrUse = -1;
-    keys->prevAttrSet = -1;
-    keys->prevSeqNo = 0;
-#endif
-}
       
  static int file_extract_record(ZebraHandle zh,
                                SYSNO *sysno, const char *fname,
@@ -512,8 +455,10 @@ static int file_extract_record(ZebraHandle zh,
          /* we are going to read from a file, so prepare the extraction */
         int i;
  
-       create_rec_keys_codec(&zh->reg->keys);
-
+       zh->reg->keys.buf_used = 0;
+       zh->reg->keys.prevAttrUse = -1;
+       zh->reg->keys.prevAttrSet = -1;
+       zh->reg->keys.prevSeqNo = 0;
         zh->reg->sortKeys.buf_used = 0;
         
         recordOffset = fi->file_moffset;
@@ -606,10 +551,7 @@ static int file_extract_record(ZebraHandle zh,
              {
                  rinfo = dict_lookup (zh->reg->matchDict, matchStr);
                  if (rinfo)
-               {
-                   assert(*rinfo == sizeof(*sysno));
                      memcpy (sysno, rinfo+1, sizeof(*sysno));
-               }
              }
              else
              {
@@ -900,7 +842,7 @@ int buffer_extract_record (ZebraHandle zh,
                            int delete_flag,
                            int test_mode, 
                            const char *recordType,
-                          SYSNO *sysno,
+                          int *sysno,
                            const char *match_criteria,
                            const char *fname,
                            int force_update,
@@ -935,8 +877,10 @@ int buffer_extract_record (ZebraHandle zh,
      extractCtrl.endf = zebra_record_int_end;
      extractCtrl.fh = &fc;
  
-    create_rec_keys_codec(&zh->reg->keys);
-
+    zh->reg->keys.buf_used = 0;
+    zh->reg->keys.prevAttrUse = -1;
+    zh->reg->keys.prevAttrSet = -1;
+    zh->reg->keys.prevSeqNo = 0;
      zh->reg->sortKeys.buf_used = 0;
  
      if (zebraExplain_curDatabase (zh->reg->zei, zh->basenames[0]))
@@ -1029,10 +973,7 @@ int buffer_extract_record (ZebraHandle zh,
          if (matchStr) {
              rinfo = dict_lookup (zh->reg->matchDict, matchStr);
              if (rinfo)
-           {
-               assert(*rinfo == sizeof(*sysno));
                  memcpy (sysno, rinfo+1, sizeof(*sysno));
-           }
          }
      }
  
@@ -1237,8 +1178,10 @@ int explain_extract (void *handle, Record rec, data1_node *n)
              abort ();
      }
  
-    create_rec_keys_codec(&zh->reg->keys);
-
+    zh->reg->keys.buf_used = 0;
+    zh->reg->keys.prevAttrUse = -1;
+    zh->reg->keys.prevAttrSet = -1;
+    zh->reg->keys.prevSeqNo = 0;
      zh->reg->sortKeys.buf_used = 0;
      
      extractCtrl.init = extract_init;
@@ -1289,16 +1232,12 @@ int explain_extract (void *handle, Record rec, data1_node *n)
  void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno,
                                int cmd, struct recKeys *reckeys)
  {
-#if IT_KEY_NEW
-    void *decode_handle = iscz1_start();
-#else
-    int seqno = 0;
  #if SU_SCHEME
  #else
      unsigned char attrSet = (unsigned char) -1;
      unsigned short attrUse = (unsigned short) -1;
  #endif
-#endif
+    int seqno = 0;
      int off = 0;
      int ch = 0;
      ZebraExplainInfo zei = zh->reg->zei;
@@ -1320,53 +1259,6 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno,
         zh->reg->key_file_no = 0;
      }
      zebraExplain_recordCountIncrement (zei, cmd ? 1 : -1);
-#if IT_KEY_NEW
-    while (off < reckeys->buf_used)
-    {
-        const char *src = reckeys->buf + off;
-        struct it_key key;
-       char *dst = (char*) &key;
-       int attrSet, attrUse;
-
-       iscz1_decode(decode_handle, &dst, &src);
-       assert(key.len < 4 && key.len > 2);
-
-       attrSet = key.mem[0];
-       attrUse = key.mem[1]; /* sequence in mem[2] */
-
-        if (zh->reg->key_buf_used + 1024 > 
-            (zh->reg->ptr_top -zh->reg->ptr_i)*sizeof(char*))
-            extract_flushWriteKeys (zh,0);
-        assert(zh->reg->ptr_i >= 0);
-        ++(zh->reg->ptr_i);
-        assert(zh->reg->ptr_i > 0);
-        (zh->reg->key_buf)[zh->reg->ptr_top - zh->reg->ptr_i] =
-           (char*)zh->reg->key_buf + zh->reg->key_buf_used;
-
-        ch = zebraExplain_lookupSU (zei, attrSet, attrUse);
-        if (ch < 0)
-            ch = zebraExplain_addSU (zei, attrSet, attrUse);
-
-        assert (ch > 0);
-       zh->reg->key_buf_used +=
-           key_SU_encode (ch,((char*)zh->reg->key_buf) +
-                           zh->reg->key_buf_used);
-        while (*src)
-            ((char*)zh->reg->key_buf) [(zh->reg->key_buf_used)++] = *src++;
-        src++;
-        ((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = '\0';
-        ((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = cmd;
-
-        key.len = 2;
-       key.mem[0] = sysno;
-       key.mem[1] = key.mem[2];  /* sequence .. */
-       
-        memcpy ((char*)zh->reg->key_buf + zh->reg->key_buf_used,
-               &key, sizeof(key));
-        (zh->reg->key_buf_used) += sizeof(key);
-        off = src - reckeys->buf;
-    }
-#else
      while (off < reckeys->buf_used)
      {
          const char *src = reckeys->buf + off;
@@ -1431,11 +1323,7 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno,
          (zh->reg->key_buf_used) += sizeof(key);
          off = src - reckeys->buf;
      }
-#endif
      assert (off == reckeys->buf_used);
-#if IT_KEY_NEW
-    iscz1_stop(decode_handle);
-#endif
  }
  
  void extract_flushWriteKeys (ZebraHandle zh, int final)
@@ -1572,22 +1460,18 @@ void extract_flushWriteKeys (ZebraHandle zh, int final)
      zh->reg->key_buf_used = 0;
  }
  
-void extract_add_index_string (RecWord *p, const char *str, int length)
+void extract_add_index_string (RecWord *p, const char *string,
+                               int length)
  {
      char *dst;
-    ZebraHandle zh = p->extractCtrl->handle;
-    struct recKeys *keys = &zh->reg->keys;
-#if IT_KEY_NEW
-    struct it_key key;
-    const char *src = (char*) &key;
-#else
      unsigned char attrSet;
      unsigned short attrUse;
      int lead = 0;
      int diff = 0;
      int *pseqno = &p->seqno;
+    ZebraHandle zh = p->extractCtrl->handle;
      ZebraExplainInfo zei = zh->reg->zei;
-#endif
+    struct recKeys *keys = &zh->reg->keys;
      
      if (keys->buf_used+1024 > keys->buf_max)
      {
@@ -1601,25 +1485,6 @@ void extract_add_index_string (RecWord *p, const char *str, int length)
      }
      dst = keys->buf + keys->buf_used;
  
-#if IT_KEY_NEW
-    key.len = 3;
-    key.mem[0] = p->attrSet;
-    key.mem[1] = p->attrUse;
-    key.mem[2] = p->seqno;
-
-#if 0
-    /* just for debugging .. */
-    yaz_log(LOG_LOG, "set=%d use=%d seqno=%d", p->attrSet, p->attrUse,
-           p->seqno);
-#endif
-
-    iscz1_encode(keys->codec_handle, &dst, &src);
-
-    *dst++ = p->reg_type;
-    memcpy (dst, str, length);
-    dst += length;
-    *dst++ = '\0';
-#else
      /* leader byte is encoded as follows:
         bit 0 : 1 if attrset is unchanged; 0 if attrset is changed
         bit 1 : 1 if attruse is unchanged; 0 if attruse is changed
@@ -1634,13 +1499,13 @@ void extract_add_index_string (RecWord *p, const char *str, int length)
          lead |= 2;
      else
          keys->prevAttrUse = attrUse;
-
+#if 1
      diff = 1 + *pseqno - keys->prevSeqNo;
      if (diff >= 1 && diff <= 15)
          lead |= (diff << 2);
      else
          diff = 0;
-
+#endif
      keys->prevSeqNo = *pseqno;
      
      *dst++ = lead;
@@ -1672,7 +1537,7 @@ void extract_add_index_string (RecWord *p, const char *str, int length)
      }
  #endif
      *dst++ = p->reg_type;
-    memcpy (dst, str, length);
+    memcpy (dst, string, length);
      dst += length;
      *dst++ = '\0';
  
@@ -1681,11 +1546,10 @@ void extract_add_index_string (RecWord *p, const char *str, int length)
          memcpy (dst, pseqno, sizeof(*pseqno));
          dst += sizeof(*pseqno);
      }
-#endif
      keys->buf_used = dst - keys->buf;
  }
  
-static void extract_add_sort_string (RecWord *p, const char *str,
+static void extract_add_sort_string (RecWord *p, const char *string,
                                      int length)
  {
      ZebraHandle zh = p->extractCtrl->handle;
@@ -1718,7 +1582,7 @@ static void extract_add_sort_string (RecWord *p, const char *str,
      off += key_SU_encode(p->attrSet, sk->buf + off);
      off += key_SU_encode(p->attrUse, sk->buf + off);
      off += key_SU_encode(length, sk->buf + off);
-    memcpy (sk->buf + off, str, length);
+    memcpy (sk->buf + off, string, length);
      sk->buf_used = off + length;
  }
  
@@ -1738,7 +1602,7 @@ static void extract_add_incomplete_field (RecWord *p)
      const char **map = 0;
  
      if (remain > 0)
-       map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+       map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, 0);
  
      while (map)
      {
@@ -1750,7 +1614,7 @@ static void extract_add_incomplete_field (RecWord *p)
         {
             remain = p->length - (b - p->string);
             if (remain > 0)
-               map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+               map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, 0);
             else
                 map = 0;
         }
@@ -1765,7 +1629,7 @@ static void extract_add_incomplete_field (RecWord *p)
                 buf[i++] = *(cp++);
             remain = p->length - (b - p->string);
             if (remain > 0)
-               map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+               map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, 0);
             else
                 map = 0;
         }
@@ -1782,9 +1646,12 @@ static void extract_add_complete_field (RecWord *p)
      char buf[IT_MAX_WORD+1];
      const char **map = 0;
      int i = 0, remain = p->length;
+    int first; /* first position */
+
+yaz_log(LOG_DEBUG, "Complete field, w='%s'", p->string);
  
      if (remain > 0)
-       map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, remain);
+       map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, remain, 1);
  
      while (remain > 0 && i < IT_MAX_WORD)
      {
@@ -1793,7 +1660,10 @@ static void extract_add_complete_field (RecWord *p)
             remain = p->length - (b - p->string);
  
             if (remain > 0)
-               map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+           {
+               first = i ? 0 : 1;
+               map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, first);
+           }
             else
                 map = 0;
         }
@@ -1814,13 +1684,16 @@ static void extract_add_complete_field (RecWord *p)
             {
                 if (i >= IT_MAX_WORD)
                     break;
+yaz_log(LOG_DEBUG, "Adding string to index '%d'", *map);
                 while (i < IT_MAX_WORD && *cp)
                     buf[i++] = *(cp++);
             }
             remain = p->length  - (b - p->string);
             if (remain > 0)
+           {
                 map = zebra_maps_input (p->zebra_maps, p->reg_type, &b,
-                                       remain);
+                                       remain, 0);
+           }
             else
                 map = 0;
         }
@@ -1891,13 +1764,8 @@ void encode_key_init (struct encode_info *i)
      i->prevseq=0;
      i->prevcmd=-1;
      i->keylen=0;
-#if IT_KEY_NEW
-    i->encode_handle = iscz1_start();
-#endif
  }
  
-#if IT_KEY_NEW
-#else
  char *encode_key_int (int d, char *bp)
  {
      if (d <= 63)
@@ -1922,8 +1790,6 @@ char *encode_key_int (int d, char *bp)
      }
      return bp;
  }
-#endif
-
  #define OLDENCODE 1
  
  #ifdef OLDENCODE
@@ -1934,19 +1800,11 @@ char *encode_key_int (int d, char *bp)
  void encode_key_write (char *k, struct encode_info *i, FILE *outf)
  {
      struct it_key key;
-    char *bp = i->buf, *bp0;
-    const char *src = (char *) &key;
+    char *bp = i->buf;
  
-    /* copy term to output buf */
      while ((*bp++ = *k++))
          ;
-    /* and copy & align key so we can mangle */
-    memcpy (&key, k+1, sizeof(struct it_key));  /* *k is insert/delete */
-#if IT_KEY_NEW
-    bp0 = bp++;
-    iscz1_encode(i->encode_handle, &bp, &src);
-    *bp0 = (*k * 128) + bp - bp0 - 1; /* length and insert/delete combined */
-#else
+    memcpy (&key, k+1, sizeof(struct it_key));
      bp = encode_key_int ( (key.sysno - i->sysno) * 2 + *k, bp);
      if (i->sysno != key.sysno)
      {
@@ -1958,7 +1816,6 @@ void encode_key_write (char *k, struct encode_info *i, FILE *outf)
      bp = encode_key_int (key.seqno - i->seqno, bp);
      i->seqno = key.seqno;
      i->cmd = *k;
-#endif
      if (fwrite (i->buf, bp - i->buf, 1, outf) != 1)
      {
          logf (LOG_FATAL|LOG_ERRNO, "fwrite");
@@ -1968,9 +1825,6 @@ void encode_key_write (char *k, struct encode_info *i, FILE *outf)
  
  void encode_key_flush (struct encode_info *i, FILE *outf)
  { /* dummy routine */
-#if IT_KEY_NEW
-    iscz1_stop(i->encode_handle);
-#endif
  }
  
  #else
diff --git a/index/zrpn.c b/index/zrpn.c

index 7fefb9b..7f1411f 100644 (file)
--- a/index/zrpn.c
+++ b/index/zrpn.c
@@ -1,4 +1,4 @@
-/* $Id: zrpn.c,v 1.142 2004-08-04 08:35:23 adam Exp $
+/* $Id: zrpn.c,v 1.141.2.1 2004-09-16 14:07:50 adam Exp $
     Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
     Index Data Aps
  
@@ -56,7 +56,7 @@ typedef struct {
  static const char **rpn_char_map_handler (void *vp, const char **from, int len)
  {
      struct rpn_char_map_info *p = (struct rpn_char_map_info *) vp;
-    const char **out = zebra_maps_input (p->zm, p->reg_type, from, len);
+    const char **out = zebra_maps_input (p->zm, p->reg_type, from, len, 0);
  #if 0
      if (out && *out)
      {
@@ -164,7 +164,7 @@ struct grep_info {
  #ifdef TERM_COUNT        
      int *term_no;        
  #endif        
-    ISAMC_P *isam_p_buf;
+    ISAMS_P *isam_p_buf;
      int isam_p_size;        
      int isam_p_indx;
      ZebraHandle zh;
@@ -194,12 +194,12 @@ static void add_isam_p (const char *name, const char *info,
  {
      if (p->isam_p_indx == p->isam_p_size)
      {
-        ISAMC_P *new_isam_p_buf;
+        ISAMS_P *new_isam_p_buf;
  #ifdef TERM_COUNT        
          int *new_term_no;        
  #endif
          p->isam_p_size = 2*p->isam_p_size + 100;
-        new_isam_p_buf = (ISAMC_P *) xmalloc (sizeof(*new_isam_p_buf) *
+        new_isam_p_buf = (ISAMS_P *) xmalloc (sizeof(*new_isam_p_buf) *
                                              p->isam_p_size);
          if (p->isam_p_buf)
          {
@@ -253,7 +253,7 @@ static int grep_handle (char *name, const char *info, void *p)
  }
  
  static int term_pre (ZebraMaps zebra_maps, int reg_type, const char **src,
-                    const char *ct1, const char *ct2)
+                    const char *ct1, const char *ct2, int first)
  {
      const char *s1, *s0 = *src;
      const char **map;
@@ -266,7 +266,7 @@ static int term_pre (ZebraMaps zebra_maps, int reg_type, const char **src,
          if (ct2 && strchr (ct2, *s0))
              break;
          s1 = s0;
-        map = zebra_maps_input (zebra_maps, reg_type, &s1, strlen(s1));
+        map = zebra_maps_input (zebra_maps, reg_type, &s1, strlen(s1), first);
          if (**map != *CHR_SPACE)
              break;
          s0 = s1;
@@ -290,13 +290,13 @@ static int term_100 (ZebraMaps zebra_maps, int reg_type,
      const char *space_start = 0;
      const char *space_end = 0;
  
-    if (!term_pre (zebra_maps, reg_type, src, NULL, NULL))
+    if (!term_pre (zebra_maps, reg_type, src, NULL, NULL, !space_split))
          return 0;
      s0 = *src;
      while (*s0)
      {
          s1 = s0;
-        map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));
+        map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0);
         if (space_split)
         {
             if (**map == *CHR_SPACE)
@@ -348,7 +348,7 @@ static int term_101 (ZebraMaps zebra_maps, int reg_type,
      int i = 0;
      int j = 0;
  
-    if (!term_pre (zebra_maps, reg_type, src, "#", "#"))
+    if (!term_pre (zebra_maps, reg_type, src, "#", "#", !space_split))
          return 0;
      s0 = *src;
      while (*s0)
@@ -362,7 +362,7 @@ static int term_101 (ZebraMaps zebra_maps, int reg_type,
          else
          {
              s1 = s0;
-            map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));
+            map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0);
              if (space_split && **map == *CHR_SPACE)
                  break;
              while (s1 < s0)
@@ -390,7 +390,7 @@ static int term_103 (ZebraMaps zebra_maps, int reg_type, const char **src,
      const char *s0, *s1;
      const char **map;
  
-    if (!term_pre (zebra_maps, reg_type, src, "^\\()[].*+?|", "("))
+    if (!term_pre (zebra_maps, reg_type, src, "^\\()[].*+?|", "(", !space_split))
          return 0;
      s0 = *src;
      if (errors && *s0 == '+' && s0[1] && s0[2] == '+' && s0[3] &&
@@ -411,7 +411,7 @@ static int term_103 (ZebraMaps zebra_maps, int reg_type, const char **src,
          else
          {
              s1 = s0;
-            map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));
+            map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0);
              if (**map == *CHR_SPACE)
                  break;
              while (s1 < s0)
@@ -448,7 +448,7 @@ static int term_104 (ZebraMaps zebra_maps, int reg_type,
      int i = 0;
      int j = 0;
  
-    if (!term_pre (zebra_maps, reg_type, src, "?*#", "?*#"))
+    if (!term_pre (zebra_maps, reg_type, src, "?*#", "?*#", !space_split))
          return 0;
      s0 = *src;
      while (*s0)
@@ -491,7 +491,7 @@ static int term_104 (ZebraMaps zebra_maps, int reg_type,
         }
          {
              s1 = s0;
-            map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));
+            map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0);
              if (space_split && **map == *CHR_SPACE)
                  break;
              while (s1 < s0)
@@ -519,7 +519,7 @@ static int term_105 (ZebraMaps zebra_maps, int reg_type,
      int i = 0;
      int j = 0;
  
-    if (!term_pre (zebra_maps, reg_type, src, "*!", "*!"))
+    if (!term_pre (zebra_maps, reg_type, src, "*!", "*!", !space_split))
          return 0;
      s0 = *src;
      while (*s0)
@@ -537,7 +537,7 @@ static int term_105 (ZebraMaps zebra_maps, int reg_type,
         }
          {
              s1 = s0;
-            map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));
+            map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0);
              if (space_split && **map == *CHR_SPACE)
                  break;
              while (s1 < s0)
@@ -1235,7 +1235,7 @@ static int trans_scan_term (ZebraHandle zh, Z_AttributesPlusTerm *zapt,
              
          while ((len = (cp_end - cp)) > 0)
          {
-            map = zebra_maps_input (zh->reg->zebra_maps, reg_type, &cp, len);
+            map = zebra_maps_input (zh->reg->zebra_maps, reg_type, &cp, len, 0);
              if (**map == *CHR_SPACE)
                  space_map = *map;
              else
@@ -1365,7 +1365,7 @@ static RSET rpn_search_APT_phrase (ZebraHandle zh,
  {
      char term_dst[IT_MAX_WORD+1];
      RSET rset[60], result;
-    int rset_no = 0;
+    int  rset_no = 0;
      struct grep_info grep_info;
      char *termz = normalize_term(zh, zapt, termz_org, stream, reg_type);
      const char *termp = termz;
@@ -1787,7 +1787,6 @@ static RSET rpn_search_APT_local (ZebraHandle zh, Z_AttributesPlusTerm *zapt,
      RSET result;
      RSFD rsfd;
      struct it_key key;
-    int sys;
      rset_temp_parms parms;
  
      parms.rset_term = rset_term_create (termz, -1, rank_type,
@@ -1798,19 +1797,10 @@ static RSET rpn_search_APT_local (ZebraHandle zh, Z_AttributesPlusTerm *zapt,
      result = rset_create (rset_kind_temp, &parms);
      rsfd = rset_open (result, RSETF_WRITE);
  
-    sys = atoi(termz);
-    if (sys <= 0)
-       sys = 1;
-#if IT_KEY_NEW
-    key.mem[0] = sys;
-    key.mem[1] = 1;
-    key.len = 2;
-#else
-    key.sysno = sys;
+    key.sysno = atoi (termz);
      key.seqno = 1;
      if (key.sysno <= 0)
          key.sysno = 1;
-#endif
      rset_write (result, rsfd, &key);
      rset_close (result, rsfd);
      return result;
@@ -2400,7 +2390,7 @@ RSET rpn_search (ZebraHandle zh, NMEM nmem,
  
  struct scan_info_entry {
      char *term;
-    ISAMC_P isam_p;
+    ISAMS_P isam_p;
  };
  
  struct scan_info {
@@ -2424,8 +2414,8 @@ static int scan_handle (char *name, const char *info, int pos, void *client)
      scan_info->list[idx].term = (char *)
         odr_malloc (scan_info->odr, strlen(name + len_prefix)+1);
      strcpy (scan_info->list[idx].term, name + len_prefix);
-    assert (*info == sizeof(ISAMC_P));
-    memcpy (&scan_info->list[idx].isam_p, info+1, sizeof(ISAMC_P));
+    assert (*info == sizeof(ISAMS_P));
+    memcpy (&scan_info->list[idx].isam_p, info+1, sizeof(ISAMS_P));
      return 0;
  }
  
@@ -2475,19 +2465,11 @@ static void count_set (RSET r, int *count)
      rfd = rset_open (r, RSETF_READ);
      while (rset_read (r, rfd, &key, &term_index))
      {
-#if IT_KEY_NEW
-        if (key.mem[0] != psysno)
-        {
-            psysno = key.mem[0];
-            (*count)++;
-        }
-#else
          if (key.sysno != psysno)
          {
              psysno = key.sysno;
              (*count)++;
          }
-#endif
          kno++;
      }
      rset_close (r, rfd);
diff --git a/tab/default.idx b/tab/default.idx

index 9e2cb81..e146213 100644 (file)
--- a/tab/default.idx
+++ b/tab/default.idx
@@ -1,5 +1,5 @@
  # Zebra indexes as referred to from the *.abs-files.
-#  $Id: default.idx,v 1.10 2004-07-28 09:40:46 adam Exp $
+#  $Id: default.idx,v 1.10.2.1 2004-09-16 14:07:50 adam Exp $
  #
  
  # Traditional word index
diff --git a/tab/scan.chr b/tab/scan.chr

index 599dd7c..26e0f45 100644 (file)
--- a/tab/scan.chr
+++ b/tab/scan.chr
@@ -1,6 +1,6 @@
  # Danish/Swedish character map.
  #
-# $Id: scan.chr,v 1.1 1999-09-07 07:19:21 adam Exp $
+# $Id: scan.chr,v 1.1.6.1 2004-09-16 14:07:50 adam Exp $
  
  # Define the basic value-set. *Beware* of changing this without re-indexing
  # your databases.
@@ -32,6 +32,11 @@ map (&Oslash;)     
  map (&Aring;)      Å
  map (&Ouml;)       Ö
  
+map (^the )    #
+map (^The )    #
+map (^a )       #
+map (^A )      #
+
  map éÉ         e
  map á          a
  map ó          o
diff --git a/test/Makefile.am b/test/Makefile.am

index f61aaba..4682121 100644 (file)
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -1,4 +1,4 @@
  
-SUBDIRS=codec api gils malxml config usmarc dmoz xpath sort xelm cddb \
+SUBDIRS=codec api gils malxml config usmarc dmoz xpath sort sort2 xelm cddb \
   rusmarc zsh marcxml charmap
  
diff --git a/test/sort2/Makefile.am b/test/sort2/Makefile.am

new file mode 100644 (file)

index 0000000..473f338
--- /dev/null
+++ b/test/sort2/Makefile.am
@@ -0,0 +1,10 @@
+# $Id: Makefile.am,v 1.2.2.1 2004-09-16 14:07:51 adam Exp $
+
+check_SCRIPTS = test1.sh
+
+TESTS = $(check_SCRIPTS)
+
+EXTRA_DIST = zebra.cfg default.idx \
+ rec1.xml rec2.xml rec3.xml rec4.xml zebra.cfg my.abs sort.chr \
+ $(check_SCRIPTS)
+
diff --git a/test/sort2/default.idx b/test/sort2/default.idx

new file mode 100644 (file)

index 0000000..cb378f5
--- /dev/null
+++ b/test/sort2/default.idx
@@ -0,0 +1,55 @@
+# Zebra indexes as referred to from the *.abs-files.
+#  $Id: default.idx,v 1.1.2.1 2004-09-16 14:07:51 adam Exp $
+#
+
+# Traditional word index
+# Used if completeness is 'incomplete field' (@attr 6=1) and
+# structure is word/phrase/word-list/free-form-text/document-text
+index w
+completeness 0
+position 1
+charmap sort.chr
+
+# Phrase index
+# Used if completeness is 'complete {sub}field' (@attr 6=2, @attr 6=3)
+# and structure is word/phrase/word-list/free-form-text/document-text
+index p
+completeness 1
+charmap sort.chr
+
+# URX (URL) index
+# Used if structure=urx (@attr 4=104)
+index u
+completeness 0
+charmap urx.chr
+
+# Numeric index
+# Used if structure=numeric (@attr 4=109)
+index n
+completeness 0
+charmap numeric.chr
+
+# Null map index (no mapping at all)
+# Used if structure=key (@attr 4=3)
+index 0
+completeness 0
+position 1
+charmap @
+
+# Year
+# Used if structure=year (@attr 4=4)
+index y
+completeness 0
+charmap @
+
+# Date
+# Used if structure=date (@attr 4=5)
+index d
+completeness 0
+charmap @
+
+# Sort, with prefixes to ignore
+sort s
+completeness 1
+charmap sort.chr
+
diff --git a/test/sort2/my.abs b/test/sort2/my.abs

new file mode 100644 (file)

index 0000000..36dd566
--- /dev/null
+++ b/test/sort2/my.abs
@@ -0,0 +1,14 @@
+# $Id: my.abs,v 1.1.2.1 2004-09-16 14:07:51 adam Exp $
+
+name my
+reference WAIS-schema
+attset bib1.att
+tagset generic.tag
+xpath enable
+
+varset var1.var
+
+esetname B @
+esetname F @
+
+elm title              Title                   !:p,!:w,!:s
diff --git a/test/sort2/rec1.xml b/test/sort2/rec1.xml

new file mode 100644 (file)

index 0000000..6dbf26f
--- /dev/null
+++ b/test/sort2/rec1.xml
@@ -0,0 +1,3 @@
+<my>
+  <title>first computer</title>
+</my>
diff --git a/test/sort2/rec2.xml b/test/sort2/rec2.xml

new file mode 100644 (file)

index 0000000..23bb030
--- /dev/null
+++ b/test/sort2/rec2.xml
@@ -0,0 +1,3 @@
+<my>
+  <title>second computer</title>
+</my>
diff --git a/test/sort2/rec3.xml b/test/sort2/rec3.xml

new file mode 100644 (file)

index 0000000..245c6e1
--- /dev/null
+++ b/test/sort2/rec3.xml
@@ -0,0 +1,3 @@
+<my>
+  <title>A third computer</title>
+</my>
diff --git a/test/sort2/rec4.xml b/test/sort2/rec4.xml

new file mode 100644 (file)

index 0000000..8ed6c2c
--- /dev/null
+++ b/test/sort2/rec4.xml
@@ -0,0 +1,3 @@
+<my>
+  <title>the fourth computer</title>
+</my>
diff --git a/test/sort2/sort.chr b/test/sort2/sort.chr

new file mode 100644 (file)

index 0000000..c802b61
--- /dev/null
+++ b/test/sort2/sort.chr
@@ -0,0 +1,34 @@
+# character map that removes some leading prefixes
+#
+# $Id: sort.chr,v 1.2.2.1 2004-09-16 14:07:51 adam Exp $
+
+# Define the basic value-set. *Beware* of changing this without re-indexing
+# your databases.
+
+lowercase {0-9}{a-y}üzæäøöå
+uppercase {0-9}{A-Y}ÜZÆÄØÖÅ
+
+# Breaking characters
+
+space {\001-\040}!"#$%&'\()*+,-./:;<=>?@\[\\]^_`\{|}~
+
+# Characters to be considered equivalent for searching purposes.
+
+# equivalent æä(ae)
+# equivalent øö(oe)
+# equivalent å(aa)
+# equivalent uü
+
+map (^The\s)   @
+map (^the\s)   @
+map (^a\s)      @
+map (^A\s)     @
+
+#map éÉ                e
+#map á         a
+#map ó         o
+#map í         i
+
+#map (Aa)      (AA)
+
+#map (aa)        a
diff --git a/test/sort2/test1.sh b/test/sort2/test1.sh

new file mode 100755 (executable)

index 0000000..604aed0
--- /dev/null
+++ b/test/sort2/test1.sh
@@ -0,0 +1,34 @@
+#!/bin/sh
+# $Id: test1.sh,v 1.1.2.1 2004-09-16 14:07:51 adam Exp $
+
+pp=${srcdir:-"."}
+
+ulimit -c 10000
+LOG=test1.log
+rm -f $LOG
+rm -fr lock
+mkdir lock
+rm -fr reg
+mkdir reg
+rm -fr recs
+mkdir recs
+cp $pp/rec*.xml recs
+../../index/zebraidx -c $pp/zebra.cfg -l $LOG update recs || exit 1
+../../index/zebrasrv -c $pp/zebra.cfg -l $LOG unix:socket &
+sleep 1
+test -f lock/zebrasrv.pid || exit 2
+../api/testclient -n4 unix:socket '@or computer @attr 7=1 @attr 1=4 0' >tmp1
+
+kill `cat lock/zebrasrv.pid`
+
+echo 'Result count: 4
+my:
+  title: first computer
+my:
+  title: the fourth computer
+my:
+  title: second computer
+my:
+  title: A third computer' >tmp2
+
+diff tmp1 tmp2
diff --git a/test/sort2/zebra.cfg b/test/sort2/zebra.cfg

new file mode 100644 (file)

index 0000000..a495694
--- /dev/null
+++ b/test/sort2/zebra.cfg
@@ -0,0 +1,14 @@
+# Simple Zebra configuration file
+# $Id: zebra.cfg,v 1.1.2.1 2004-09-16 14:07:51 adam Exp $
+#
+# Where the schema files, attribute files, etc are located.
+profilePath: ${srcdir:-.}:${srcdir:-.}/../../tab
+
+# Files that describe the attribute sets supported.
+attset: bib1.att
+attset: explain.att
+
+recordtype.xml: grs.sgml
+lockdir: lock
+register: reg:20M
+isam: b
diff --git a/util/charmap.c b/util/charmap.c

index a4f834c..898f133 100644 (file)
--- a/util/charmap.c
+++ b/util/charmap.c
@@ -1,4 +1,4 @@
-/* $Id: charmap.c,v 1.29.2.1 2004-08-06 10:08:19 adam Exp $
+/* $Id: charmap.c,v 1.29.2.2 2004-09-16 14:07:51 adam Exp $
     Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
     Index Data Aps
  
@@ -40,6 +40,8 @@ typedef unsigned ucs4_t;
  #define CHR_MAXSTR 1024
  #define CHR_MAXEQUIV 32
  
+const unsigned char CHR_FIELD_BEGIN = '^';
+
  const char *CHR_UNKNOWN = "\001";
  const char *CHR_SPACE   = "\002";
  const char *CHR_BASE    = "\003";
@@ -142,7 +144,7 @@ static chr_t_entry *find_entry(chr_t_entry *t, const char **from, int len)
      return t->target ? t : 0;
  }
  
-static chr_t_entry *find_entry_x(chr_t_entry *t, const char **from, int *len)
+static chr_t_entry *find_entry_x(chr_t_entry *t, const char **from, int *len, int first)
  {
      chr_t_entry *res;
  
@@ -153,35 +155,49 @@ static chr_t_entry *find_entry_x(chr_t_entry *t, const char **from, int *len)
         from++;
         len++;
      }
-    if (*len > 0 && t->children && t->children[(unsigned char) **from])
+    if (*len > 0 && t->children)
      {
         const char *old_from = *from;
         int old_len = *len;
+
+       res = 0;
+
+       if (first && t->children[CHR_FIELD_BEGIN])
+       {
+           if ((res = find_entry_x(t->children[CHR_FIELD_BEGIN], from, len, 0)) && res != t->children[CHR_FIELD_BEGIN])
+               return res;
+            else
+               res = 0;
+           /* otherwise there was no match on beginning of field, move on */
+       } 
         
-       (*len)--;
-       (*from)++;
-       if ((res = find_entry_x(t->children[(unsigned char) *old_from],
-                               from, len)))
-           return res;
-       /* no match */
-       *len = old_len;
-       *from = old_from;
+       if (!res && t->children[(unsigned char) **from])
+       {
+           (*len)--;
+           (*from)++;
+           if ((res = find_entry_x(t->children[(unsigned char) *old_from],
+                                   from, len, 0)))
+               return res;
+           /* no match */
+           *len = old_len;
+           *from = old_from;
+       }
      }
      /* no children match. use ourselves, if we have a target */
      return t->target ? t : 0;
  }
  
-const char **chr_map_input_x(chrmaptab maptab, const char **from, int *len)
+const char **chr_map_input_x(chrmaptab maptab, const char **from, int *len, int first)
  {
      chr_t_entry *t = maptab->input;
      chr_t_entry *res;
  
-    if (!(res = find_entry_x(t, from, len)))
+    if (!(res = find_entry_x(t, from, len, first)))
         abort();
      return (const char **) (res->target);
  }
  
-const char **chr_map_input(chrmaptab maptab, const char **from, int len)
+const char **chr_map_input(chrmaptab maptab, const char **from, int len, int first)
  {
      chr_t_entry *t = maptab->input;
      chr_t_entry *res;
@@ -189,7 +205,7 @@ const char **chr_map_input(chrmaptab maptab, const char **from, int len)
  
      len_tmp[0] = len;
      len_tmp[1] = -1;
-    if (!(res = find_entry_x(t, from, len_tmp)))
+    if (!(res = find_entry_x(t, from, len_tmp, first)))
         abort();
      return (const char **) (res->target);
  }
@@ -259,7 +275,7 @@ ucs4_t zebra_prim_w(ucs4_t **s)
      ucs4_t i = 0;
      char fmtstr[8];
  
-    yaz_log (LOG_DEBUG, "prim %.3s", (char *) *s);
+    yaz_log (LOG_DEBUG, "prim_w %.3s", (char *) *s);
      if (**s == '\\')
      {
         (*s)++;
@@ -374,7 +390,7 @@ static void fun_mkstring(const char *s, void *data, int num)
      chrwork *arg = (chrwork *) data;
      const char **res, *p = s;
  
-    res = chr_map_input(arg->map, &s, strlen(s));
+    res = chr_map_input(arg->map, &s, strlen(s), 0);
      if (*res == (char*) CHR_UNKNOWN)
         logf(LOG_WARN, "Map: '%s' has no mapping", p);
      strncat(arg->string, *res, CHR_MAXSTR - strlen(arg->string));
@@ -443,6 +459,7 @@ static int scan_string(char *s_native,
      char str[1024];
  
      ucs4_t arg[512];
+    ucs4_t arg_prim[512];
      ucs4_t *s0, *s = arg;
      ucs4_t c, begin, end;
      size_t i;
@@ -498,11 +515,11 @@ static int scan_string(char *s_native,
         case '[': s++; abort(); break;
         case '(':
              ++s;
-            s0 = s;
-            while (*s != ')' || s[-1] == '\\')
-                s++;
-           *s = 0;
-            if (scan_to_utf8 (t_utf8, s0, s - s0, str, sizeof(str)-1))
+           s0 = s; i = 0;
+           while (*s != ')' || s[-1] == '\\')
+               arg_prim[i++] = zebra_prim_w(&s);
+           arg_prim[i] = 0;
+            if (scan_to_utf8 (t_utf8, arg_prim, zebra_ucs4_strlen(arg_prim), str, sizeof(str)-1))
                  return -1;
             (*fun)(str, data, num ? (*num)++ : 0);
             s++;
diff --git a/util/zebramap.c b/util/zebramap.c

index 0d1cf07..c1983e3 100644 (file)
--- a/util/zebramap.c
+++ b/util/zebramap.c
@@ -1,4 +1,4 @@
-/* $Id: zebramap.c,v 1.32 2004-06-16 20:30:47 adam Exp $
+/* $Id: zebramap.c,v 1.32.2.1 2004-09-16 14:07:51 adam Exp $
     Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
     Index Data Aps
  
@@ -291,13 +291,13 @@ chrmaptab zebra_charmap_get (ZebraMaps zms, unsigned reg_id)
  }
  
  const char **zebra_maps_input (ZebraMaps zms, unsigned reg_id,
-                              const char **from, int len)
+                              const char **from, int len, int first)
  {
      chrmaptab maptab;
  
      maptab = zebra_charmap_get (zms, reg_id);
      if (maptab)
-       return chr_map_input(maptab, from, len);
+       return chr_map_input(maptab, from, len, first);
      
      zms->temp_map_str[0] = **from;
author	Adam Dickmeiss <adam@indexdata.dk>
	Thu, 16 Sep 2004 14:07:48 +0000 (14:07 +0000)
committer	Adam Dickmeiss <adam@indexdata.dk>
	Thu, 16 Sep 2004 14:07:48 +0000 (14:07 +0000)
configure.in		patch \| blob \| history
doc/recordmodel.xml		patch \| blob \| history
include/charmap.h		patch \| blob \| history
include/zebramap.h		patch \| blob \| history
index/extract.c		patch \| blob \| history
index/zrpn.c		patch \| blob \| history
tab/default.idx		patch \| blob \| history
tab/scan.chr		patch \| blob \| history
test/Makefile.am		patch \| blob \| history
test/sort2/Makefile.am	[new file with mode: 0644]	patch \| blob
test/sort2/default.idx	[new file with mode: 0644]	patch \| blob
test/sort2/my.abs	[new file with mode: 0644]	patch \| blob
test/sort2/rec1.xml	[new file with mode: 0644]	patch \| blob
test/sort2/rec2.xml	[new file with mode: 0644]	patch \| blob
test/sort2/rec3.xml	[new file with mode: 0644]	patch \| blob
test/sort2/rec4.xml	[new file with mode: 0644]	patch \| blob
test/sort2/sort.chr	[new file with mode: 0644]	patch \| blob
test/sort2/test1.sh	[new file with mode: 0755]	patch \| blob
test/sort2/zebra.cfg	[new file with mode: 0644]	patch \| blob
util/charmap.c		patch \| blob \| history
util/zebramap.c		patch \| blob \| history