1228060Sbapt/* Search algorithm.
2228060Sbapt   Copyright (C) 1989-1998, 2000, 2002 Free Software Foundation, Inc.
3228060Sbapt   Written by Douglas C. Schmidt <schmidt@ics.uci.edu>
4228060Sbapt   and Bruno Haible <bruno@clisp.org>.
5228060Sbapt
6228060Sbapt   This file is part of GNU GPERF.
7228060Sbapt
8228060Sbapt   GNU GPERF is free software; you can redistribute it and/or modify
9228060Sbapt   it under the terms of the GNU General Public License as published by
10228060Sbapt   the Free Software Foundation; either version 2, or (at your option)
11228060Sbapt   any later version.
12228060Sbapt
13228060Sbapt   GNU GPERF is distributed in the hope that it will be useful,
14228060Sbapt   but WITHOUT ANY WARRANTY; without even the implied warranty of
15228060Sbapt   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16228060Sbapt   GNU General Public License for more details.
17228060Sbapt
18228060Sbapt   You should have received a copy of the GNU General Public License
19228060Sbapt   along with this program; see the file COPYING.
20228060Sbapt   If not, write to the Free Software Foundation, Inc.,
21228060Sbapt   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
22228060Sbapt
23228060Sbapt/* Specification. */
24228060Sbapt#include "search.h"
25228060Sbapt
26228060Sbapt#include <stdio.h>
27228060Sbapt#include <stdlib.h> /* declares exit(), rand(), srand() */
28228060Sbapt#include <string.h> /* declares memset(), memcmp() */
29228060Sbapt#include <time.h> /* declares time() */
30228060Sbapt#include <math.h> /* declares exp() */
31228060Sbapt#include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */
32228060Sbapt#include "options.h"
33228060Sbapt#include "hash-table.h"
34228060Sbapt#include "config.h"
35228060Sbapt
36228060Sbapt/* ============================== Portability ============================== */
37228060Sbapt
38228060Sbapt/* Assume ISO C++ 'for' scoping rule.  */
39257085Ssbruno/* This code is used to work around scoping issues with visual studio 6 from
40257085Ssbruno * 1998.  Comment it out here to queisce numerous -Wdangling-else warnings
41257085Ssbruno * from clang.
42257085Ssbruno#define for if (0) ; else for */
43228060Sbapt
44228060Sbapt/* Dynamically allocated array with dynamic extent:
45228060Sbapt
46228060Sbapt   Example:
47228060Sbapt       DYNAMIC_ARRAY (my_array, int, n);
48228060Sbapt       ...
49228060Sbapt       FREE_DYNAMIC_ARRAY (my_array);
50228060Sbapt
51228060Sbapt   Attention: depending on your implementation my_array is either the array
52228060Sbapt   itself or a pointer to the array! Always use my_array only as expression!
53228060Sbapt */
54228060Sbapt#if HAVE_DYNAMIC_ARRAY
55228060Sbapt  #define DYNAMIC_ARRAY(var,eltype,size) eltype var[size]
56228060Sbapt  #define FREE_DYNAMIC_ARRAY(var)
57228060Sbapt#else
58228060Sbapt  #define DYNAMIC_ARRAY(var,eltype,size) eltype *var = new eltype[size]
59228060Sbapt  #define FREE_DYNAMIC_ARRAY(var) delete[] var
60228060Sbapt#endif
61228060Sbapt
62228060Sbapt/* ================================ Theory ================================= */
63228060Sbapt
64228060Sbapt/* The general form of the hash function is
65228060Sbapt
66228060Sbapt      hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
67228060Sbapt                       + len (keyword)
68228060Sbapt
69228060Sbapt   where Pos is a set of byte positions,
70228060Sbapt   each alpha_inc[i] is a nonnegative integer,
71228060Sbapt   each asso_values[c] is a nonnegative integer,
72228060Sbapt   len (keyword) is the keyword's length if !option[NOLENGTH], or 0 otherwise.
73228060Sbapt
74228060Sbapt   Theorem 1: If all keywords are different, there is a set Pos such that
75228060Sbapt   all tuples (keyword[i] : i in Pos) are different.
76228060Sbapt
77228060Sbapt   Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
78228060Sbapt   are nonnegative integers alpha_inc[i] such that all multisets
79228060Sbapt   {keyword[i] + alpha_inc[i] : i in Pos} are different.
80228060Sbapt
81228060Sbapt   Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}.
82228060Sbapt
83228060Sbapt   Theorem 3: If all multisets selchars[keyword] are different, there are
84228060Sbapt   nonnegative integers asso_values[c] such that all hash values
85228060Sbapt   sum (asso_values[c] : c in selchars[keyword]) are different.
86228060Sbapt
87228060Sbapt   Based on these three facts, we find the hash function in three steps:
88228060Sbapt
89228060Sbapt   Step 1 (Finding good byte positions):
90228060Sbapt   Find a set Pos, as small as possible, such that all tuples
91228060Sbapt   (keyword[i] : i in Pos) are different.
92228060Sbapt
93228060Sbapt   Step 2 (Finding good alpha increments):
94228060Sbapt   Find nonnegative integers alpha_inc[i], as many of them as possible being
95228060Sbapt   zero, and the others being as small as possible, such that all multisets
96228060Sbapt   {keyword[i] + alpha_inc[i] : i in Pos} are different.
97228060Sbapt
98228060Sbapt   Step 3 (Finding good asso_values):
99228060Sbapt   Find asso_values[c] such that all hash (keyword) are different.
100228060Sbapt
101228060Sbapt   In other words, each step finds a projection that is injective on the
102228060Sbapt   given finite set:
103228060Sbapt     proj1 : String --> Map (Pos --> N)
104228060Sbapt     proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos)
105228060Sbapt     proj3 : Map (Pos --> N) / S(Pos) --> N
106228060Sbapt   where
107228060Sbapt     N denotes the set of nonnegative integers,
108228060Sbapt     Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and
109228060Sbapt     S(Pos) is the symmetric group over Pos.
110228060Sbapt
111228060Sbapt   This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight
112228060Sbapt   modifications apply:
113228060Sbapt     proj1 : String --> Map (Pos --> N) x N
114228060Sbapt     proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N
115228060Sbapt     proj3 : Map (Pos --> N) / S(Pos) x N --> N
116228060Sbapt
117228060Sbapt   For a case-insensitive hash function, the general form is
118228060Sbapt
119228060Sbapt      hash (keyword) =
120228060Sbapt        sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos)
121228060Sbapt        + len (keyword)
122228060Sbapt
123228060Sbapt   where alpha_unify[c] is chosen so that an upper/lower case change in
124228060Sbapt   keyword[i] doesn't change  alpha_unify[keyword[i] + alpha_inc[i]].
125228060Sbapt */
126228060Sbapt
127228060Sbapt/* ==================== Initialization and Preparation ===================== */
128228060Sbapt
129228060SbaptSearch::Search (KeywordExt_List *list)
130228060Sbapt  : _head (list)
131228060Sbapt{
132228060Sbapt}
133228060Sbapt
134228060Sbaptvoid
135228060SbaptSearch::prepare ()
136228060Sbapt{
137228060Sbapt  /* Compute the total number of keywords.  */
138228060Sbapt  _total_keys = 0;
139228060Sbapt  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
140228060Sbapt    _total_keys++;
141228060Sbapt
142228060Sbapt  /* Compute the minimum and maximum keyword length.  */
143228060Sbapt  _max_key_len = INT_MIN;
144228060Sbapt  _min_key_len = INT_MAX;
145228060Sbapt  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
146228060Sbapt    {
147228060Sbapt      KeywordExt *keyword = temp->first();
148228060Sbapt
149228060Sbapt      if (_max_key_len < keyword->_allchars_length)
150228060Sbapt        _max_key_len = keyword->_allchars_length;
151228060Sbapt      if (_min_key_len > keyword->_allchars_length)
152228060Sbapt        _min_key_len = keyword->_allchars_length;
153228060Sbapt    }
154228060Sbapt
155228060Sbapt  /* Exit program if an empty string is used as keyword, since the comparison
156228060Sbapt     expressions don't work correctly for looking up an empty string.  */
157228060Sbapt  if (_min_key_len == 0)
158228060Sbapt    {
159228060Sbapt      fprintf (stderr, "Empty input keyword is not allowed.\n"
160228060Sbapt               "To recognize an empty input keyword, your code should check for\n"
161228060Sbapt               "len == 0 before calling the gperf generated lookup function.\n");
162228060Sbapt      exit (1);
163228060Sbapt    }
164228060Sbapt
165228060Sbapt  /* Exit program if the characters in the keywords are not in the required
166228060Sbapt     range.  */
167228060Sbapt  if (option[SEVENBIT])
168228060Sbapt    for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
169228060Sbapt      {
170228060Sbapt        KeywordExt *keyword = temp->first();
171228060Sbapt
172228060Sbapt        const char *k = keyword->_allchars;
173228060Sbapt        for (int i = keyword->_allchars_length; i > 0; k++, i--)
174228060Sbapt          if (!(static_cast<unsigned char>(*k) < 128))
175228060Sbapt            {
176228060Sbapt              fprintf (stderr, "Option --seven-bit has been specified,\n"
177228060Sbapt                       "but keyword \"%.*s\" contains non-ASCII characters.\n"
178228060Sbapt                       "Try removing option --seven-bit.\n",
179228060Sbapt                       keyword->_allchars_length, keyword->_allchars);
180228060Sbapt              exit (1);
181228060Sbapt            }
182228060Sbapt      }
183228060Sbapt}
184228060Sbapt
185228060Sbapt/* ====================== Finding good byte positions ====================== */
186228060Sbapt
187228060Sbapt/* Computes the upper bound on the indices passed to asso_values[],
188228060Sbapt   assuming no alpha_increments.  */
189228060Sbaptunsigned int
190228060SbaptSearch::compute_alpha_size () const
191228060Sbapt{
192228060Sbapt  return (option[SEVENBIT] ? 128 : 256);
193228060Sbapt}
194228060Sbapt
195228060Sbapt/* Computes the unification rules between different asso_values[c],
196228060Sbapt   assuming no alpha_increments.  */
197228060Sbaptunsigned int *
198228060SbaptSearch::compute_alpha_unify () const
199228060Sbapt{
200228060Sbapt  if (option[UPPERLOWER])
201228060Sbapt    {
202228060Sbapt      /* Uppercase to lowercase mapping.  */
203228060Sbapt      unsigned int alpha_size = compute_alpha_size();
204228060Sbapt      unsigned int *alpha_unify = new unsigned int[alpha_size];
205228060Sbapt      for (unsigned int c = 0; c < alpha_size; c++)
206228060Sbapt        alpha_unify[c] = c;
207228060Sbapt      for (unsigned int c = 'A'; c <= 'Z'; c++)
208228060Sbapt        alpha_unify[c] = c + ('a'-'A');
209228060Sbapt      return alpha_unify;
210228060Sbapt    }
211228060Sbapt  else
212228060Sbapt    /* Identity mapping.  */
213228060Sbapt    return NULL;
214228060Sbapt}
215228060Sbapt
216228060Sbapt/* Initializes each keyword's _selchars array.  */
217228060Sbaptvoid
218228060SbaptSearch::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const
219228060Sbapt{
220228060Sbapt  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
221228060Sbapt    temp->first()->init_selchars_tuple(positions, alpha_unify);
222228060Sbapt}
223228060Sbapt
224228060Sbapt/* Deletes each keyword's _selchars array.  */
225228060Sbaptvoid
226228060SbaptSearch::delete_selchars () const
227228060Sbapt{
228228060Sbapt  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
229228060Sbapt    temp->first()->delete_selchars();
230228060Sbapt}
231228060Sbapt
232228060Sbapt/* Count the duplicate keywords that occur with a given set of positions.
233228060Sbapt   In other words, it returns the difference
234228060Sbapt     # K - # proj1 (K)
235228060Sbapt   where K is the multiset of given keywords.  */
236228060Sbaptunsigned int
237228060SbaptSearch::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const
238228060Sbapt{
239228060Sbapt  /* Run through the keyword list and count the duplicates incrementally.
240228060Sbapt     The result does not depend on the order of the keyword list, thanks to
241228060Sbapt     the formula above.  */
242228060Sbapt  init_selchars_tuple (positions, alpha_unify);
243228060Sbapt
244228060Sbapt  unsigned int count = 0;
245228060Sbapt  {
246228060Sbapt    Hash_Table representatives (_total_keys, option[NOLENGTH]);
247228060Sbapt    for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
248228060Sbapt      {
249228060Sbapt        KeywordExt *keyword = temp->first();
250228060Sbapt        if (representatives.insert (keyword))
251228060Sbapt          count++;
252228060Sbapt      }
253228060Sbapt  }
254228060Sbapt
255228060Sbapt  delete_selchars ();
256228060Sbapt
257228060Sbapt  return count;
258228060Sbapt}
259228060Sbapt
260228060Sbapt/* Find good key positions.  */
261228060Sbapt
262228060Sbaptvoid
263228060SbaptSearch::find_positions ()
264228060Sbapt{
265228060Sbapt  /* If the user gave the key positions, we use them.  */
266228060Sbapt  if (option[POSITIONS])
267228060Sbapt    {
268228060Sbapt      _key_positions = option.get_key_positions();
269228060Sbapt      return;
270228060Sbapt    }
271228060Sbapt
272228060Sbapt  /* Compute preliminary alpha_unify table.  */
273228060Sbapt  unsigned int *alpha_unify = compute_alpha_unify ();
274228060Sbapt
275228060Sbapt  /* 1. Find positions that must occur in order to distinguish duplicates.  */
276228060Sbapt  Positions mandatory;
277228060Sbapt
278228060Sbapt  if (!option[DUP])
279228060Sbapt    {
280228060Sbapt      for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest())
281228060Sbapt        {
282228060Sbapt          KeywordExt *keyword1 = l1->first();
283228060Sbapt          for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest())
284228060Sbapt            {
285228060Sbapt              KeywordExt *keyword2 = l2->first();
286228060Sbapt
287228060Sbapt              /* If keyword1 and keyword2 have the same length and differ
288228060Sbapt                 in just one position, and it is not the last character,
289228060Sbapt                 this position is mandatory.  */
290228060Sbapt              if (keyword1->_allchars_length == keyword2->_allchars_length)
291228060Sbapt                {
292228060Sbapt                  int n = keyword1->_allchars_length;
293228060Sbapt                  int i;
294228060Sbapt                  for (i = 0; i < n - 1; i++)
295228060Sbapt                    {
296228060Sbapt                      unsigned char c1 = keyword1->_allchars[i];
297228060Sbapt                      unsigned char c2 = keyword2->_allchars[i];
298228060Sbapt                      if (option[UPPERLOWER])
299228060Sbapt                        {
300228060Sbapt                          if (c1 >= 'A' && c1 <= 'Z')
301228060Sbapt                            c1 += 'a' - 'A';
302228060Sbapt                          if (c2 >= 'A' && c2 <= 'Z')
303228060Sbapt                            c2 += 'a' - 'A';
304228060Sbapt                        }
305228060Sbapt                      if (c1 != c2)
306228060Sbapt                        break;
307228060Sbapt                    }
308228060Sbapt                  if (i < n - 1)
309228060Sbapt                    {
310228060Sbapt                      int j;
311228060Sbapt                      for (j = i + 1; j < n; j++)
312228060Sbapt                        {
313228060Sbapt                          unsigned char c1 = keyword1->_allchars[j];
314228060Sbapt                          unsigned char c2 = keyword2->_allchars[j];
315228060Sbapt                          if (option[UPPERLOWER])
316228060Sbapt                            {
317228060Sbapt                              if (c1 >= 'A' && c1 <= 'Z')
318228060Sbapt                                c1 += 'a' - 'A';
319228060Sbapt                              if (c2 >= 'A' && c2 <= 'Z')
320228060Sbapt                                c2 += 'a' - 'A';
321228060Sbapt                            }
322228060Sbapt                          if (c1 != c2)
323228060Sbapt                            break;
324228060Sbapt                        }
325228060Sbapt                      if (j >= n)
326228060Sbapt                        {
327228060Sbapt                          /* Position i is mandatory.  */
328228060Sbapt                          if (!mandatory.contains (i))
329228060Sbapt                            mandatory.add (i);
330228060Sbapt                        }
331228060Sbapt                    }
332228060Sbapt                }
333228060Sbapt            }
334228060Sbapt        }
335228060Sbapt    }
336228060Sbapt
337228060Sbapt  /* 2. Add positions, as long as this decreases the duplicates count.  */
338228060Sbapt  int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1
339228060Sbapt              ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1);
340228060Sbapt  Positions current = mandatory;
341228060Sbapt  unsigned int current_duplicates_count =
342228060Sbapt    count_duplicates_tuple (current, alpha_unify);
343228060Sbapt  for (;;)
344228060Sbapt    {
345228060Sbapt      Positions best;
346228060Sbapt      unsigned int best_duplicates_count = UINT_MAX;
347228060Sbapt
348228060Sbapt      for (int i = imax; i >= -1; i--)
349228060Sbapt        if (!current.contains (i))
350228060Sbapt          {
351228060Sbapt            Positions tryal = current;
352228060Sbapt            tryal.add (i);
353228060Sbapt            unsigned int try_duplicates_count =
354228060Sbapt              count_duplicates_tuple (tryal, alpha_unify);
355228060Sbapt
356228060Sbapt            /* We prefer 'try' to 'best' if it produces less duplicates,
357228060Sbapt               or if it produces the same number of duplicates but with
358228060Sbapt               a more efficient hash function.  */
359228060Sbapt            if (try_duplicates_count < best_duplicates_count
360228060Sbapt                || (try_duplicates_count == best_duplicates_count && i >= 0))
361228060Sbapt              {
362228060Sbapt                best = tryal;
363228060Sbapt                best_duplicates_count = try_duplicates_count;
364228060Sbapt              }
365228060Sbapt          }
366228060Sbapt
367228060Sbapt      /* Stop adding positions when it gives no improvement.  */
368228060Sbapt      if (best_duplicates_count >= current_duplicates_count)
369228060Sbapt        break;
370228060Sbapt
371228060Sbapt      current = best;
372228060Sbapt      current_duplicates_count = best_duplicates_count;
373228060Sbapt    }
374228060Sbapt
375228060Sbapt  /* 3. Remove positions, as long as this doesn't increase the duplicates
376228060Sbapt     count.  */
377228060Sbapt  for (;;)
378228060Sbapt    {
379228060Sbapt      Positions best;
380228060Sbapt      unsigned int best_duplicates_count = UINT_MAX;
381228060Sbapt
382228060Sbapt      for (int i = imax; i >= -1; i--)
383228060Sbapt        if (current.contains (i) && !mandatory.contains (i))
384228060Sbapt          {
385228060Sbapt            Positions tryal = current;
386228060Sbapt            tryal.remove (i);
387228060Sbapt            unsigned int try_duplicates_count =
388228060Sbapt              count_duplicates_tuple (tryal, alpha_unify);
389228060Sbapt
390228060Sbapt            /* We prefer 'try' to 'best' if it produces less duplicates,
391228060Sbapt               or if it produces the same number of duplicates but with
392228060Sbapt               a more efficient hash function.  */
393228060Sbapt            if (try_duplicates_count < best_duplicates_count
394228060Sbapt                || (try_duplicates_count == best_duplicates_count && i == -1))
395228060Sbapt              {
396228060Sbapt                best = tryal;
397228060Sbapt                best_duplicates_count = try_duplicates_count;
398228060Sbapt              }
399228060Sbapt          }
400228060Sbapt
401228060Sbapt      /* Stop removing positions when it gives no improvement.  */
402228060Sbapt      if (best_duplicates_count > current_duplicates_count)
403228060Sbapt        break;
404228060Sbapt
405228060Sbapt      current = best;
406228060Sbapt      current_duplicates_count = best_duplicates_count;
407228060Sbapt    }
408228060Sbapt
409228060Sbapt  /* 4. Replace two positions by one, as long as this doesn't increase the
410228060Sbapt     duplicates count.  */
411228060Sbapt  for (;;)
412228060Sbapt    {
413228060Sbapt      Positions best;
414228060Sbapt      unsigned int best_duplicates_count = UINT_MAX;
415228060Sbapt
416228060Sbapt      for (int i1 = imax; i1 >= -1; i1--)
417228060Sbapt        if (current.contains (i1) && !mandatory.contains (i1))
418228060Sbapt          for (int i2 = imax; i2 >= -1; i2--)
419228060Sbapt            if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1)
420228060Sbapt              for (int i3 = imax; i3 >= 0; i3--)
421228060Sbapt                if (!current.contains (i3))
422228060Sbapt                  {
423228060Sbapt                    Positions tryal = current;
424228060Sbapt                    tryal.remove (i1);
425228060Sbapt                    tryal.remove (i2);
426228060Sbapt                    tryal.add (i3);
427228060Sbapt                    unsigned int try_duplicates_count =
428228060Sbapt                      count_duplicates_tuple (tryal, alpha_unify);
429228060Sbapt
430228060Sbapt                    /* We prefer 'try' to 'best' if it produces less duplicates,
431228060Sbapt                       or if it produces the same number of duplicates but with
432228060Sbapt                       a more efficient hash function.  */
433228060Sbapt                    if (try_duplicates_count < best_duplicates_count
434228060Sbapt                        || (try_duplicates_count == best_duplicates_count
435228060Sbapt                            && (i1 == -1 || i2 == -1 || i3 >= 0)))
436228060Sbapt                      {
437228060Sbapt                        best = tryal;
438228060Sbapt                        best_duplicates_count = try_duplicates_count;
439228060Sbapt                      }
440228060Sbapt                  }
441228060Sbapt
442228060Sbapt      /* Stop removing positions when it gives no improvement.  */
443228060Sbapt      if (best_duplicates_count > current_duplicates_count)
444228060Sbapt        break;
445228060Sbapt
446228060Sbapt      current = best;
447228060Sbapt      current_duplicates_count = best_duplicates_count;
448228060Sbapt    }
449228060Sbapt
450228060Sbapt  /* That's it.  Hope it's good enough.  */
451228060Sbapt  _key_positions = current;
452228060Sbapt
453228060Sbapt  if (option[DEBUG])
454228060Sbapt    {
455228060Sbapt      /* Print the result.  */
456228060Sbapt      fprintf (stderr, "\nComputed positions: ");
457228060Sbapt      PositionReverseIterator iter = _key_positions.reviterator();
458228060Sbapt      bool seen_lastchar = false;
459228060Sbapt      bool first = true;
460228060Sbapt      for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; )
461228060Sbapt        {
462228060Sbapt          if (!first)
463228060Sbapt            fprintf (stderr, ", ");
464228060Sbapt          if (i == Positions::LASTCHAR)
465228060Sbapt            seen_lastchar = true;
466228060Sbapt          else
467228060Sbapt            {
468228060Sbapt              fprintf (stderr, "%d", i + 1);
469228060Sbapt              first = false;
470228060Sbapt            }
471228060Sbapt        }
472228060Sbapt      if (seen_lastchar)
473228060Sbapt        {
474228060Sbapt          if (!first)
475228060Sbapt            fprintf (stderr, ", ");
476228060Sbapt          fprintf (stderr, "$");
477228060Sbapt        }
478228060Sbapt      fprintf (stderr, "\n");
479228060Sbapt    }
480228060Sbapt
481228060Sbapt  /* Free preliminary alpha_unify table.  */
482228060Sbapt  delete[] alpha_unify;
483228060Sbapt}
484228060Sbapt
485228060Sbapt/* Count the duplicate keywords that occur with the found set of positions.
486228060Sbapt   In other words, it returns the difference
487228060Sbapt     # K - # proj1 (K)
488228060Sbapt   where K is the multiset of given keywords.  */
489228060Sbaptunsigned int
490228060SbaptSearch::count_duplicates_tuple () const
491228060Sbapt{
492228060Sbapt  unsigned int *alpha_unify = compute_alpha_unify ();
493228060Sbapt  unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify);
494228060Sbapt  delete[] alpha_unify;
495228060Sbapt  return count;
496228060Sbapt}
497228060Sbapt
498228060Sbapt/* ===================== Finding good alpha increments ===================== */
499228060Sbapt
500228060Sbapt/* Computes the upper bound on the indices passed to asso_values[].  */
501228060Sbaptunsigned int
502228060SbaptSearch::compute_alpha_size (const unsigned int *alpha_inc) const
503228060Sbapt{
504228060Sbapt  unsigned int max_alpha_inc = 0;
505228060Sbapt  for (int i = 0; i < _max_key_len; i++)
506228060Sbapt    if (max_alpha_inc < alpha_inc[i])
507228060Sbapt      max_alpha_inc = alpha_inc[i];
508228060Sbapt  return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
509228060Sbapt}
510228060Sbapt
511228060Sbapt/* Computes the unification rules between different asso_values[c].  */
512228060Sbaptunsigned int *
513228060SbaptSearch::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
514228060Sbapt{
515228060Sbapt  if (option[UPPERLOWER])
516228060Sbapt    {
517228060Sbapt      /* Without alpha increments, we would simply unify
518228060Sbapt           'A' -> 'a', ..., 'Z' -> 'z'.
519228060Sbapt         But when a keyword contains at position i a character c,
520228060Sbapt         we have the constraint
521228060Sbapt            asso_values[tolower(c) + alpha_inc[i]] ==
522228060Sbapt            asso_values[toupper(c) + alpha_inc[i]].
523228060Sbapt         This introduces a unification
524228060Sbapt           toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
525228060Sbapt         Note that this unification can extend outside the range of
526228060Sbapt         ASCII letters!  But still every unified character pair is at
527228060Sbapt         a distance of 'a'-'A' = 32, or (after chained unification)
528228060Sbapt         at a multiple of 32.  So in the end the alpha_unify vector has
529228060Sbapt         the form    c -> c + 32 * f(c)   where f(c) is a nonnegative
530228060Sbapt         integer.  */
531228060Sbapt      unsigned int alpha_size = compute_alpha_size (alpha_inc);
532228060Sbapt
533228060Sbapt      unsigned int *alpha_unify = new unsigned int[alpha_size];
534228060Sbapt      for (unsigned int c = 0; c < alpha_size; c++)
535228060Sbapt        alpha_unify[c] = c;
536228060Sbapt
537228060Sbapt      for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
538228060Sbapt        {
539228060Sbapt          KeywordExt *keyword = temp->first();
540228060Sbapt
541228060Sbapt          /* Iterate through the selected character positions.  */
542228060Sbapt          PositionIterator iter = positions.iterator(keyword->_allchars_length);
543228060Sbapt
544228060Sbapt          for (int i; (i = iter.next ()) != PositionIterator::EOS; )
545228060Sbapt            {
546228060Sbapt              unsigned int c;
547228060Sbapt              if (i == Positions::LASTCHAR)
548228060Sbapt                c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
549228060Sbapt              else if (i < keyword->_allchars_length)
550228060Sbapt                c = static_cast<unsigned char>(keyword->_allchars[i]);
551228060Sbapt              else
552228060Sbapt                abort ();
553228060Sbapt              if (c >= 'A' && c <= 'Z')
554228060Sbapt                c += 'a' - 'A';
555228060Sbapt              if (c >= 'a' && c <= 'z')
556228060Sbapt                {
557228060Sbapt                  if (i != Positions::LASTCHAR)
558228060Sbapt                    c += alpha_inc[i];
559228060Sbapt                  /* Unify c with c - ('a'-'A').  */
560228060Sbapt                  unsigned int d = alpha_unify[c];
561228060Sbapt                  unsigned int b = c - ('a'-'A');
562228060Sbapt                  for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
563228060Sbapt                    alpha_unify[a] = d;
564228060Sbapt                }
565228060Sbapt            }
566228060Sbapt        }
567228060Sbapt      return alpha_unify;
568228060Sbapt    }
569228060Sbapt  else
570228060Sbapt    /* Identity mapping.  */
571228060Sbapt    return NULL;
572228060Sbapt}
573228060Sbapt
574228060Sbapt/* Initializes each keyword's _selchars array.  */
575228060Sbaptvoid
576228060SbaptSearch::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const
577228060Sbapt{
578228060Sbapt  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
579228060Sbapt    temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc);
580228060Sbapt}
581228060Sbapt
582228060Sbapt/* Count the duplicate keywords that occur with the given set of positions
583228060Sbapt   and a given alpha_inc[] array.
584228060Sbapt   In other words, it returns the difference
585228060Sbapt     # K - # proj2 (proj1 (K))
586228060Sbapt   where K is the multiset of given keywords.  */
587228060Sbaptunsigned int
588228060SbaptSearch::count_duplicates_multiset (const unsigned int *alpha_inc) const
589228060Sbapt{
590228060Sbapt  /* Run through the keyword list and count the duplicates incrementally.
591228060Sbapt     The result does not depend on the order of the keyword list, thanks to
592228060Sbapt     the formula above.  */
593228060Sbapt  unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc);
594228060Sbapt  init_selchars_multiset (_key_positions, alpha_unify, alpha_inc);
595228060Sbapt
596228060Sbapt  unsigned int count = 0;
597228060Sbapt  {
598228060Sbapt    Hash_Table representatives (_total_keys, option[NOLENGTH]);
599228060Sbapt    for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
600228060Sbapt      {
601228060Sbapt        KeywordExt *keyword = temp->first();
602228060Sbapt        if (representatives.insert (keyword))
603228060Sbapt          count++;
604228060Sbapt      }
605228060Sbapt  }
606228060Sbapt
607228060Sbapt  delete_selchars ();
608228060Sbapt  delete[] alpha_unify;
609228060Sbapt
610228060Sbapt  return count;
611228060Sbapt}
612228060Sbapt
613228060Sbapt/* Find good _alpha_inc[].  */
614228060Sbapt
615228060Sbaptvoid
616228060SbaptSearch::find_alpha_inc ()
617228060Sbapt{
618228060Sbapt  /* The goal is to choose _alpha_inc[] such that it doesn't introduce
619228060Sbapt     artificial duplicates.
620228060Sbapt     In other words, the goal is  # proj2 (proj1 (K)) = # proj1 (K).  */
621228060Sbapt  unsigned int duplicates_goal = count_duplicates_tuple ();
622228060Sbapt
623228060Sbapt  /* Start with zero increments.  This is sufficient in most cases.  */
624228060Sbapt  unsigned int *current = new unsigned int [_max_key_len];
625228060Sbapt  for (int i = 0; i < _max_key_len; i++)
626228060Sbapt    current[i] = 0;
627228060Sbapt  unsigned int current_duplicates_count = count_duplicates_multiset (current);
628228060Sbapt
629228060Sbapt  if (current_duplicates_count > duplicates_goal)
630228060Sbapt    {
631228060Sbapt      /* Look which _alpha_inc[i] we are free to increment.  */
632228060Sbapt      unsigned int nindices;
633228060Sbapt      {
634228060Sbapt        nindices = 0;
635228060Sbapt        PositionIterator iter = _key_positions.iterator(_max_key_len);
636228060Sbapt        for (;;)
637228060Sbapt          {
638228060Sbapt            int key_pos = iter.next ();
639228060Sbapt            if (key_pos == PositionIterator::EOS)
640228060Sbapt              break;
641228060Sbapt            if (key_pos != Positions::LASTCHAR)
642228060Sbapt              nindices++;
643228060Sbapt          }
644228060Sbapt      }
645228060Sbapt
646228060Sbapt      DYNAMIC_ARRAY (indices, unsigned int, nindices);
647228060Sbapt      {
648228060Sbapt        unsigned int j = 0;
649228060Sbapt        PositionIterator iter = _key_positions.iterator(_max_key_len);
650228060Sbapt        for (;;)
651228060Sbapt          {
652228060Sbapt            int key_pos = iter.next ();
653228060Sbapt            if (key_pos == PositionIterator::EOS)
654228060Sbapt              break;
655228060Sbapt            if (key_pos != Positions::LASTCHAR)
656228060Sbapt              indices[j++] = key_pos;
657228060Sbapt          }
658228060Sbapt        if (!(j == nindices))
659228060Sbapt          abort ();
660228060Sbapt      }
661228060Sbapt
662228060Sbapt      /* Perform several rounds of searching for a good alpha increment.
663228060Sbapt         Each round reduces the number of artificial collisions by adding
664228060Sbapt         an increment in a single key position.  */
665228060Sbapt      DYNAMIC_ARRAY (best, unsigned int, _max_key_len);
666228060Sbapt      DYNAMIC_ARRAY (tryal, unsigned int, _max_key_len);
667228060Sbapt      do
668228060Sbapt        {
669228060Sbapt          /* An increment of 1 is not always enough.  Try higher increments
670228060Sbapt             also.  */
671228060Sbapt          for (unsigned int inc = 1; ; inc++)
672228060Sbapt            {
673228060Sbapt              unsigned int best_duplicates_count = UINT_MAX;
674228060Sbapt
675228060Sbapt              for (unsigned int j = 0; j < nindices; j++)
676228060Sbapt                {
677228060Sbapt                  memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
678228060Sbapt                  tryal[indices[j]] += inc;
679228060Sbapt                  unsigned int try_duplicates_count =
680228060Sbapt                    count_duplicates_multiset (tryal);
681228060Sbapt
682228060Sbapt                  /* We prefer 'try' to 'best' if it produces less
683228060Sbapt                     duplicates.  */
684228060Sbapt                  if (try_duplicates_count < best_duplicates_count)
685228060Sbapt                    {
686228060Sbapt                      memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
687228060Sbapt                      best_duplicates_count = try_duplicates_count;
688228060Sbapt                    }
689228060Sbapt                }
690228060Sbapt
691228060Sbapt              /* Stop this round when we got an improvement.  */
692228060Sbapt              if (best_duplicates_count < current_duplicates_count)
693228060Sbapt                {
694228060Sbapt                  memcpy (current, best, _max_key_len * sizeof (unsigned int));
695228060Sbapt                  current_duplicates_count = best_duplicates_count;
696228060Sbapt                  break;
697228060Sbapt                }
698228060Sbapt            }
699228060Sbapt        }
700228060Sbapt      while (current_duplicates_count > duplicates_goal);
701228060Sbapt      FREE_DYNAMIC_ARRAY (tryal);
702228060Sbapt      FREE_DYNAMIC_ARRAY (best);
703228060Sbapt
704228060Sbapt      if (option[DEBUG])
705228060Sbapt        {
706228060Sbapt          /* Print the result.  */
707228060Sbapt          fprintf (stderr, "\nComputed alpha increments: ");
708228060Sbapt          bool first = true;
709228060Sbapt          for (unsigned int j = nindices; j-- > 0; )
710228060Sbapt            if (current[indices[j]] != 0)
711228060Sbapt              {
712228060Sbapt                if (!first)
713228060Sbapt                  fprintf (stderr, ", ");
714228060Sbapt                fprintf (stderr, "%u:+%u",
715228060Sbapt                         indices[j] + 1, current[indices[j]]);
716228060Sbapt                first = false;
717228060Sbapt              }
718228060Sbapt          fprintf (stderr, "\n");
719228060Sbapt        }
720228060Sbapt      FREE_DYNAMIC_ARRAY (indices);
721228060Sbapt    }
722228060Sbapt
723228060Sbapt  _alpha_inc = current;
724228060Sbapt  _alpha_size = compute_alpha_size (_alpha_inc);
725228060Sbapt  _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
726228060Sbapt}
727228060Sbapt
728228060Sbapt/* ======================= Finding good asso_values ======================== */
729228060Sbapt
730228060Sbapt/* Initializes the asso_values[] related parameters.  */
731228060Sbapt
732228060Sbaptvoid
733228060SbaptSearch::prepare_asso_values ()
734228060Sbapt{
735228060Sbapt  KeywordExt_List *temp;
736228060Sbapt
737228060Sbapt  /* Initialize each keyword's _selchars array.  */
738228060Sbapt  init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc);
739228060Sbapt
740228060Sbapt  /* Compute the maximum _selchars_length over all keywords.  */
741228060Sbapt  _max_selchars_length = _key_positions.iterator(_max_key_len).remaining();
742228060Sbapt
743228060Sbapt  /* Check for duplicates, i.e. keywords with the same _selchars array
744228060Sbapt     (and - if !option[NOLENGTH] - also the same length).
745228060Sbapt     We deal with these by building an equivalence class, so that only
746228060Sbapt     1 keyword is representative of the entire collection.  Only this
747228060Sbapt     representative remains in the keyword list; the others are accessible
748228060Sbapt     through the _duplicate_link chain, starting at the representative.
749228060Sbapt     This *greatly* simplifies processing during later stages of the program.
750228060Sbapt     Set _total_duplicates and _list_len = _total_keys - _total_duplicates.  */
751228060Sbapt  {
752228060Sbapt    _list_len = _total_keys;
753228060Sbapt    _total_duplicates = 0;
754228060Sbapt    /* Make hash table for efficiency.  */
755228060Sbapt    Hash_Table representatives (_list_len, option[NOLENGTH]);
756228060Sbapt
757228060Sbapt    KeywordExt_List *prev = NULL; /* list node before temp */
758228060Sbapt    for (temp = _head; temp; )
759228060Sbapt      {
760228060Sbapt        KeywordExt *keyword = temp->first();
761228060Sbapt        KeywordExt *other_keyword = representatives.insert (keyword);
762228060Sbapt        KeywordExt_List *garbage = NULL;
763228060Sbapt
764228060Sbapt        if (other_keyword)
765228060Sbapt          {
766228060Sbapt            _total_duplicates++;
767228060Sbapt            _list_len--;
768228060Sbapt            /* Remove keyword from the main list.  */
769228060Sbapt            prev->rest() = temp->rest();
770228060Sbapt            garbage = temp;
771228060Sbapt            /* And insert it on other_keyword's duplicate list.  */
772228060Sbapt            keyword->_duplicate_link = other_keyword->_duplicate_link;
773228060Sbapt            other_keyword->_duplicate_link = keyword;
774228060Sbapt
775228060Sbapt            /* Complain if user hasn't enabled the duplicate option. */
776228060Sbapt            if (!option[DUP] || option[DEBUG])
777228060Sbapt              {
778228060Sbapt                fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"",
779228060Sbapt                         keyword->_allchars_length, keyword->_allchars,
780228060Sbapt                         other_keyword->_allchars_length, other_keyword->_allchars);
781228060Sbapt                for (int j = 0; j < keyword->_selchars_length; j++)
782228060Sbapt                  putc (keyword->_selchars[j], stderr);
783228060Sbapt                fprintf (stderr, "\".\n");
784228060Sbapt              }
785228060Sbapt          }
786228060Sbapt        else
787228060Sbapt          {
788228060Sbapt            keyword->_duplicate_link = NULL;
789228060Sbapt            prev = temp;
790228060Sbapt          }
791228060Sbapt        temp = temp->rest();
792228060Sbapt        if (garbage)
793228060Sbapt          delete garbage;
794228060Sbapt      }
795228060Sbapt    if (option[DEBUG])
796228060Sbapt      representatives.dump();
797228060Sbapt  }
798228060Sbapt
799228060Sbapt  /* Exit program if duplicates exists and option[DUP] not set, since we
800228060Sbapt     don't want to continue in this case.  (We don't want to turn on
801228060Sbapt     option[DUP] implicitly, because the generated code is usually much
802228060Sbapt     slower.  */
803228060Sbapt  if (_total_duplicates)
804228060Sbapt    {
805228060Sbapt      if (option[DUP])
806228060Sbapt        fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n",
807228060Sbapt                         _total_duplicates);
808228060Sbapt      else
809228060Sbapt        {
810228060Sbapt          fprintf (stderr, "%d input keys have identical hash values,\n",
811228060Sbapt                           _total_duplicates);
812228060Sbapt          if (option[POSITIONS])
813228060Sbapt            fprintf (stderr, "try different key positions or use option -D.\n");
814228060Sbapt          else
815228060Sbapt            fprintf (stderr, "use option -D.\n");
816228060Sbapt          exit (1);
817228060Sbapt        }
818228060Sbapt    }
819228060Sbapt
820228060Sbapt  /* Compute the occurrences of each character in the alphabet.  */
821228060Sbapt  _occurrences = new int[_alpha_size];
822228060Sbapt  memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
823228060Sbapt  for (temp = _head; temp; temp = temp->rest())
824228060Sbapt    {
825228060Sbapt      KeywordExt *keyword = temp->first();
826228060Sbapt      const unsigned int *ptr = keyword->_selchars;
827228060Sbapt      for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
828228060Sbapt        _occurrences[*ptr]++;
829228060Sbapt    }
830228060Sbapt
831228060Sbapt  /* Memory allocation.  */
832228060Sbapt  _asso_values = new int[_alpha_size];
833228060Sbapt
834228060Sbapt  int non_linked_length = _list_len;
835228060Sbapt  unsigned int asso_value_max;
836228060Sbapt
837228060Sbapt  asso_value_max =
838228060Sbapt    static_cast<unsigned int>(non_linked_length * option.get_size_multiple());
839228060Sbapt  /* Round up to the next power of two.  This makes it easy to ensure
840228060Sbapt     an _asso_value[c] is >= 0 and < asso_value_max.  Also, the jump value
841228060Sbapt     being odd, it guarantees that Search::try_asso_value() will iterate
842228060Sbapt     through different values for _asso_value[c].  */
843228060Sbapt  if (asso_value_max == 0)
844228060Sbapt    asso_value_max = 1;
845228060Sbapt  asso_value_max |= asso_value_max >> 1;
846228060Sbapt  asso_value_max |= asso_value_max >> 2;
847228060Sbapt  asso_value_max |= asso_value_max >> 4;
848228060Sbapt  asso_value_max |= asso_value_max >> 8;
849228060Sbapt  asso_value_max |= asso_value_max >> 16;
850228060Sbapt  asso_value_max++;
851228060Sbapt  _asso_value_max = asso_value_max;
852228060Sbapt
853228060Sbapt  /* Given the bound for _asso_values[c], we have a bound for the possible
854228060Sbapt     hash values, as computed in compute_hash().  */
855228060Sbapt  _max_hash_value = (option[NOLENGTH] ? 0 : _max_key_len)
856228060Sbapt                    + (_asso_value_max - 1) * _max_selchars_length;
857228060Sbapt  /* Allocate a sparse bit vector for detection of collisions of hash
858228060Sbapt     values.  */
859228060Sbapt  _collision_detector = new Bool_Array (_max_hash_value + 1);
860228060Sbapt
861228060Sbapt  if (option[DEBUG])
862228060Sbapt    {
863228060Sbapt      fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
864228060Sbapt               "\nmaximum size of generated hash table is %d\n",
865228060Sbapt               non_linked_length, asso_value_max, _max_hash_value);
866228060Sbapt
867228060Sbapt      int field_width;
868228060Sbapt
869228060Sbapt      field_width = 0;
870228060Sbapt      {
871228060Sbapt        for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
872228060Sbapt          {
873228060Sbapt            KeywordExt *keyword = temp->first();
874228060Sbapt            if (field_width < keyword->_selchars_length)
875228060Sbapt              field_width = keyword->_selchars_length;
876228060Sbapt          }
877228060Sbapt      }
878228060Sbapt
879228060Sbapt      fprintf (stderr, "\ndumping the keyword list without duplicates\n");
880228060Sbapt      fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
881228060Sbapt      int i = 0;
882228060Sbapt      for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
883228060Sbapt        {
884228060Sbapt          KeywordExt *keyword = temp->first();
885228060Sbapt          fprintf (stderr, "%9d, ", ++i);
886228060Sbapt          if (field_width > keyword->_selchars_length)
887228060Sbapt            fprintf (stderr, "%*s", field_width - keyword->_selchars_length, "");
888228060Sbapt          for (int j = 0; j < keyword->_selchars_length; j++)
889228060Sbapt            putc (keyword->_selchars[j], stderr);
890228060Sbapt          fprintf (stderr, ", %.*s\n",
891228060Sbapt                   keyword->_allchars_length, keyword->_allchars);
892228060Sbapt        }
893228060Sbapt      fprintf (stderr, "\nend of keyword list\n\n");
894228060Sbapt    }
895228060Sbapt
896228060Sbapt  if (option[RANDOM] || option.get_jump () == 0)
897228060Sbapt    /* We will use rand(), so initialize the random number generator.  */
898228060Sbapt    srand (static_cast<long>(time (0)));
899228060Sbapt
900228060Sbapt  _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ());
901228060Sbapt  _jump = option.get_jump ();
902228060Sbapt}
903228060Sbapt
904228060Sbapt/* Finds some _asso_values[] that fit.  */
905228060Sbapt
906228060Sbapt/* The idea is to choose the _asso_values[] one by one, in a way that
907228060Sbapt   a choice that has been made never needs to be undone later.  This
908228060Sbapt   means that we split the work into several steps.  Each step chooses
909228060Sbapt   one or more _asso_values[c].  The result of choosing one or more
910228060Sbapt   _asso_values[c] is that the partitioning of the keyword set gets
911228060Sbapt   broader.
912228060Sbapt   Look at this partitioning:  After every step, the _asso_values[] of a
913228060Sbapt   certain set C of characters are undetermined.  (At the beginning, C
914228060Sbapt   is the set of characters c with _occurrences[c] > 0.  At the end, C
915228060Sbapt   is empty.)  To each keyword K, we associate the multiset of _selchars
916228060Sbapt   for which the _asso_values[] are undetermined:
917228060Sbapt                    K  -->  K->_selchars intersect C.
918228060Sbapt   Consider two keywords equivalent if their value under this mapping is
919228060Sbapt   the same.  This introduces an equivalence relation on the set of
920228060Sbapt   keywords.  The equivalence classes partition the keyword set.  (At the
921228060Sbapt   beginning, the partition is the finest possible: each K is an equivalence
922228060Sbapt   class by itself, because all K have a different _selchars.  At the end,
923228060Sbapt   all K have been merged into a single equivalence class.)
924228060Sbapt   The partition before a step is always a refinement of the partition
925228060Sbapt   after the step.
926228060Sbapt   We choose the steps in such a way that the partition really becomes
927228060Sbapt   broader at each step.  (A step that only chooses an _asso_values[c]
928228060Sbapt   without changing the partition is better merged with the previous step,
929228060Sbapt   to avoid useless backtracking.)  */
930228060Sbapt
931228060Sbaptstruct EquivalenceClass
932228060Sbapt{
933228060Sbapt  /* The keywords in this equivalence class.  */
934228060Sbapt  KeywordExt_List *     _keywords;
935228060Sbapt  KeywordExt_List *     _keywords_last;
936228060Sbapt  /* The number of keywords in this equivalence class.  */
937228060Sbapt  unsigned int          _cardinality;
938228060Sbapt  /* The undetermined selected characters for the keywords in this
939228060Sbapt     equivalence class, as a canonically reordered multiset.  */
940228060Sbapt  unsigned int *        _undetermined_chars;
941228060Sbapt  unsigned int          _undetermined_chars_length;
942228060Sbapt
943228060Sbapt  EquivalenceClass *    _next;
944228060Sbapt};
945228060Sbapt
946228060Sbaptstruct Step
947228060Sbapt{
948228060Sbapt  /* The characters whose values are being determined in this step.  */
949228060Sbapt  unsigned int          _changing_count;
950228060Sbapt  unsigned int *        _changing;
951228060Sbapt  /* Exclusive upper bound for the _asso_values[c] of this step.
952228060Sbapt     A power of 2.  */
953228060Sbapt  unsigned int          _asso_value_max;
954228060Sbapt  /* The characters whose values will be determined after this step.  */
955228060Sbapt  bool *                _undetermined;
956228060Sbapt  /* The keyword set partition after this step.  */
957228060Sbapt  EquivalenceClass *    _partition;
958228060Sbapt  /* The expected number of iterations in this step.  */
959228060Sbapt  double                _expected_lower;
960228060Sbapt  double                _expected_upper;
961228060Sbapt
962228060Sbapt  Step *                _next;
963228060Sbapt};
964228060Sbapt
965228060Sbaptstatic inline bool
966228060Sbaptequals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len)
967228060Sbapt{
968228060Sbapt  while (len > 0)
969228060Sbapt    {
970228060Sbapt      if (*ptr1 != *ptr2)
971228060Sbapt        return false;
972228060Sbapt      ptr1++;
973228060Sbapt      ptr2++;
974228060Sbapt      len--;
975228060Sbapt    }
976228060Sbapt  return true;
977228060Sbapt}
978228060Sbapt
979228060SbaptEquivalenceClass *
980228060SbaptSearch::compute_partition (bool *undetermined) const
981228060Sbapt{
982228060Sbapt  EquivalenceClass *partition = NULL;
983228060Sbapt  EquivalenceClass *partition_last = NULL;
984228060Sbapt  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
985228060Sbapt    {
986228060Sbapt      KeywordExt *keyword = temp->first();
987228060Sbapt
988228060Sbapt      /* Compute the undetermined characters for this keyword.  */
989228060Sbapt      unsigned int *undetermined_chars =
990228060Sbapt        new unsigned int[keyword->_selchars_length];
991228060Sbapt      unsigned int undetermined_chars_length = 0;
992228060Sbapt
993228060Sbapt      for (int i = 0; i < keyword->_selchars_length; i++)
994228060Sbapt        if (undetermined[keyword->_selchars[i]])
995228060Sbapt          undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i];
996228060Sbapt
997228060Sbapt      /* Look up the equivalence class to which this keyword belongs.  */
998228060Sbapt      EquivalenceClass *equclass;
999228060Sbapt      for (equclass = partition; equclass; equclass = equclass->_next)
1000228060Sbapt        if (equclass->_undetermined_chars_length == undetermined_chars_length
1001228060Sbapt            && equals (equclass->_undetermined_chars, undetermined_chars,
1002228060Sbapt                       undetermined_chars_length))
1003228060Sbapt          break;
1004228060Sbapt      if (equclass == NULL)
1005228060Sbapt        {
1006228060Sbapt          equclass = new EquivalenceClass();
1007228060Sbapt          equclass->_keywords = NULL;
1008228060Sbapt          equclass->_keywords_last = NULL;
1009228060Sbapt          equclass->_cardinality = 0;
1010228060Sbapt          equclass->_undetermined_chars = undetermined_chars;
1011228060Sbapt          equclass->_undetermined_chars_length = undetermined_chars_length;
1012228060Sbapt          equclass->_next = NULL;
1013228060Sbapt          if (partition)
1014228060Sbapt            partition_last->_next = equclass;
1015228060Sbapt          else
1016228060Sbapt            partition = equclass;
1017228060Sbapt          partition_last = equclass;
1018228060Sbapt        }
1019228060Sbapt      else
1020228060Sbapt        delete[] undetermined_chars;
1021228060Sbapt
1022228060Sbapt      /* Add the keyword to the equivalence class.  */
1023228060Sbapt      KeywordExt_List *cons = new KeywordExt_List(keyword);
1024228060Sbapt      if (equclass->_keywords)
1025228060Sbapt        equclass->_keywords_last->rest() = cons;
1026228060Sbapt      else
1027228060Sbapt        equclass->_keywords = cons;
1028228060Sbapt      equclass->_keywords_last = cons;
1029228060Sbapt      equclass->_cardinality++;
1030228060Sbapt    }
1031228060Sbapt
1032228060Sbapt  /* Free some of the allocated memory.  The caller doesn't need it.  */
1033228060Sbapt  for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1034228060Sbapt    delete[] cls->_undetermined_chars;
1035228060Sbapt
1036228060Sbapt  return partition;
1037228060Sbapt}
1038228060Sbapt
1039228060Sbaptstatic void
1040228060Sbaptdelete_partition (EquivalenceClass *partition)
1041228060Sbapt{
1042228060Sbapt  while (partition != NULL)
1043228060Sbapt    {
1044228060Sbapt      EquivalenceClass *equclass = partition;
1045228060Sbapt      partition = equclass->_next;
1046228060Sbapt      delete_list (equclass->_keywords);
1047228060Sbapt      //delete[] equclass->_undetermined_chars; // already freed above
1048228060Sbapt      delete equclass;
1049228060Sbapt    }
1050228060Sbapt}
1051228060Sbapt
1052228060Sbapt/* Compute the possible number of collisions when _asso_values[c] is
1053228060Sbapt   chosen, leading to the given partition.  */
1054228060Sbaptunsigned int
1055228060SbaptSearch::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const
1056228060Sbapt{
1057228060Sbapt  /* Every equivalence class p is split according to the frequency of
1058228060Sbapt     occurrence of c, leading to equivalence classes p1, p2, ...
1059228060Sbapt     This leads to   (|p|^2 - |p1|^2 - |p2|^2 - ...)/2  possible collisions.
1060228060Sbapt     Return the sum of this expression over all equivalence classes.  */
1061228060Sbapt  unsigned int sum = 0;
1062228060Sbapt  unsigned int m = _max_selchars_length;
1063228060Sbapt  DYNAMIC_ARRAY (split_cardinalities, unsigned int, m + 1);
1064228060Sbapt  for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1065228060Sbapt    {
1066228060Sbapt      for (unsigned int i = 0; i <= m; i++)
1067228060Sbapt        split_cardinalities[i] = 0;
1068228060Sbapt
1069228060Sbapt      for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1070228060Sbapt        {
1071228060Sbapt          KeywordExt *keyword = temp->first();
1072228060Sbapt
1073228060Sbapt          unsigned int count = 0;
1074228060Sbapt          for (int i = 0; i < keyword->_selchars_length; i++)
1075228060Sbapt            if (keyword->_selchars[i] == c)
1076228060Sbapt              count++;
1077228060Sbapt
1078228060Sbapt          split_cardinalities[count]++;
1079228060Sbapt        }
1080228060Sbapt
1081228060Sbapt      sum += cls->_cardinality * cls->_cardinality;
1082228060Sbapt      for (unsigned int i = 0; i <= m; i++)
1083228060Sbapt        sum -= split_cardinalities[i] * split_cardinalities[i];
1084228060Sbapt    }
1085228060Sbapt  FREE_DYNAMIC_ARRAY (split_cardinalities);
1086228060Sbapt  return sum;
1087228060Sbapt}
1088228060Sbapt
1089228060Sbapt/* Test whether adding c to the undetermined characters changes the given
1090228060Sbapt   partition.  */
1091228060Sbaptbool
1092228060SbaptSearch::unchanged_partition (EquivalenceClass *partition, unsigned int c) const
1093228060Sbapt{
1094228060Sbapt  for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1095228060Sbapt    {
1096228060Sbapt      unsigned int first_count = UINT_MAX;
1097228060Sbapt
1098228060Sbapt      for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1099228060Sbapt        {
1100228060Sbapt          KeywordExt *keyword = temp->first();
1101228060Sbapt
1102228060Sbapt          unsigned int count = 0;
1103228060Sbapt          for (int i = 0; i < keyword->_selchars_length; i++)
1104228060Sbapt            if (keyword->_selchars[i] == c)
1105228060Sbapt              count++;
1106228060Sbapt
1107228060Sbapt          if (temp == cls->_keywords)
1108228060Sbapt            first_count = count;
1109228060Sbapt          else if (count != first_count)
1110228060Sbapt            /* c would split this equivalence class.  */
1111228060Sbapt            return false;
1112228060Sbapt        }
1113228060Sbapt    }
1114228060Sbapt  return true;
1115228060Sbapt}
1116228060Sbapt
1117228060Sbaptvoid
1118228060SbaptSearch::find_asso_values ()
1119228060Sbapt{
1120228060Sbapt  Step *steps;
1121228060Sbapt
1122228060Sbapt  /* Determine the steps, starting with the last one.  */
1123228060Sbapt  {
1124228060Sbapt    bool *undetermined;
1125228060Sbapt    bool *determined;
1126228060Sbapt
1127228060Sbapt    steps = NULL;
1128228060Sbapt
1129228060Sbapt    undetermined = new bool[_alpha_size];
1130228060Sbapt    for (unsigned int c = 0; c < _alpha_size; c++)
1131228060Sbapt      undetermined[c] = false;
1132228060Sbapt
1133228060Sbapt    determined = new bool[_alpha_size];
1134228060Sbapt    for (unsigned int c = 0; c < _alpha_size; c++)
1135228060Sbapt      determined[c] = true;
1136228060Sbapt
1137228060Sbapt    for (;;)
1138228060Sbapt      {
1139228060Sbapt        /* Compute the partition that needs to be refined.  */
1140228060Sbapt        EquivalenceClass *partition = compute_partition (undetermined);
1141228060Sbapt
1142228060Sbapt        /* Determine the main character to be chosen in this step.
1143228060Sbapt           Choosing such a character c has the effect of splitting every
1144228060Sbapt           equivalence class (according the the frequency of occurrence of c).
1145228060Sbapt           We choose the c with the minimum number of possible collisions,
1146228060Sbapt           so that characters which lead to a large number of collisions get
1147228060Sbapt           handled early during the search.  */
1148228060Sbapt        unsigned int chosen_c;
1149228060Sbapt        unsigned int chosen_possible_collisions;
1150228060Sbapt        {
1151228060Sbapt          unsigned int best_c = 0;
1152228060Sbapt          unsigned int best_possible_collisions = UINT_MAX;
1153228060Sbapt          for (unsigned int c = 0; c < _alpha_size; c++)
1154228060Sbapt            if (_occurrences[c] > 0 && determined[c])
1155228060Sbapt              {
1156228060Sbapt                unsigned int possible_collisions =
1157228060Sbapt                  count_possible_collisions (partition, c);
1158228060Sbapt                if (possible_collisions < best_possible_collisions)
1159228060Sbapt                  {
1160228060Sbapt                    best_c = c;
1161228060Sbapt                    best_possible_collisions = possible_collisions;
1162228060Sbapt                  }
1163228060Sbapt              }
1164228060Sbapt          if (best_possible_collisions == UINT_MAX)
1165228060Sbapt            {
1166228060Sbapt              /* All c with _occurrences[c] > 0 are undetermined.  We are
1167228060Sbapt                 are the starting situation and don't need any more step.  */
1168228060Sbapt              delete_partition (partition);
1169228060Sbapt              break;
1170228060Sbapt            }
1171228060Sbapt          chosen_c = best_c;
1172228060Sbapt          chosen_possible_collisions = best_possible_collisions;
1173228060Sbapt        }
1174228060Sbapt
1175228060Sbapt        /* We need one more step.  */
1176228060Sbapt        Step *step = new Step();
1177228060Sbapt
1178228060Sbapt        step->_undetermined = new bool[_alpha_size];
1179228060Sbapt        memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool));
1180228060Sbapt
1181228060Sbapt        step->_partition = partition;
1182228060Sbapt
1183228060Sbapt        /* Now determine how the equivalence classes will be before this
1184228060Sbapt           step.  */
1185228060Sbapt        undetermined[chosen_c] = true;
1186228060Sbapt        partition = compute_partition (undetermined);
1187228060Sbapt
1188228060Sbapt        /* Now determine which other characters should be determined in this
1189228060Sbapt           step, because they will not change the equivalence classes at
1190228060Sbapt           this point.  It is the set of all c which, for all equivalence
1191228060Sbapt           classes, have the same frequency of occurrence in every keyword
1192228060Sbapt           of the equivalence class.  */
1193228060Sbapt        for (unsigned int c = 0; c < _alpha_size; c++)
1194228060Sbapt          if (_occurrences[c] > 0 && determined[c]
1195228060Sbapt              && unchanged_partition (partition, c))
1196228060Sbapt            {
1197228060Sbapt              undetermined[c] = true;
1198228060Sbapt              determined[c] = false;
1199228060Sbapt            }
1200228060Sbapt
1201228060Sbapt        /* main_c must be one of these.  */
1202228060Sbapt        if (determined[chosen_c])
1203228060Sbapt          abort ();
1204228060Sbapt
1205228060Sbapt        /* Now the set of changing characters of this step.  */
1206228060Sbapt        unsigned int changing_count;
1207228060Sbapt
1208228060Sbapt        changing_count = 0;
1209228060Sbapt        for (unsigned int c = 0; c < _alpha_size; c++)
1210228060Sbapt          if (undetermined[c] && !step->_undetermined[c])
1211228060Sbapt            changing_count++;
1212228060Sbapt
1213228060Sbapt        unsigned int *changing = new unsigned int[changing_count];
1214228060Sbapt        changing_count = 0;
1215228060Sbapt        for (unsigned int c = 0; c < _alpha_size; c++)
1216228060Sbapt          if (undetermined[c] && !step->_undetermined[c])
1217228060Sbapt            changing[changing_count++] = c;
1218228060Sbapt
1219228060Sbapt        step->_changing = changing;
1220228060Sbapt        step->_changing_count = changing_count;
1221228060Sbapt
1222228060Sbapt        step->_asso_value_max = _asso_value_max;
1223228060Sbapt
1224228060Sbapt        step->_expected_lower =
1225228060Sbapt          exp (static_cast<double>(chosen_possible_collisions)
1226228060Sbapt               / static_cast<double>(_max_hash_value));
1227228060Sbapt        step->_expected_upper =
1228228060Sbapt          exp (static_cast<double>(chosen_possible_collisions)
1229228060Sbapt               / static_cast<double>(_asso_value_max));
1230228060Sbapt
1231228060Sbapt        delete_partition (partition);
1232228060Sbapt
1233228060Sbapt        step->_next = steps;
1234228060Sbapt        steps = step;
1235228060Sbapt      }
1236228060Sbapt
1237228060Sbapt    delete[] determined;
1238228060Sbapt    delete[] undetermined;
1239228060Sbapt  }
1240228060Sbapt
1241228060Sbapt  if (option[DEBUG])
1242228060Sbapt    {
1243228060Sbapt      unsigned int stepno = 0;
1244228060Sbapt      for (Step *step = steps; step; step = step->_next)
1245228060Sbapt        {
1246228060Sbapt          stepno++;
1247228060Sbapt          fprintf (stderr, "Step %u chooses _asso_values[", stepno);
1248228060Sbapt          for (unsigned int i = 0; i < step->_changing_count; i++)
1249228060Sbapt            {
1250228060Sbapt              if (i > 0)
1251228060Sbapt                fprintf (stderr, ",");
1252228060Sbapt              fprintf (stderr, "'%c'", step->_changing[i]);
1253228060Sbapt            }
1254228060Sbapt          fprintf (stderr, "], expected number of iterations between %g and %g.\n",
1255228060Sbapt                   step->_expected_lower, step->_expected_upper);
1256228060Sbapt          fprintf (stderr, "Keyword equivalence classes:\n");
1257228060Sbapt          for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1258228060Sbapt            {
1259228060Sbapt              fprintf (stderr, "\n");
1260228060Sbapt              for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1261228060Sbapt                {
1262228060Sbapt                  KeywordExt *keyword = temp->first();
1263228060Sbapt                  fprintf (stderr, "  %.*s\n",
1264228060Sbapt                           keyword->_allchars_length, keyword->_allchars);
1265228060Sbapt                }
1266228060Sbapt            }
1267228060Sbapt          fprintf (stderr, "\n");
1268228060Sbapt        }
1269228060Sbapt    }
1270228060Sbapt
1271228060Sbapt  /* Initialize _asso_values[].  (The value given here matters only
1272228060Sbapt     for those c which occur in all keywords with equal multiplicity.)  */
1273228060Sbapt  for (unsigned int c = 0; c < _alpha_size; c++)
1274228060Sbapt    _asso_values[c] = 0;
1275228060Sbapt
1276228060Sbapt  unsigned int stepno = 0;
1277228060Sbapt  for (Step *step = steps; step; step = step->_next)
1278228060Sbapt    {
1279228060Sbapt      stepno++;
1280228060Sbapt
1281228060Sbapt      /* Initialize the asso_values[].  */
1282228060Sbapt      unsigned int k = step->_changing_count;
1283228060Sbapt      for (unsigned int i = 0; i < k; i++)
1284228060Sbapt        {
1285228060Sbapt          unsigned int c = step->_changing[i];
1286228060Sbapt          _asso_values[c] =
1287228060Sbapt            (_initial_asso_value < 0 ? rand () : _initial_asso_value)
1288228060Sbapt            & (step->_asso_value_max - 1);
1289228060Sbapt        }
1290228060Sbapt
1291228060Sbapt      unsigned int iterations = 0;
1292228060Sbapt      DYNAMIC_ARRAY (iter, unsigned int, k);
1293228060Sbapt      for (unsigned int i = 0; i < k; i++)
1294228060Sbapt        iter[i] = 0;
1295228060Sbapt      unsigned int ii = (_jump != 0 ? k - 1 : 0);
1296228060Sbapt
1297228060Sbapt      for (;;)
1298228060Sbapt        {
1299228060Sbapt          /* Test whether these asso_values[] lead to collisions among
1300228060Sbapt             the equivalence classes that should be collision-free.  */
1301228060Sbapt          bool has_collision = false;
1302228060Sbapt          for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1303228060Sbapt            {
1304228060Sbapt              /* Iteration Number array is a win, O(1) initialization time!  */
1305228060Sbapt              _collision_detector->clear ();
1306228060Sbapt
1307228060Sbapt              for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest())
1308228060Sbapt                {
1309228060Sbapt                  KeywordExt *keyword = ptr->first();
1310228060Sbapt
1311228060Sbapt                  /* Compute the new hash code for the keyword, leaving apart
1312228060Sbapt                     the yet undetermined asso_values[].  */
1313228060Sbapt                  int hashcode;
1314228060Sbapt                  {
1315228060Sbapt                    int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
1316228060Sbapt                    const unsigned int *p = keyword->_selchars;
1317228060Sbapt                    int i = keyword->_selchars_length;
1318228060Sbapt                    for (; i > 0; p++, i--)
1319228060Sbapt                      if (!step->_undetermined[*p])
1320228060Sbapt                        sum += _asso_values[*p];
1321228060Sbapt                    hashcode = sum;
1322228060Sbapt                  }
1323228060Sbapt
1324228060Sbapt                  /* See whether it collides with another keyword's hash code,
1325228060Sbapt                     from the same equivalence class.  */
1326228060Sbapt                  if (_collision_detector->set_bit (hashcode))
1327228060Sbapt                    {
1328228060Sbapt                      has_collision = true;
1329228060Sbapt                      break;
1330228060Sbapt                    }
1331228060Sbapt                }
1332228060Sbapt
1333228060Sbapt              /* Don't need to continue looking at the other equivalence
1334228060Sbapt                 classes if we already have found a collision.  */
1335228060Sbapt              if (has_collision)
1336228060Sbapt                break;
1337228060Sbapt            }
1338228060Sbapt
1339228060Sbapt          iterations++;
1340228060Sbapt          if (!has_collision)
1341228060Sbapt            break;
1342228060Sbapt
1343228060Sbapt          /* Try other asso_values[].  */
1344228060Sbapt          if (_jump != 0)
1345228060Sbapt            {
1346228060Sbapt              /* The way we try various values for
1347228060Sbapt                   asso_values[step->_changing[0],...step->_changing[k-1]]
1348228060Sbapt                 is like this:
1349228060Sbapt                 for (bound = 0,1,...)
1350228060Sbapt                   for (ii = 0,...,k-1)
1351228060Sbapt                     iter[ii] := bound
1352228060Sbapt                     iter[0..ii-1] := values <= bound
1353228060Sbapt                     iter[ii+1..k-1] := values < bound
1354228060Sbapt                 and
1355228060Sbapt                   asso_values[step->_changing[i]] =
1356228060Sbapt                     _initial_asso_value + iter[i] * _jump.
1357228060Sbapt                 This makes it more likely to find small asso_values[].
1358228060Sbapt               */
1359228060Sbapt              unsigned int bound = iter[ii];
1360228060Sbapt              unsigned int i = 0;
1361228060Sbapt              while (i < ii)
1362228060Sbapt                {
1363228060Sbapt                  unsigned int c = step->_changing[i];
1364228060Sbapt                  iter[i]++;
1365228060Sbapt                  _asso_values[c] =
1366228060Sbapt                    (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1367228060Sbapt                  if (iter[i] <= bound)
1368228060Sbapt                    goto found_next;
1369228060Sbapt                  _asso_values[c] =
1370228060Sbapt                    (_asso_values[c] - iter[i] * _jump)
1371228060Sbapt                    & (step->_asso_value_max - 1);
1372228060Sbapt                  iter[i] = 0;
1373228060Sbapt                  i++;
1374228060Sbapt                }
1375228060Sbapt              i = ii + 1;
1376228060Sbapt              while (i < k)
1377228060Sbapt                {
1378228060Sbapt                  unsigned int c = step->_changing[i];
1379228060Sbapt                  iter[i]++;
1380228060Sbapt                  _asso_values[c] =
1381228060Sbapt                    (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1382228060Sbapt                  if (iter[i] < bound)
1383228060Sbapt                    goto found_next;
1384228060Sbapt                  _asso_values[c] =
1385228060Sbapt                    (_asso_values[c] - iter[i] * _jump)
1386228060Sbapt                    & (step->_asso_value_max - 1);
1387228060Sbapt                  iter[i] = 0;
1388228060Sbapt                  i++;
1389228060Sbapt                }
1390228060Sbapt              /* Switch from one ii to the next.  */
1391228060Sbapt              {
1392228060Sbapt                unsigned int c = step->_changing[ii];
1393228060Sbapt                _asso_values[c] =
1394228060Sbapt                  (_asso_values[c] - bound * _jump)
1395228060Sbapt                  & (step->_asso_value_max - 1);
1396228060Sbapt                iter[ii] = 0;
1397228060Sbapt              }
1398228060Sbapt              /* Here all iter[i] == 0.  */
1399228060Sbapt              ii++;
1400228060Sbapt              if (ii == k)
1401228060Sbapt                {
1402228060Sbapt                  ii = 0;
1403228060Sbapt                  bound++;
1404228060Sbapt                  if (bound == step->_asso_value_max)
1405228060Sbapt                    {
1406228060Sbapt                      /* Out of search space!  We can either backtrack, or
1407228060Sbapt                         increase the available search space of this step.
1408228060Sbapt                         It seems simpler to choose the latter solution.  */
1409228060Sbapt                      step->_asso_value_max = 2 * step->_asso_value_max;
1410228060Sbapt                      if (step->_asso_value_max > _asso_value_max)
1411228060Sbapt                        {
1412228060Sbapt                          _asso_value_max = step->_asso_value_max;
1413228060Sbapt                          /* Reinitialize _max_hash_value.  */
1414228060Sbapt                          _max_hash_value =
1415228060Sbapt                            (option[NOLENGTH] ? 0 : _max_key_len)
1416228060Sbapt                            + (_asso_value_max - 1) * _max_selchars_length;
1417228060Sbapt                          /* Reinitialize _collision_detector.  */
1418228060Sbapt                          delete _collision_detector;
1419228060Sbapt                          _collision_detector =
1420228060Sbapt                            new Bool_Array (_max_hash_value + 1);
1421228060Sbapt                        }
1422228060Sbapt                    }
1423228060Sbapt                }
1424228060Sbapt              {
1425228060Sbapt                unsigned int c = step->_changing[ii];
1426228060Sbapt                iter[ii] = bound;
1427228060Sbapt                _asso_values[c] =
1428228060Sbapt                  (_asso_values[c] + bound * _jump)
1429228060Sbapt                  & (step->_asso_value_max - 1);
1430228060Sbapt              }
1431228060Sbapt             found_next: ;
1432228060Sbapt            }
1433228060Sbapt          else
1434228060Sbapt            {
1435228060Sbapt              /* Random.  */
1436228060Sbapt              unsigned int c = step->_changing[ii];
1437228060Sbapt              _asso_values[c] =
1438228060Sbapt                (_asso_values[c] + rand ()) & (step->_asso_value_max - 1);
1439228060Sbapt              /* Next time, change the next c.  */
1440228060Sbapt              ii++;
1441228060Sbapt              if (ii == k)
1442228060Sbapt                ii = 0;
1443228060Sbapt            }
1444228060Sbapt        }
1445228060Sbapt      FREE_DYNAMIC_ARRAY (iter);
1446228060Sbapt
1447228060Sbapt      if (option[DEBUG])
1448228060Sbapt        {
1449228060Sbapt          fprintf (stderr, "Step %u chose _asso_values[", stepno);
1450228060Sbapt          for (unsigned int i = 0; i < step->_changing_count; i++)
1451228060Sbapt            {
1452228060Sbapt              if (i > 0)
1453228060Sbapt                fprintf (stderr, ",");
1454228060Sbapt              fprintf (stderr, "'%c'", step->_changing[i]);
1455228060Sbapt            }
1456228060Sbapt          fprintf (stderr, "] in %u iterations.\n", iterations);
1457228060Sbapt        }
1458228060Sbapt    }
1459228060Sbapt
1460228060Sbapt  /* Free allocated memory.  */
1461228060Sbapt  while (steps != NULL)
1462228060Sbapt    {
1463228060Sbapt      Step *step = steps;
1464228060Sbapt      steps = step->_next;
1465228060Sbapt      delete[] step->_changing;
1466228060Sbapt      delete[] step->_undetermined;
1467228060Sbapt      delete_partition (step->_partition);
1468228060Sbapt      delete step;
1469228060Sbapt    }
1470228060Sbapt}
1471228060Sbapt
1472228060Sbapt/* Computes a keyword's hash value, relative to the current _asso_values[],
1473228060Sbapt   and stores it in keyword->_hash_value.  */
1474228060Sbapt
1475228060Sbaptinline int
1476228060SbaptSearch::compute_hash (KeywordExt *keyword) const
1477228060Sbapt{
1478228060Sbapt  int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
1479228060Sbapt
1480228060Sbapt  const unsigned int *p = keyword->_selchars;
1481228060Sbapt  int i = keyword->_selchars_length;
1482228060Sbapt  for (; i > 0; p++, i--)
1483228060Sbapt    sum += _asso_values[*p];
1484228060Sbapt
1485228060Sbapt  return keyword->_hash_value = sum;
1486228060Sbapt}
1487228060Sbapt
1488228060Sbapt/* Finds good _asso_values[].  */
1489228060Sbapt
1490228060Sbaptvoid
1491228060SbaptSearch::find_good_asso_values ()
1492228060Sbapt{
1493228060Sbapt  prepare_asso_values ();
1494228060Sbapt
1495228060Sbapt  /* Search for good _asso_values[].  */
1496228060Sbapt  int asso_iteration;
1497228060Sbapt  if ((asso_iteration = option.get_asso_iterations ()) == 0)
1498228060Sbapt    /* Try only the given _initial_asso_value and _jump.  */
1499228060Sbapt    find_asso_values ();
1500228060Sbapt  else
1501228060Sbapt    {
1502228060Sbapt      /* Try different pairs of _initial_asso_value and _jump, in the
1503228060Sbapt         following order:
1504228060Sbapt           (0, 1)
1505228060Sbapt           (1, 1)
1506228060Sbapt           (2, 1) (0, 3)
1507228060Sbapt           (3, 1) (1, 3)
1508228060Sbapt           (4, 1) (2, 3) (0, 5)
1509228060Sbapt           (5, 1) (3, 3) (1, 5)
1510228060Sbapt           ..... */
1511228060Sbapt      KeywordExt_List *saved_head = _head;
1512228060Sbapt      int best_initial_asso_value = 0;
1513228060Sbapt      int best_jump = 1;
1514228060Sbapt      int *best_asso_values = new int[_alpha_size];
1515228060Sbapt      int best_collisions = INT_MAX;
1516228060Sbapt      int best_max_hash_value = INT_MAX;
1517228060Sbapt
1518228060Sbapt      _initial_asso_value = 0; _jump = 1;
1519228060Sbapt      for (;;)
1520228060Sbapt        {
1521228060Sbapt          /* Restore the keyword list in its original order.  */
1522228060Sbapt          _head = copy_list (saved_head);
1523228060Sbapt          /* Find good _asso_values[].  */
1524228060Sbapt          find_asso_values ();
1525228060Sbapt          /* Test whether it is the best solution so far.  */
1526228060Sbapt          int collisions = 0;
1527228060Sbapt          int max_hash_value = INT_MIN;
1528228060Sbapt          _collision_detector->clear ();
1529228060Sbapt          for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1530228060Sbapt            {
1531228060Sbapt              KeywordExt *keyword = ptr->first();
1532228060Sbapt              int hashcode = compute_hash (keyword);
1533228060Sbapt              if (max_hash_value < hashcode)
1534228060Sbapt                max_hash_value = hashcode;
1535228060Sbapt              if (_collision_detector->set_bit (hashcode))
1536228060Sbapt                collisions++;
1537228060Sbapt            }
1538228060Sbapt          if (collisions < best_collisions
1539228060Sbapt              || (collisions == best_collisions
1540228060Sbapt                  && max_hash_value < best_max_hash_value))
1541228060Sbapt            {
1542228060Sbapt              memcpy (best_asso_values, _asso_values,
1543228060Sbapt                      _alpha_size * sizeof (_asso_values[0]));
1544228060Sbapt              best_collisions = collisions;
1545228060Sbapt              best_max_hash_value = max_hash_value;
1546228060Sbapt            }
1547228060Sbapt          /* Delete the copied keyword list.  */
1548228060Sbapt          delete_list (_head);
1549228060Sbapt
1550228060Sbapt          if (--asso_iteration == 0)
1551228060Sbapt            break;
1552228060Sbapt          /* Prepare for next iteration.  */
1553228060Sbapt          if (_initial_asso_value >= 2)
1554228060Sbapt            _initial_asso_value -= 2, _jump += 2;
1555228060Sbapt          else
1556228060Sbapt            _initial_asso_value += _jump, _jump = 1;
1557228060Sbapt        }
1558228060Sbapt      _head = saved_head;
1559228060Sbapt      /* Install the best found asso_values.  */
1560228060Sbapt      _initial_asso_value = best_initial_asso_value;
1561228060Sbapt      _jump = best_jump;
1562228060Sbapt      memcpy (_asso_values, best_asso_values,
1563228060Sbapt              _alpha_size * sizeof (_asso_values[0]));
1564228060Sbapt      delete[] best_asso_values;
1565228060Sbapt      /* The keywords' _hash_value fields are recomputed below.  */
1566228060Sbapt    }
1567228060Sbapt}
1568228060Sbapt
1569228060Sbapt/* ========================================================================= */
1570228060Sbapt
1571228060Sbapt/* Comparison function for sorting by increasing _hash_value.  */
1572228060Sbaptstatic bool
1573228060Sbaptless_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2)
1574228060Sbapt{
1575228060Sbapt  return keyword1->_hash_value < keyword2->_hash_value;
1576228060Sbapt}
1577228060Sbapt
1578228060Sbapt/* Sorts the keyword list by hash value.  */
1579228060Sbapt
1580228060Sbaptvoid
1581228060SbaptSearch::sort ()
1582228060Sbapt{
1583228060Sbapt  _head = mergesort_list (_head, less_by_hash_value);
1584228060Sbapt}
1585228060Sbapt
1586228060Sbaptvoid
1587228060SbaptSearch::optimize ()
1588228060Sbapt{
1589228060Sbapt  /* Preparations.  */
1590228060Sbapt  prepare ();
1591228060Sbapt
1592228060Sbapt  /* Step 1: Finding good byte positions.  */
1593228060Sbapt  find_positions ();
1594228060Sbapt
1595228060Sbapt  /* Step 2: Finding good alpha increments.  */
1596228060Sbapt  find_alpha_inc ();
1597228060Sbapt
1598228060Sbapt  /* Step 3: Finding good asso_values.  */
1599228060Sbapt  find_good_asso_values ();
1600228060Sbapt
1601228060Sbapt  /* Make one final check, just to make sure nothing weird happened.... */
1602228060Sbapt  _collision_detector->clear ();
1603228060Sbapt  for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
1604228060Sbapt    {
1605228060Sbapt      KeywordExt *curr = curr_ptr->first();
1606228060Sbapt      unsigned int hashcode = compute_hash (curr);
1607228060Sbapt      if (_collision_detector->set_bit (hashcode))
1608228060Sbapt        {
1609228060Sbapt          /* This shouldn't happen.  proj1, proj2, proj3 must have been
1610228060Sbapt             computed to be injective on the given keyword set.  */
1611228060Sbapt          fprintf (stderr,
1612228060Sbapt                   "\nInternal error, unexpected duplicate hash code\n");
1613228060Sbapt          if (option[POSITIONS])
1614228060Sbapt            fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
1615228060Sbapt          else
1616228060Sbapt            fprintf (stderr, "try options -m or -r.\n\n");
1617228060Sbapt          exit (1);
1618228060Sbapt        }
1619228060Sbapt    }
1620228060Sbapt
1621228060Sbapt  /* Sorts the keyword list by hash value.  */
1622228060Sbapt  sort ();
1623228060Sbapt
1624228060Sbapt  /* Set unused asso_values[c] to max_hash_value + 1.  This is not absolutely
1625228060Sbapt     necessary, but speeds up the lookup function in many cases of lookup
1626228060Sbapt     failure: no string comparison is needed once the hash value of a string
1627228060Sbapt     is larger than the hash value of any keyword.  */
1628228060Sbapt  int max_hash_value;
1629228060Sbapt  {
1630228060Sbapt    KeywordExt_List *temp;
1631228060Sbapt    for (temp = _head; temp->rest(); temp = temp->rest())
1632228060Sbapt      ;
1633228060Sbapt    max_hash_value = temp->first()->_hash_value;
1634228060Sbapt  }
1635228060Sbapt  for (unsigned int c = 0; c < _alpha_size; c++)
1636228060Sbapt    if (_occurrences[c] == 0)
1637228060Sbapt      _asso_values[c] = max_hash_value + 1;
1638228060Sbapt
1639228060Sbapt  /* Propagate unified asso_values.  */
1640228060Sbapt  if (_alpha_unify)
1641228060Sbapt    for (unsigned int c = 0; c < _alpha_size; c++)
1642228060Sbapt      if (_alpha_unify[c] != c)
1643228060Sbapt        _asso_values[c] = _asso_values[_alpha_unify[c]];
1644228060Sbapt}
1645228060Sbapt
1646228060Sbapt/* Prints out some diagnostics upon completion.  */
1647228060Sbapt
1648228060SbaptSearch::~Search ()
1649228060Sbapt{
1650228060Sbapt  delete _collision_detector;
1651228060Sbapt  if (option[DEBUG])
1652228060Sbapt    {
1653228060Sbapt      fprintf (stderr, "\ndumping occurrence and associated values tables\n");
1654228060Sbapt
1655228060Sbapt      for (unsigned int i = 0; i < _alpha_size; i++)
1656228060Sbapt        if (_occurrences[i])
1657228060Sbapt          fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n",
1658228060Sbapt                   i, _asso_values[i], i, _occurrences[i]);
1659228060Sbapt
1660228060Sbapt      fprintf (stderr, "end table dumping\n");
1661228060Sbapt
1662228060Sbapt      fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d"
1663228060Sbapt               "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n",
1664228060Sbapt               _list_len, _total_keys, _total_duplicates, _max_key_len);
1665228060Sbapt
1666228060Sbapt      int field_width = _max_selchars_length;
1667228060Sbapt      fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n",
1668228060Sbapt               field_width, "selchars");
1669228060Sbapt      for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1670228060Sbapt        {
1671228060Sbapt          fprintf (stderr, "%11d,%11d,%6d, ",
1672228060Sbapt                   ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index);
1673228060Sbapt          if (field_width > ptr->first()->_selchars_length)
1674228060Sbapt            fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, "");
1675228060Sbapt          for (int j = 0; j < ptr->first()->_selchars_length; j++)
1676228060Sbapt            putc (ptr->first()->_selchars[j], stderr);
1677228060Sbapt          fprintf (stderr, ", %.*s\n",
1678228060Sbapt                   ptr->first()->_allchars_length, ptr->first()->_allchars);
1679228060Sbapt        }
1680228060Sbapt
1681228060Sbapt      fprintf (stderr, "End dumping list.\n\n");
1682228060Sbapt    }
1683228060Sbapt  delete[] _asso_values;
1684228060Sbapt  delete[] _occurrences;
1685228060Sbapt  delete[] _alpha_unify;
1686228060Sbapt  delete[] _alpha_inc;
1687228060Sbapt}
1688