1/* Search algorithm.
2   Copyright (C) 1989-1998, 2000, 2002 Free Software Foundation, Inc.
3   Written by Douglas C. Schmidt <schmidt@ics.uci.edu>
4   and Bruno Haible <bruno@clisp.org>.
5
6   This file is part of GNU GPERF.
7
8   GNU GPERF is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2, or (at your option)
11   any later version.
12
13   GNU GPERF is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with this program; see the file COPYING.
20   If not, write to the Free Software Foundation, Inc.,
21   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
22
23/* Specification. */
24#include "search.h"
25
26#include <stdio.h>
27#include <stdlib.h> /* declares exit(), rand(), srand() */
28#include <string.h> /* declares memset(), memcmp() */
29#include <time.h> /* declares time() */
30#include <math.h> /* declares exp() */
31#include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */
32#include "options.h"
33#include "hash-table.h"
34#include "config.h"
35
36/* ============================== Portability ============================== */
37
38/* Assume ISO C++ 'for' scoping rule.  */
39/* This code is used to work around scoping issues with visual studio 6 from
40 * 1998.  Comment it out here to queisce numerous -Wdangling-else warnings
41 * from clang.
42#define for if (0) ; else for */
43
44/* Dynamically allocated array with dynamic extent:
45
46   Example:
47       DYNAMIC_ARRAY (my_array, int, n);
48       ...
49       FREE_DYNAMIC_ARRAY (my_array);
50
51   Attention: depending on your implementation my_array is either the array
52   itself or a pointer to the array! Always use my_array only as expression!
53 */
54#if HAVE_DYNAMIC_ARRAY
55  #define DYNAMIC_ARRAY(var,eltype,size) eltype var[size]
56  #define FREE_DYNAMIC_ARRAY(var)
57#else
58  #define DYNAMIC_ARRAY(var,eltype,size) eltype *var = new eltype[size]
59  #define FREE_DYNAMIC_ARRAY(var) delete[] var
60#endif
61
62/* ================================ Theory ================================= */
63
64/* The general form of the hash function is
65
66      hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
67                       + len (keyword)
68
69   where Pos is a set of byte positions,
70   each alpha_inc[i] is a nonnegative integer,
71   each asso_values[c] is a nonnegative integer,
72   len (keyword) is the keyword's length if !option[NOLENGTH], or 0 otherwise.
73
74   Theorem 1: If all keywords are different, there is a set Pos such that
75   all tuples (keyword[i] : i in Pos) are different.
76
77   Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
78   are nonnegative integers alpha_inc[i] such that all multisets
79   {keyword[i] + alpha_inc[i] : i in Pos} are different.
80
81   Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}.
82
83   Theorem 3: If all multisets selchars[keyword] are different, there are
84   nonnegative integers asso_values[c] such that all hash values
85   sum (asso_values[c] : c in selchars[keyword]) are different.
86
87   Based on these three facts, we find the hash function in three steps:
88
89   Step 1 (Finding good byte positions):
90   Find a set Pos, as small as possible, such that all tuples
91   (keyword[i] : i in Pos) are different.
92
93   Step 2 (Finding good alpha increments):
94   Find nonnegative integers alpha_inc[i], as many of them as possible being
95   zero, and the others being as small as possible, such that all multisets
96   {keyword[i] + alpha_inc[i] : i in Pos} are different.
97
98   Step 3 (Finding good asso_values):
99   Find asso_values[c] such that all hash (keyword) are different.
100
101   In other words, each step finds a projection that is injective on the
102   given finite set:
103     proj1 : String --> Map (Pos --> N)
104     proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos)
105     proj3 : Map (Pos --> N) / S(Pos) --> N
106   where
107     N denotes the set of nonnegative integers,
108     Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and
109     S(Pos) is the symmetric group over Pos.
110
111   This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight
112   modifications apply:
113     proj1 : String --> Map (Pos --> N) x N
114     proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N
115     proj3 : Map (Pos --> N) / S(Pos) x N --> N
116
117   For a case-insensitive hash function, the general form is
118
119      hash (keyword) =
120        sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos)
121        + len (keyword)
122
123   where alpha_unify[c] is chosen so that an upper/lower case change in
124   keyword[i] doesn't change  alpha_unify[keyword[i] + alpha_inc[i]].
125 */
126
127/* ==================== Initialization and Preparation ===================== */
128
129Search::Search (KeywordExt_List *list)
130  : _head (list)
131{
132}
133
134void
135Search::prepare ()
136{
137  /* Compute the total number of keywords.  */
138  _total_keys = 0;
139  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
140    _total_keys++;
141
142  /* Compute the minimum and maximum keyword length.  */
143  _max_key_len = INT_MIN;
144  _min_key_len = INT_MAX;
145  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
146    {
147      KeywordExt *keyword = temp->first();
148
149      if (_max_key_len < keyword->_allchars_length)
150        _max_key_len = keyword->_allchars_length;
151      if (_min_key_len > keyword->_allchars_length)
152        _min_key_len = keyword->_allchars_length;
153    }
154
155  /* Exit program if an empty string is used as keyword, since the comparison
156     expressions don't work correctly for looking up an empty string.  */
157  if (_min_key_len == 0)
158    {
159      fprintf (stderr, "Empty input keyword is not allowed.\n"
160               "To recognize an empty input keyword, your code should check for\n"
161               "len == 0 before calling the gperf generated lookup function.\n");
162      exit (1);
163    }
164
165  /* Exit program if the characters in the keywords are not in the required
166     range.  */
167  if (option[SEVENBIT])
168    for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
169      {
170        KeywordExt *keyword = temp->first();
171
172        const char *k = keyword->_allchars;
173        for (int i = keyword->_allchars_length; i > 0; k++, i--)
174          if (!(static_cast<unsigned char>(*k) < 128))
175            {
176              fprintf (stderr, "Option --seven-bit has been specified,\n"
177                       "but keyword \"%.*s\" contains non-ASCII characters.\n"
178                       "Try removing option --seven-bit.\n",
179                       keyword->_allchars_length, keyword->_allchars);
180              exit (1);
181            }
182      }
183}
184
185/* ====================== Finding good byte positions ====================== */
186
187/* Computes the upper bound on the indices passed to asso_values[],
188   assuming no alpha_increments.  */
189unsigned int
190Search::compute_alpha_size () const
191{
192  return (option[SEVENBIT] ? 128 : 256);
193}
194
195/* Computes the unification rules between different asso_values[c],
196   assuming no alpha_increments.  */
197unsigned int *
198Search::compute_alpha_unify () const
199{
200  if (option[UPPERLOWER])
201    {
202      /* Uppercase to lowercase mapping.  */
203      unsigned int alpha_size = compute_alpha_size();
204      unsigned int *alpha_unify = new unsigned int[alpha_size];
205      for (unsigned int c = 0; c < alpha_size; c++)
206        alpha_unify[c] = c;
207      for (unsigned int c = 'A'; c <= 'Z'; c++)
208        alpha_unify[c] = c + ('a'-'A');
209      return alpha_unify;
210    }
211  else
212    /* Identity mapping.  */
213    return NULL;
214}
215
216/* Initializes each keyword's _selchars array.  */
217void
218Search::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const
219{
220  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
221    temp->first()->init_selchars_tuple(positions, alpha_unify);
222}
223
224/* Deletes each keyword's _selchars array.  */
225void
226Search::delete_selchars () const
227{
228  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
229    temp->first()->delete_selchars();
230}
231
232/* Count the duplicate keywords that occur with a given set of positions.
233   In other words, it returns the difference
234     # K - # proj1 (K)
235   where K is the multiset of given keywords.  */
236unsigned int
237Search::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const
238{
239  /* Run through the keyword list and count the duplicates incrementally.
240     The result does not depend on the order of the keyword list, thanks to
241     the formula above.  */
242  init_selchars_tuple (positions, alpha_unify);
243
244  unsigned int count = 0;
245  {
246    Hash_Table representatives (_total_keys, option[NOLENGTH]);
247    for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
248      {
249        KeywordExt *keyword = temp->first();
250        if (representatives.insert (keyword))
251          count++;
252      }
253  }
254
255  delete_selchars ();
256
257  return count;
258}
259
260/* Find good key positions.  */
261
262void
263Search::find_positions ()
264{
265  /* If the user gave the key positions, we use them.  */
266  if (option[POSITIONS])
267    {
268      _key_positions = option.get_key_positions();
269      return;
270    }
271
272  /* Compute preliminary alpha_unify table.  */
273  unsigned int *alpha_unify = compute_alpha_unify ();
274
275  /* 1. Find positions that must occur in order to distinguish duplicates.  */
276  Positions mandatory;
277
278  if (!option[DUP])
279    {
280      for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest())
281        {
282          KeywordExt *keyword1 = l1->first();
283          for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest())
284            {
285              KeywordExt *keyword2 = l2->first();
286
287              /* If keyword1 and keyword2 have the same length and differ
288                 in just one position, and it is not the last character,
289                 this position is mandatory.  */
290              if (keyword1->_allchars_length == keyword2->_allchars_length)
291                {
292                  int n = keyword1->_allchars_length;
293                  int i;
294                  for (i = 0; i < n - 1; i++)
295                    {
296                      unsigned char c1 = keyword1->_allchars[i];
297                      unsigned char c2 = keyword2->_allchars[i];
298                      if (option[UPPERLOWER])
299                        {
300                          if (c1 >= 'A' && c1 <= 'Z')
301                            c1 += 'a' - 'A';
302                          if (c2 >= 'A' && c2 <= 'Z')
303                            c2 += 'a' - 'A';
304                        }
305                      if (c1 != c2)
306                        break;
307                    }
308                  if (i < n - 1)
309                    {
310                      int j;
311                      for (j = i + 1; j < n; j++)
312                        {
313                          unsigned char c1 = keyword1->_allchars[j];
314                          unsigned char c2 = keyword2->_allchars[j];
315                          if (option[UPPERLOWER])
316                            {
317                              if (c1 >= 'A' && c1 <= 'Z')
318                                c1 += 'a' - 'A';
319                              if (c2 >= 'A' && c2 <= 'Z')
320                                c2 += 'a' - 'A';
321                            }
322                          if (c1 != c2)
323                            break;
324                        }
325                      if (j >= n)
326                        {
327                          /* Position i is mandatory.  */
328                          if (!mandatory.contains (i))
329                            mandatory.add (i);
330                        }
331                    }
332                }
333            }
334        }
335    }
336
337  /* 2. Add positions, as long as this decreases the duplicates count.  */
338  int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1
339              ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1);
340  Positions current = mandatory;
341  unsigned int current_duplicates_count =
342    count_duplicates_tuple (current, alpha_unify);
343  for (;;)
344    {
345      Positions best;
346      unsigned int best_duplicates_count = UINT_MAX;
347
348      for (int i = imax; i >= -1; i--)
349        if (!current.contains (i))
350          {
351            Positions tryal = current;
352            tryal.add (i);
353            unsigned int try_duplicates_count =
354              count_duplicates_tuple (tryal, alpha_unify);
355
356            /* We prefer 'try' to 'best' if it produces less duplicates,
357               or if it produces the same number of duplicates but with
358               a more efficient hash function.  */
359            if (try_duplicates_count < best_duplicates_count
360                || (try_duplicates_count == best_duplicates_count && i >= 0))
361              {
362                best = tryal;
363                best_duplicates_count = try_duplicates_count;
364              }
365          }
366
367      /* Stop adding positions when it gives no improvement.  */
368      if (best_duplicates_count >= current_duplicates_count)
369        break;
370
371      current = best;
372      current_duplicates_count = best_duplicates_count;
373    }
374
375  /* 3. Remove positions, as long as this doesn't increase the duplicates
376     count.  */
377  for (;;)
378    {
379      Positions best;
380      unsigned int best_duplicates_count = UINT_MAX;
381
382      for (int i = imax; i >= -1; i--)
383        if (current.contains (i) && !mandatory.contains (i))
384          {
385            Positions tryal = current;
386            tryal.remove (i);
387            unsigned int try_duplicates_count =
388              count_duplicates_tuple (tryal, alpha_unify);
389
390            /* We prefer 'try' to 'best' if it produces less duplicates,
391               or if it produces the same number of duplicates but with
392               a more efficient hash function.  */
393            if (try_duplicates_count < best_duplicates_count
394                || (try_duplicates_count == best_duplicates_count && i == -1))
395              {
396                best = tryal;
397                best_duplicates_count = try_duplicates_count;
398              }
399          }
400
401      /* Stop removing positions when it gives no improvement.  */
402      if (best_duplicates_count > current_duplicates_count)
403        break;
404
405      current = best;
406      current_duplicates_count = best_duplicates_count;
407    }
408
409  /* 4. Replace two positions by one, as long as this doesn't increase the
410     duplicates count.  */
411  for (;;)
412    {
413      Positions best;
414      unsigned int best_duplicates_count = UINT_MAX;
415
416      for (int i1 = imax; i1 >= -1; i1--)
417        if (current.contains (i1) && !mandatory.contains (i1))
418          for (int i2 = imax; i2 >= -1; i2--)
419            if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1)
420              for (int i3 = imax; i3 >= 0; i3--)
421                if (!current.contains (i3))
422                  {
423                    Positions tryal = current;
424                    tryal.remove (i1);
425                    tryal.remove (i2);
426                    tryal.add (i3);
427                    unsigned int try_duplicates_count =
428                      count_duplicates_tuple (tryal, alpha_unify);
429
430                    /* We prefer 'try' to 'best' if it produces less duplicates,
431                       or if it produces the same number of duplicates but with
432                       a more efficient hash function.  */
433                    if (try_duplicates_count < best_duplicates_count
434                        || (try_duplicates_count == best_duplicates_count
435                            && (i1 == -1 || i2 == -1 || i3 >= 0)))
436                      {
437                        best = tryal;
438                        best_duplicates_count = try_duplicates_count;
439                      }
440                  }
441
442      /* Stop removing positions when it gives no improvement.  */
443      if (best_duplicates_count > current_duplicates_count)
444        break;
445
446      current = best;
447      current_duplicates_count = best_duplicates_count;
448    }
449
450  /* That's it.  Hope it's good enough.  */
451  _key_positions = current;
452
453  if (option[DEBUG])
454    {
455      /* Print the result.  */
456      fprintf (stderr, "\nComputed positions: ");
457      PositionReverseIterator iter = _key_positions.reviterator();
458      bool seen_lastchar = false;
459      bool first = true;
460      for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; )
461        {
462          if (!first)
463            fprintf (stderr, ", ");
464          if (i == Positions::LASTCHAR)
465            seen_lastchar = true;
466          else
467            {
468              fprintf (stderr, "%d", i + 1);
469              first = false;
470            }
471        }
472      if (seen_lastchar)
473        {
474          if (!first)
475            fprintf (stderr, ", ");
476          fprintf (stderr, "$");
477        }
478      fprintf (stderr, "\n");
479    }
480
481  /* Free preliminary alpha_unify table.  */
482  delete[] alpha_unify;
483}
484
485/* Count the duplicate keywords that occur with the found set of positions.
486   In other words, it returns the difference
487     # K - # proj1 (K)
488   where K is the multiset of given keywords.  */
489unsigned int
490Search::count_duplicates_tuple () const
491{
492  unsigned int *alpha_unify = compute_alpha_unify ();
493  unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify);
494  delete[] alpha_unify;
495  return count;
496}
497
498/* ===================== Finding good alpha increments ===================== */
499
500/* Computes the upper bound on the indices passed to asso_values[].  */
501unsigned int
502Search::compute_alpha_size (const unsigned int *alpha_inc) const
503{
504  unsigned int max_alpha_inc = 0;
505  for (int i = 0; i < _max_key_len; i++)
506    if (max_alpha_inc < alpha_inc[i])
507      max_alpha_inc = alpha_inc[i];
508  return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
509}
510
511/* Computes the unification rules between different asso_values[c].  */
512unsigned int *
513Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
514{
515  if (option[UPPERLOWER])
516    {
517      /* Without alpha increments, we would simply unify
518           'A' -> 'a', ..., 'Z' -> 'z'.
519         But when a keyword contains at position i a character c,
520         we have the constraint
521            asso_values[tolower(c) + alpha_inc[i]] ==
522            asso_values[toupper(c) + alpha_inc[i]].
523         This introduces a unification
524           toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
525         Note that this unification can extend outside the range of
526         ASCII letters!  But still every unified character pair is at
527         a distance of 'a'-'A' = 32, or (after chained unification)
528         at a multiple of 32.  So in the end the alpha_unify vector has
529         the form    c -> c + 32 * f(c)   where f(c) is a nonnegative
530         integer.  */
531      unsigned int alpha_size = compute_alpha_size (alpha_inc);
532
533      unsigned int *alpha_unify = new unsigned int[alpha_size];
534      for (unsigned int c = 0; c < alpha_size; c++)
535        alpha_unify[c] = c;
536
537      for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
538        {
539          KeywordExt *keyword = temp->first();
540
541          /* Iterate through the selected character positions.  */
542          PositionIterator iter = positions.iterator(keyword->_allchars_length);
543
544          for (int i; (i = iter.next ()) != PositionIterator::EOS; )
545            {
546              unsigned int c;
547              if (i == Positions::LASTCHAR)
548                c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
549              else if (i < keyword->_allchars_length)
550                c = static_cast<unsigned char>(keyword->_allchars[i]);
551              else
552                abort ();
553              if (c >= 'A' && c <= 'Z')
554                c += 'a' - 'A';
555              if (c >= 'a' && c <= 'z')
556                {
557                  if (i != Positions::LASTCHAR)
558                    c += alpha_inc[i];
559                  /* Unify c with c - ('a'-'A').  */
560                  unsigned int d = alpha_unify[c];
561                  unsigned int b = c - ('a'-'A');
562                  for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
563                    alpha_unify[a] = d;
564                }
565            }
566        }
567      return alpha_unify;
568    }
569  else
570    /* Identity mapping.  */
571    return NULL;
572}
573
574/* Initializes each keyword's _selchars array.  */
575void
576Search::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const
577{
578  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
579    temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc);
580}
581
582/* Count the duplicate keywords that occur with the given set of positions
583   and a given alpha_inc[] array.
584   In other words, it returns the difference
585     # K - # proj2 (proj1 (K))
586   where K is the multiset of given keywords.  */
587unsigned int
588Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
589{
590  /* Run through the keyword list and count the duplicates incrementally.
591     The result does not depend on the order of the keyword list, thanks to
592     the formula above.  */
593  unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc);
594  init_selchars_multiset (_key_positions, alpha_unify, alpha_inc);
595
596  unsigned int count = 0;
597  {
598    Hash_Table representatives (_total_keys, option[NOLENGTH]);
599    for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
600      {
601        KeywordExt *keyword = temp->first();
602        if (representatives.insert (keyword))
603          count++;
604      }
605  }
606
607  delete_selchars ();
608  delete[] alpha_unify;
609
610  return count;
611}
612
613/* Find good _alpha_inc[].  */
614
615void
616Search::find_alpha_inc ()
617{
618  /* The goal is to choose _alpha_inc[] such that it doesn't introduce
619     artificial duplicates.
620     In other words, the goal is  # proj2 (proj1 (K)) = # proj1 (K).  */
621  unsigned int duplicates_goal = count_duplicates_tuple ();
622
623  /* Start with zero increments.  This is sufficient in most cases.  */
624  unsigned int *current = new unsigned int [_max_key_len];
625  for (int i = 0; i < _max_key_len; i++)
626    current[i] = 0;
627  unsigned int current_duplicates_count = count_duplicates_multiset (current);
628
629  if (current_duplicates_count > duplicates_goal)
630    {
631      /* Look which _alpha_inc[i] we are free to increment.  */
632      unsigned int nindices;
633      {
634        nindices = 0;
635        PositionIterator iter = _key_positions.iterator(_max_key_len);
636        for (;;)
637          {
638            int key_pos = iter.next ();
639            if (key_pos == PositionIterator::EOS)
640              break;
641            if (key_pos != Positions::LASTCHAR)
642              nindices++;
643          }
644      }
645
646      DYNAMIC_ARRAY (indices, unsigned int, nindices);
647      {
648        unsigned int j = 0;
649        PositionIterator iter = _key_positions.iterator(_max_key_len);
650        for (;;)
651          {
652            int key_pos = iter.next ();
653            if (key_pos == PositionIterator::EOS)
654              break;
655            if (key_pos != Positions::LASTCHAR)
656              indices[j++] = key_pos;
657          }
658        if (!(j == nindices))
659          abort ();
660      }
661
662      /* Perform several rounds of searching for a good alpha increment.
663         Each round reduces the number of artificial collisions by adding
664         an increment in a single key position.  */
665      DYNAMIC_ARRAY (best, unsigned int, _max_key_len);
666      DYNAMIC_ARRAY (tryal, unsigned int, _max_key_len);
667      do
668        {
669          /* An increment of 1 is not always enough.  Try higher increments
670             also.  */
671          for (unsigned int inc = 1; ; inc++)
672            {
673              unsigned int best_duplicates_count = UINT_MAX;
674
675              for (unsigned int j = 0; j < nindices; j++)
676                {
677                  memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
678                  tryal[indices[j]] += inc;
679                  unsigned int try_duplicates_count =
680                    count_duplicates_multiset (tryal);
681
682                  /* We prefer 'try' to 'best' if it produces less
683                     duplicates.  */
684                  if (try_duplicates_count < best_duplicates_count)
685                    {
686                      memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
687                      best_duplicates_count = try_duplicates_count;
688                    }
689                }
690
691              /* Stop this round when we got an improvement.  */
692              if (best_duplicates_count < current_duplicates_count)
693                {
694                  memcpy (current, best, _max_key_len * sizeof (unsigned int));
695                  current_duplicates_count = best_duplicates_count;
696                  break;
697                }
698            }
699        }
700      while (current_duplicates_count > duplicates_goal);
701      FREE_DYNAMIC_ARRAY (tryal);
702      FREE_DYNAMIC_ARRAY (best);
703
704      if (option[DEBUG])
705        {
706          /* Print the result.  */
707          fprintf (stderr, "\nComputed alpha increments: ");
708          bool first = true;
709          for (unsigned int j = nindices; j-- > 0; )
710            if (current[indices[j]] != 0)
711              {
712                if (!first)
713                  fprintf (stderr, ", ");
714                fprintf (stderr, "%u:+%u",
715                         indices[j] + 1, current[indices[j]]);
716                first = false;
717              }
718          fprintf (stderr, "\n");
719        }
720      FREE_DYNAMIC_ARRAY (indices);
721    }
722
723  _alpha_inc = current;
724  _alpha_size = compute_alpha_size (_alpha_inc);
725  _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
726}
727
728/* ======================= Finding good asso_values ======================== */
729
730/* Initializes the asso_values[] related parameters.  */
731
732void
733Search::prepare_asso_values ()
734{
735  KeywordExt_List *temp;
736
737  /* Initialize each keyword's _selchars array.  */
738  init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc);
739
740  /* Compute the maximum _selchars_length over all keywords.  */
741  _max_selchars_length = _key_positions.iterator(_max_key_len).remaining();
742
743  /* Check for duplicates, i.e. keywords with the same _selchars array
744     (and - if !option[NOLENGTH] - also the same length).
745     We deal with these by building an equivalence class, so that only
746     1 keyword is representative of the entire collection.  Only this
747     representative remains in the keyword list; the others are accessible
748     through the _duplicate_link chain, starting at the representative.
749     This *greatly* simplifies processing during later stages of the program.
750     Set _total_duplicates and _list_len = _total_keys - _total_duplicates.  */
751  {
752    _list_len = _total_keys;
753    _total_duplicates = 0;
754    /* Make hash table for efficiency.  */
755    Hash_Table representatives (_list_len, option[NOLENGTH]);
756
757    KeywordExt_List *prev = NULL; /* list node before temp */
758    for (temp = _head; temp; )
759      {
760        KeywordExt *keyword = temp->first();
761        KeywordExt *other_keyword = representatives.insert (keyword);
762        KeywordExt_List *garbage = NULL;
763
764        if (other_keyword)
765          {
766            _total_duplicates++;
767            _list_len--;
768            /* Remove keyword from the main list.  */
769            prev->rest() = temp->rest();
770            garbage = temp;
771            /* And insert it on other_keyword's duplicate list.  */
772            keyword->_duplicate_link = other_keyword->_duplicate_link;
773            other_keyword->_duplicate_link = keyword;
774
775            /* Complain if user hasn't enabled the duplicate option. */
776            if (!option[DUP] || option[DEBUG])
777              {
778                fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"",
779                         keyword->_allchars_length, keyword->_allchars,
780                         other_keyword->_allchars_length, other_keyword->_allchars);
781                for (int j = 0; j < keyword->_selchars_length; j++)
782                  putc (keyword->_selchars[j], stderr);
783                fprintf (stderr, "\".\n");
784              }
785          }
786        else
787          {
788            keyword->_duplicate_link = NULL;
789            prev = temp;
790          }
791        temp = temp->rest();
792        if (garbage)
793          delete garbage;
794      }
795    if (option[DEBUG])
796      representatives.dump();
797  }
798
799  /* Exit program if duplicates exists and option[DUP] not set, since we
800     don't want to continue in this case.  (We don't want to turn on
801     option[DUP] implicitly, because the generated code is usually much
802     slower.  */
803  if (_total_duplicates)
804    {
805      if (option[DUP])
806        fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n",
807                         _total_duplicates);
808      else
809        {
810          fprintf (stderr, "%d input keys have identical hash values,\n",
811                           _total_duplicates);
812          if (option[POSITIONS])
813            fprintf (stderr, "try different key positions or use option -D.\n");
814          else
815            fprintf (stderr, "use option -D.\n");
816          exit (1);
817        }
818    }
819
820  /* Compute the occurrences of each character in the alphabet.  */
821  _occurrences = new int[_alpha_size];
822  memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
823  for (temp = _head; temp; temp = temp->rest())
824    {
825      KeywordExt *keyword = temp->first();
826      const unsigned int *ptr = keyword->_selchars;
827      for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
828        _occurrences[*ptr]++;
829    }
830
831  /* Memory allocation.  */
832  _asso_values = new int[_alpha_size];
833
834  int non_linked_length = _list_len;
835  unsigned int asso_value_max;
836
837  asso_value_max =
838    static_cast<unsigned int>(non_linked_length * option.get_size_multiple());
839  /* Round up to the next power of two.  This makes it easy to ensure
840     an _asso_value[c] is >= 0 and < asso_value_max.  Also, the jump value
841     being odd, it guarantees that Search::try_asso_value() will iterate
842     through different values for _asso_value[c].  */
843  if (asso_value_max == 0)
844    asso_value_max = 1;
845  asso_value_max |= asso_value_max >> 1;
846  asso_value_max |= asso_value_max >> 2;
847  asso_value_max |= asso_value_max >> 4;
848  asso_value_max |= asso_value_max >> 8;
849  asso_value_max |= asso_value_max >> 16;
850  asso_value_max++;
851  _asso_value_max = asso_value_max;
852
853  /* Given the bound for _asso_values[c], we have a bound for the possible
854     hash values, as computed in compute_hash().  */
855  _max_hash_value = (option[NOLENGTH] ? 0 : _max_key_len)
856                    + (_asso_value_max - 1) * _max_selchars_length;
857  /* Allocate a sparse bit vector for detection of collisions of hash
858     values.  */
859  _collision_detector = new Bool_Array (_max_hash_value + 1);
860
861  if (option[DEBUG])
862    {
863      fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
864               "\nmaximum size of generated hash table is %d\n",
865               non_linked_length, asso_value_max, _max_hash_value);
866
867      int field_width;
868
869      field_width = 0;
870      {
871        for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
872          {
873            KeywordExt *keyword = temp->first();
874            if (field_width < keyword->_selchars_length)
875              field_width = keyword->_selchars_length;
876          }
877      }
878
879      fprintf (stderr, "\ndumping the keyword list without duplicates\n");
880      fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
881      int i = 0;
882      for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
883        {
884          KeywordExt *keyword = temp->first();
885          fprintf (stderr, "%9d, ", ++i);
886          if (field_width > keyword->_selchars_length)
887            fprintf (stderr, "%*s", field_width - keyword->_selchars_length, "");
888          for (int j = 0; j < keyword->_selchars_length; j++)
889            putc (keyword->_selchars[j], stderr);
890          fprintf (stderr, ", %.*s\n",
891                   keyword->_allchars_length, keyword->_allchars);
892        }
893      fprintf (stderr, "\nend of keyword list\n\n");
894    }
895
896  if (option[RANDOM] || option.get_jump () == 0)
897    /* We will use rand(), so initialize the random number generator.  */
898    srand (static_cast<long>(time (0)));
899
900  _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ());
901  _jump = option.get_jump ();
902}
903
904/* Finds some _asso_values[] that fit.  */
905
906/* The idea is to choose the _asso_values[] one by one, in a way that
907   a choice that has been made never needs to be undone later.  This
908   means that we split the work into several steps.  Each step chooses
909   one or more _asso_values[c].  The result of choosing one or more
910   _asso_values[c] is that the partitioning of the keyword set gets
911   broader.
912   Look at this partitioning:  After every step, the _asso_values[] of a
913   certain set C of characters are undetermined.  (At the beginning, C
914   is the set of characters c with _occurrences[c] > 0.  At the end, C
915   is empty.)  To each keyword K, we associate the multiset of _selchars
916   for which the _asso_values[] are undetermined:
917                    K  -->  K->_selchars intersect C.
918   Consider two keywords equivalent if their value under this mapping is
919   the same.  This introduces an equivalence relation on the set of
920   keywords.  The equivalence classes partition the keyword set.  (At the
921   beginning, the partition is the finest possible: each K is an equivalence
922   class by itself, because all K have a different _selchars.  At the end,
923   all K have been merged into a single equivalence class.)
924   The partition before a step is always a refinement of the partition
925   after the step.
926   We choose the steps in such a way that the partition really becomes
927   broader at each step.  (A step that only chooses an _asso_values[c]
928   without changing the partition is better merged with the previous step,
929   to avoid useless backtracking.)  */
930
931struct EquivalenceClass
932{
933  /* The keywords in this equivalence class.  */
934  KeywordExt_List *     _keywords;
935  KeywordExt_List *     _keywords_last;
936  /* The number of keywords in this equivalence class.  */
937  unsigned int          _cardinality;
938  /* The undetermined selected characters for the keywords in this
939     equivalence class, as a canonically reordered multiset.  */
940  unsigned int *        _undetermined_chars;
941  unsigned int          _undetermined_chars_length;
942
943  EquivalenceClass *    _next;
944};
945
946struct Step
947{
948  /* The characters whose values are being determined in this step.  */
949  unsigned int          _changing_count;
950  unsigned int *        _changing;
951  /* Exclusive upper bound for the _asso_values[c] of this step.
952     A power of 2.  */
953  unsigned int          _asso_value_max;
954  /* The characters whose values will be determined after this step.  */
955  bool *                _undetermined;
956  /* The keyword set partition after this step.  */
957  EquivalenceClass *    _partition;
958  /* The expected number of iterations in this step.  */
959  double                _expected_lower;
960  double                _expected_upper;
961
962  Step *                _next;
963};
964
965static inline bool
966equals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len)
967{
968  while (len > 0)
969    {
970      if (*ptr1 != *ptr2)
971        return false;
972      ptr1++;
973      ptr2++;
974      len--;
975    }
976  return true;
977}
978
979EquivalenceClass *
980Search::compute_partition (bool *undetermined) const
981{
982  EquivalenceClass *partition = NULL;
983  EquivalenceClass *partition_last = NULL;
984  for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
985    {
986      KeywordExt *keyword = temp->first();
987
988      /* Compute the undetermined characters for this keyword.  */
989      unsigned int *undetermined_chars =
990        new unsigned int[keyword->_selchars_length];
991      unsigned int undetermined_chars_length = 0;
992
993      for (int i = 0; i < keyword->_selchars_length; i++)
994        if (undetermined[keyword->_selchars[i]])
995          undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i];
996
997      /* Look up the equivalence class to which this keyword belongs.  */
998      EquivalenceClass *equclass;
999      for (equclass = partition; equclass; equclass = equclass->_next)
1000        if (equclass->_undetermined_chars_length == undetermined_chars_length
1001            && equals (equclass->_undetermined_chars, undetermined_chars,
1002                       undetermined_chars_length))
1003          break;
1004      if (equclass == NULL)
1005        {
1006          equclass = new EquivalenceClass();
1007          equclass->_keywords = NULL;
1008          equclass->_keywords_last = NULL;
1009          equclass->_cardinality = 0;
1010          equclass->_undetermined_chars = undetermined_chars;
1011          equclass->_undetermined_chars_length = undetermined_chars_length;
1012          equclass->_next = NULL;
1013          if (partition)
1014            partition_last->_next = equclass;
1015          else
1016            partition = equclass;
1017          partition_last = equclass;
1018        }
1019      else
1020        delete[] undetermined_chars;
1021
1022      /* Add the keyword to the equivalence class.  */
1023      KeywordExt_List *cons = new KeywordExt_List(keyword);
1024      if (equclass->_keywords)
1025        equclass->_keywords_last->rest() = cons;
1026      else
1027        equclass->_keywords = cons;
1028      equclass->_keywords_last = cons;
1029      equclass->_cardinality++;
1030    }
1031
1032  /* Free some of the allocated memory.  The caller doesn't need it.  */
1033  for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1034    delete[] cls->_undetermined_chars;
1035
1036  return partition;
1037}
1038
1039static void
1040delete_partition (EquivalenceClass *partition)
1041{
1042  while (partition != NULL)
1043    {
1044      EquivalenceClass *equclass = partition;
1045      partition = equclass->_next;
1046      delete_list (equclass->_keywords);
1047      //delete[] equclass->_undetermined_chars; // already freed above
1048      delete equclass;
1049    }
1050}
1051
1052/* Compute the possible number of collisions when _asso_values[c] is
1053   chosen, leading to the given partition.  */
1054unsigned int
1055Search::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const
1056{
1057  /* Every equivalence class p is split according to the frequency of
1058     occurrence of c, leading to equivalence classes p1, p2, ...
1059     This leads to   (|p|^2 - |p1|^2 - |p2|^2 - ...)/2  possible collisions.
1060     Return the sum of this expression over all equivalence classes.  */
1061  unsigned int sum = 0;
1062  unsigned int m = _max_selchars_length;
1063  DYNAMIC_ARRAY (split_cardinalities, unsigned int, m + 1);
1064  for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1065    {
1066      for (unsigned int i = 0; i <= m; i++)
1067        split_cardinalities[i] = 0;
1068
1069      for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1070        {
1071          KeywordExt *keyword = temp->first();
1072
1073          unsigned int count = 0;
1074          for (int i = 0; i < keyword->_selchars_length; i++)
1075            if (keyword->_selchars[i] == c)
1076              count++;
1077
1078          split_cardinalities[count]++;
1079        }
1080
1081      sum += cls->_cardinality * cls->_cardinality;
1082      for (unsigned int i = 0; i <= m; i++)
1083        sum -= split_cardinalities[i] * split_cardinalities[i];
1084    }
1085  FREE_DYNAMIC_ARRAY (split_cardinalities);
1086  return sum;
1087}
1088
1089/* Test whether adding c to the undetermined characters changes the given
1090   partition.  */
1091bool
1092Search::unchanged_partition (EquivalenceClass *partition, unsigned int c) const
1093{
1094  for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1095    {
1096      unsigned int first_count = UINT_MAX;
1097
1098      for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1099        {
1100          KeywordExt *keyword = temp->first();
1101
1102          unsigned int count = 0;
1103          for (int i = 0; i < keyword->_selchars_length; i++)
1104            if (keyword->_selchars[i] == c)
1105              count++;
1106
1107          if (temp == cls->_keywords)
1108            first_count = count;
1109          else if (count != first_count)
1110            /* c would split this equivalence class.  */
1111            return false;
1112        }
1113    }
1114  return true;
1115}
1116
1117void
1118Search::find_asso_values ()
1119{
1120  Step *steps;
1121
1122  /* Determine the steps, starting with the last one.  */
1123  {
1124    bool *undetermined;
1125    bool *determined;
1126
1127    steps = NULL;
1128
1129    undetermined = new bool[_alpha_size];
1130    for (unsigned int c = 0; c < _alpha_size; c++)
1131      undetermined[c] = false;
1132
1133    determined = new bool[_alpha_size];
1134    for (unsigned int c = 0; c < _alpha_size; c++)
1135      determined[c] = true;
1136
1137    for (;;)
1138      {
1139        /* Compute the partition that needs to be refined.  */
1140        EquivalenceClass *partition = compute_partition (undetermined);
1141
1142        /* Determine the main character to be chosen in this step.
1143           Choosing such a character c has the effect of splitting every
1144           equivalence class (according the the frequency of occurrence of c).
1145           We choose the c with the minimum number of possible collisions,
1146           so that characters which lead to a large number of collisions get
1147           handled early during the search.  */
1148        unsigned int chosen_c;
1149        unsigned int chosen_possible_collisions;
1150        {
1151          unsigned int best_c = 0;
1152          unsigned int best_possible_collisions = UINT_MAX;
1153          for (unsigned int c = 0; c < _alpha_size; c++)
1154            if (_occurrences[c] > 0 && determined[c])
1155              {
1156                unsigned int possible_collisions =
1157                  count_possible_collisions (partition, c);
1158                if (possible_collisions < best_possible_collisions)
1159                  {
1160                    best_c = c;
1161                    best_possible_collisions = possible_collisions;
1162                  }
1163              }
1164          if (best_possible_collisions == UINT_MAX)
1165            {
1166              /* All c with _occurrences[c] > 0 are undetermined.  We are
1167                 are the starting situation and don't need any more step.  */
1168              delete_partition (partition);
1169              break;
1170            }
1171          chosen_c = best_c;
1172          chosen_possible_collisions = best_possible_collisions;
1173        }
1174
1175        /* We need one more step.  */
1176        Step *step = new Step();
1177
1178        step->_undetermined = new bool[_alpha_size];
1179        memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool));
1180
1181        step->_partition = partition;
1182
1183        /* Now determine how the equivalence classes will be before this
1184           step.  */
1185        undetermined[chosen_c] = true;
1186        partition = compute_partition (undetermined);
1187
1188        /* Now determine which other characters should be determined in this
1189           step, because they will not change the equivalence classes at
1190           this point.  It is the set of all c which, for all equivalence
1191           classes, have the same frequency of occurrence in every keyword
1192           of the equivalence class.  */
1193        for (unsigned int c = 0; c < _alpha_size; c++)
1194          if (_occurrences[c] > 0 && determined[c]
1195              && unchanged_partition (partition, c))
1196            {
1197              undetermined[c] = true;
1198              determined[c] = false;
1199            }
1200
1201        /* main_c must be one of these.  */
1202        if (determined[chosen_c])
1203          abort ();
1204
1205        /* Now the set of changing characters of this step.  */
1206        unsigned int changing_count;
1207
1208        changing_count = 0;
1209        for (unsigned int c = 0; c < _alpha_size; c++)
1210          if (undetermined[c] && !step->_undetermined[c])
1211            changing_count++;
1212
1213        unsigned int *changing = new unsigned int[changing_count];
1214        changing_count = 0;
1215        for (unsigned int c = 0; c < _alpha_size; c++)
1216          if (undetermined[c] && !step->_undetermined[c])
1217            changing[changing_count++] = c;
1218
1219        step->_changing = changing;
1220        step->_changing_count = changing_count;
1221
1222        step->_asso_value_max = _asso_value_max;
1223
1224        step->_expected_lower =
1225          exp (static_cast<double>(chosen_possible_collisions)
1226               / static_cast<double>(_max_hash_value));
1227        step->_expected_upper =
1228          exp (static_cast<double>(chosen_possible_collisions)
1229               / static_cast<double>(_asso_value_max));
1230
1231        delete_partition (partition);
1232
1233        step->_next = steps;
1234        steps = step;
1235      }
1236
1237    delete[] determined;
1238    delete[] undetermined;
1239  }
1240
1241  if (option[DEBUG])
1242    {
1243      unsigned int stepno = 0;
1244      for (Step *step = steps; step; step = step->_next)
1245        {
1246          stepno++;
1247          fprintf (stderr, "Step %u chooses _asso_values[", stepno);
1248          for (unsigned int i = 0; i < step->_changing_count; i++)
1249            {
1250              if (i > 0)
1251                fprintf (stderr, ",");
1252              fprintf (stderr, "'%c'", step->_changing[i]);
1253            }
1254          fprintf (stderr, "], expected number of iterations between %g and %g.\n",
1255                   step->_expected_lower, step->_expected_upper);
1256          fprintf (stderr, "Keyword equivalence classes:\n");
1257          for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1258            {
1259              fprintf (stderr, "\n");
1260              for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1261                {
1262                  KeywordExt *keyword = temp->first();
1263                  fprintf (stderr, "  %.*s\n",
1264                           keyword->_allchars_length, keyword->_allchars);
1265                }
1266            }
1267          fprintf (stderr, "\n");
1268        }
1269    }
1270
1271  /* Initialize _asso_values[].  (The value given here matters only
1272     for those c which occur in all keywords with equal multiplicity.)  */
1273  for (unsigned int c = 0; c < _alpha_size; c++)
1274    _asso_values[c] = 0;
1275
1276  unsigned int stepno = 0;
1277  for (Step *step = steps; step; step = step->_next)
1278    {
1279      stepno++;
1280
1281      /* Initialize the asso_values[].  */
1282      unsigned int k = step->_changing_count;
1283      for (unsigned int i = 0; i < k; i++)
1284        {
1285          unsigned int c = step->_changing[i];
1286          _asso_values[c] =
1287            (_initial_asso_value < 0 ? rand () : _initial_asso_value)
1288            & (step->_asso_value_max - 1);
1289        }
1290
1291      unsigned int iterations = 0;
1292      DYNAMIC_ARRAY (iter, unsigned int, k);
1293      for (unsigned int i = 0; i < k; i++)
1294        iter[i] = 0;
1295      unsigned int ii = (_jump != 0 ? k - 1 : 0);
1296
1297      for (;;)
1298        {
1299          /* Test whether these asso_values[] lead to collisions among
1300             the equivalence classes that should be collision-free.  */
1301          bool has_collision = false;
1302          for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1303            {
1304              /* Iteration Number array is a win, O(1) initialization time!  */
1305              _collision_detector->clear ();
1306
1307              for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest())
1308                {
1309                  KeywordExt *keyword = ptr->first();
1310
1311                  /* Compute the new hash code for the keyword, leaving apart
1312                     the yet undetermined asso_values[].  */
1313                  int hashcode;
1314                  {
1315                    int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
1316                    const unsigned int *p = keyword->_selchars;
1317                    int i = keyword->_selchars_length;
1318                    for (; i > 0; p++, i--)
1319                      if (!step->_undetermined[*p])
1320                        sum += _asso_values[*p];
1321                    hashcode = sum;
1322                  }
1323
1324                  /* See whether it collides with another keyword's hash code,
1325                     from the same equivalence class.  */
1326                  if (_collision_detector->set_bit (hashcode))
1327                    {
1328                      has_collision = true;
1329                      break;
1330                    }
1331                }
1332
1333              /* Don't need to continue looking at the other equivalence
1334                 classes if we already have found a collision.  */
1335              if (has_collision)
1336                break;
1337            }
1338
1339          iterations++;
1340          if (!has_collision)
1341            break;
1342
1343          /* Try other asso_values[].  */
1344          if (_jump != 0)
1345            {
1346              /* The way we try various values for
1347                   asso_values[step->_changing[0],...step->_changing[k-1]]
1348                 is like this:
1349                 for (bound = 0,1,...)
1350                   for (ii = 0,...,k-1)
1351                     iter[ii] := bound
1352                     iter[0..ii-1] := values <= bound
1353                     iter[ii+1..k-1] := values < bound
1354                 and
1355                   asso_values[step->_changing[i]] =
1356                     _initial_asso_value + iter[i] * _jump.
1357                 This makes it more likely to find small asso_values[].
1358               */
1359              unsigned int bound = iter[ii];
1360              unsigned int i = 0;
1361              while (i < ii)
1362                {
1363                  unsigned int c = step->_changing[i];
1364                  iter[i]++;
1365                  _asso_values[c] =
1366                    (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1367                  if (iter[i] <= bound)
1368                    goto found_next;
1369                  _asso_values[c] =
1370                    (_asso_values[c] - iter[i] * _jump)
1371                    & (step->_asso_value_max - 1);
1372                  iter[i] = 0;
1373                  i++;
1374                }
1375              i = ii + 1;
1376              while (i < k)
1377                {
1378                  unsigned int c = step->_changing[i];
1379                  iter[i]++;
1380                  _asso_values[c] =
1381                    (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1382                  if (iter[i] < bound)
1383                    goto found_next;
1384                  _asso_values[c] =
1385                    (_asso_values[c] - iter[i] * _jump)
1386                    & (step->_asso_value_max - 1);
1387                  iter[i] = 0;
1388                  i++;
1389                }
1390              /* Switch from one ii to the next.  */
1391              {
1392                unsigned int c = step->_changing[ii];
1393                _asso_values[c] =
1394                  (_asso_values[c] - bound * _jump)
1395                  & (step->_asso_value_max - 1);
1396                iter[ii] = 0;
1397              }
1398              /* Here all iter[i] == 0.  */
1399              ii++;
1400              if (ii == k)
1401                {
1402                  ii = 0;
1403                  bound++;
1404                  if (bound == step->_asso_value_max)
1405                    {
1406                      /* Out of search space!  We can either backtrack, or
1407                         increase the available search space of this step.
1408                         It seems simpler to choose the latter solution.  */
1409                      step->_asso_value_max = 2 * step->_asso_value_max;
1410                      if (step->_asso_value_max > _asso_value_max)
1411                        {
1412                          _asso_value_max = step->_asso_value_max;
1413                          /* Reinitialize _max_hash_value.  */
1414                          _max_hash_value =
1415                            (option[NOLENGTH] ? 0 : _max_key_len)
1416                            + (_asso_value_max - 1) * _max_selchars_length;
1417                          /* Reinitialize _collision_detector.  */
1418                          delete _collision_detector;
1419                          _collision_detector =
1420                            new Bool_Array (_max_hash_value + 1);
1421                        }
1422                    }
1423                }
1424              {
1425                unsigned int c = step->_changing[ii];
1426                iter[ii] = bound;
1427                _asso_values[c] =
1428                  (_asso_values[c] + bound * _jump)
1429                  & (step->_asso_value_max - 1);
1430              }
1431             found_next: ;
1432            }
1433          else
1434            {
1435              /* Random.  */
1436              unsigned int c = step->_changing[ii];
1437              _asso_values[c] =
1438                (_asso_values[c] + rand ()) & (step->_asso_value_max - 1);
1439              /* Next time, change the next c.  */
1440              ii++;
1441              if (ii == k)
1442                ii = 0;
1443            }
1444        }
1445      FREE_DYNAMIC_ARRAY (iter);
1446
1447      if (option[DEBUG])
1448        {
1449          fprintf (stderr, "Step %u chose _asso_values[", stepno);
1450          for (unsigned int i = 0; i < step->_changing_count; i++)
1451            {
1452              if (i > 0)
1453                fprintf (stderr, ",");
1454              fprintf (stderr, "'%c'", step->_changing[i]);
1455            }
1456          fprintf (stderr, "] in %u iterations.\n", iterations);
1457        }
1458    }
1459
1460  /* Free allocated memory.  */
1461  while (steps != NULL)
1462    {
1463      Step *step = steps;
1464      steps = step->_next;
1465      delete[] step->_changing;
1466      delete[] step->_undetermined;
1467      delete_partition (step->_partition);
1468      delete step;
1469    }
1470}
1471
1472/* Computes a keyword's hash value, relative to the current _asso_values[],
1473   and stores it in keyword->_hash_value.  */
1474
1475inline int
1476Search::compute_hash (KeywordExt *keyword) const
1477{
1478  int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
1479
1480  const unsigned int *p = keyword->_selchars;
1481  int i = keyword->_selchars_length;
1482  for (; i > 0; p++, i--)
1483    sum += _asso_values[*p];
1484
1485  return keyword->_hash_value = sum;
1486}
1487
1488/* Finds good _asso_values[].  */
1489
1490void
1491Search::find_good_asso_values ()
1492{
1493  prepare_asso_values ();
1494
1495  /* Search for good _asso_values[].  */
1496  int asso_iteration;
1497  if ((asso_iteration = option.get_asso_iterations ()) == 0)
1498    /* Try only the given _initial_asso_value and _jump.  */
1499    find_asso_values ();
1500  else
1501    {
1502      /* Try different pairs of _initial_asso_value and _jump, in the
1503         following order:
1504           (0, 1)
1505           (1, 1)
1506           (2, 1) (0, 3)
1507           (3, 1) (1, 3)
1508           (4, 1) (2, 3) (0, 5)
1509           (5, 1) (3, 3) (1, 5)
1510           ..... */
1511      KeywordExt_List *saved_head = _head;
1512      int best_initial_asso_value = 0;
1513      int best_jump = 1;
1514      int *best_asso_values = new int[_alpha_size];
1515      int best_collisions = INT_MAX;
1516      int best_max_hash_value = INT_MAX;
1517
1518      _initial_asso_value = 0; _jump = 1;
1519      for (;;)
1520        {
1521          /* Restore the keyword list in its original order.  */
1522          _head = copy_list (saved_head);
1523          /* Find good _asso_values[].  */
1524          find_asso_values ();
1525          /* Test whether it is the best solution so far.  */
1526          int collisions = 0;
1527          int max_hash_value = INT_MIN;
1528          _collision_detector->clear ();
1529          for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1530            {
1531              KeywordExt *keyword = ptr->first();
1532              int hashcode = compute_hash (keyword);
1533              if (max_hash_value < hashcode)
1534                max_hash_value = hashcode;
1535              if (_collision_detector->set_bit (hashcode))
1536                collisions++;
1537            }
1538          if (collisions < best_collisions
1539              || (collisions == best_collisions
1540                  && max_hash_value < best_max_hash_value))
1541            {
1542              memcpy (best_asso_values, _asso_values,
1543                      _alpha_size * sizeof (_asso_values[0]));
1544              best_collisions = collisions;
1545              best_max_hash_value = max_hash_value;
1546            }
1547          /* Delete the copied keyword list.  */
1548          delete_list (_head);
1549
1550          if (--asso_iteration == 0)
1551            break;
1552          /* Prepare for next iteration.  */
1553          if (_initial_asso_value >= 2)
1554            _initial_asso_value -= 2, _jump += 2;
1555          else
1556            _initial_asso_value += _jump, _jump = 1;
1557        }
1558      _head = saved_head;
1559      /* Install the best found asso_values.  */
1560      _initial_asso_value = best_initial_asso_value;
1561      _jump = best_jump;
1562      memcpy (_asso_values, best_asso_values,
1563              _alpha_size * sizeof (_asso_values[0]));
1564      delete[] best_asso_values;
1565      /* The keywords' _hash_value fields are recomputed below.  */
1566    }
1567}
1568
1569/* ========================================================================= */
1570
1571/* Comparison function for sorting by increasing _hash_value.  */
1572static bool
1573less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2)
1574{
1575  return keyword1->_hash_value < keyword2->_hash_value;
1576}
1577
1578/* Sorts the keyword list by hash value.  */
1579
1580void
1581Search::sort ()
1582{
1583  _head = mergesort_list (_head, less_by_hash_value);
1584}
1585
1586void
1587Search::optimize ()
1588{
1589  /* Preparations.  */
1590  prepare ();
1591
1592  /* Step 1: Finding good byte positions.  */
1593  find_positions ();
1594
1595  /* Step 2: Finding good alpha increments.  */
1596  find_alpha_inc ();
1597
1598  /* Step 3: Finding good asso_values.  */
1599  find_good_asso_values ();
1600
1601  /* Make one final check, just to make sure nothing weird happened.... */
1602  _collision_detector->clear ();
1603  for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
1604    {
1605      KeywordExt *curr = curr_ptr->first();
1606      unsigned int hashcode = compute_hash (curr);
1607      if (_collision_detector->set_bit (hashcode))
1608        {
1609          /* This shouldn't happen.  proj1, proj2, proj3 must have been
1610             computed to be injective on the given keyword set.  */
1611          fprintf (stderr,
1612                   "\nInternal error, unexpected duplicate hash code\n");
1613          if (option[POSITIONS])
1614            fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
1615          else
1616            fprintf (stderr, "try options -m or -r.\n\n");
1617          exit (1);
1618        }
1619    }
1620
1621  /* Sorts the keyword list by hash value.  */
1622  sort ();
1623
1624  /* Set unused asso_values[c] to max_hash_value + 1.  This is not absolutely
1625     necessary, but speeds up the lookup function in many cases of lookup
1626     failure: no string comparison is needed once the hash value of a string
1627     is larger than the hash value of any keyword.  */
1628  int max_hash_value;
1629  {
1630    KeywordExt_List *temp;
1631    for (temp = _head; temp->rest(); temp = temp->rest())
1632      ;
1633    max_hash_value = temp->first()->_hash_value;
1634  }
1635  for (unsigned int c = 0; c < _alpha_size; c++)
1636    if (_occurrences[c] == 0)
1637      _asso_values[c] = max_hash_value + 1;
1638
1639  /* Propagate unified asso_values.  */
1640  if (_alpha_unify)
1641    for (unsigned int c = 0; c < _alpha_size; c++)
1642      if (_alpha_unify[c] != c)
1643        _asso_values[c] = _asso_values[_alpha_unify[c]];
1644}
1645
1646/* Prints out some diagnostics upon completion.  */
1647
1648Search::~Search ()
1649{
1650  delete _collision_detector;
1651  if (option[DEBUG])
1652    {
1653      fprintf (stderr, "\ndumping occurrence and associated values tables\n");
1654
1655      for (unsigned int i = 0; i < _alpha_size; i++)
1656        if (_occurrences[i])
1657          fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n",
1658                   i, _asso_values[i], i, _occurrences[i]);
1659
1660      fprintf (stderr, "end table dumping\n");
1661
1662      fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d"
1663               "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n",
1664               _list_len, _total_keys, _total_duplicates, _max_key_len);
1665
1666      int field_width = _max_selchars_length;
1667      fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n",
1668               field_width, "selchars");
1669      for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1670        {
1671          fprintf (stderr, "%11d,%11d,%6d, ",
1672                   ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index);
1673          if (field_width > ptr->first()->_selchars_length)
1674            fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, "");
1675          for (int j = 0; j < ptr->first()->_selchars_length; j++)
1676            putc (ptr->first()->_selchars[j], stderr);
1677          fprintf (stderr, ", %.*s\n",
1678                   ptr->first()->_allchars_length, ptr->first()->_allchars);
1679        }
1680
1681      fprintf (stderr, "End dumping list.\n\n");
1682    }
1683  delete[] _asso_values;
1684  delete[] _occurrences;
1685  delete[] _alpha_unify;
1686  delete[] _alpha_inc;
1687}
1688