1228060Sbapt/* Search algorithm. 2228060Sbapt Copyright (C) 1989-1998, 2000, 2002 Free Software Foundation, Inc. 3228060Sbapt Written by Douglas C. Schmidt <schmidt@ics.uci.edu> 4228060Sbapt and Bruno Haible <bruno@clisp.org>. 5228060Sbapt 6228060Sbapt This file is part of GNU GPERF. 7228060Sbapt 8228060Sbapt GNU GPERF is free software; you can redistribute it and/or modify 9228060Sbapt it under the terms of the GNU General Public License as published by 10228060Sbapt the Free Software Foundation; either version 2, or (at your option) 11228060Sbapt any later version. 12228060Sbapt 13228060Sbapt GNU GPERF is distributed in the hope that it will be useful, 14228060Sbapt but WITHOUT ANY WARRANTY; without even the implied warranty of 15228060Sbapt MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16228060Sbapt GNU General Public License for more details. 17228060Sbapt 18228060Sbapt You should have received a copy of the GNU General Public License 19228060Sbapt along with this program; see the file COPYING. 20228060Sbapt If not, write to the Free Software Foundation, Inc., 21228060Sbapt 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 22228060Sbapt 23228060Sbapt/* Specification. */ 24228060Sbapt#include "search.h" 25228060Sbapt 26228060Sbapt#include <stdio.h> 27228060Sbapt#include <stdlib.h> /* declares exit(), rand(), srand() */ 28228060Sbapt#include <string.h> /* declares memset(), memcmp() */ 29228060Sbapt#include <time.h> /* declares time() */ 30228060Sbapt#include <math.h> /* declares exp() */ 31228060Sbapt#include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */ 32228060Sbapt#include "options.h" 33228060Sbapt#include "hash-table.h" 34228060Sbapt#include "config.h" 35228060Sbapt 36228060Sbapt/* ============================== Portability ============================== */ 37228060Sbapt 38228060Sbapt/* Assume ISO C++ 'for' scoping rule. */ 39228060Sbapt#define for if (0) ; else for 40228060Sbapt 41228060Sbapt/* Dynamically allocated array with dynamic extent: 42228060Sbapt 43228060Sbapt Example: 44228060Sbapt DYNAMIC_ARRAY (my_array, int, n); 45228060Sbapt ... 46228060Sbapt FREE_DYNAMIC_ARRAY (my_array); 47228060Sbapt 48228060Sbapt Attention: depending on your implementation my_array is either the array 49228060Sbapt itself or a pointer to the array! Always use my_array only as expression! 50228060Sbapt */ 51228060Sbapt#if HAVE_DYNAMIC_ARRAY 52228060Sbapt #define DYNAMIC_ARRAY(var,eltype,size) eltype var[size] 53228060Sbapt #define FREE_DYNAMIC_ARRAY(var) 54228060Sbapt#else 55228060Sbapt #define DYNAMIC_ARRAY(var,eltype,size) eltype *var = new eltype[size] 56228060Sbapt #define FREE_DYNAMIC_ARRAY(var) delete[] var 57228060Sbapt#endif 58228060Sbapt 59228060Sbapt/* ================================ Theory ================================= */ 60228060Sbapt 61228060Sbapt/* The general form of the hash function is 62228060Sbapt 63228060Sbapt hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos) 64228060Sbapt + len (keyword) 65228060Sbapt 66228060Sbapt where Pos is a set of byte positions, 67228060Sbapt each alpha_inc[i] is a nonnegative integer, 68228060Sbapt each asso_values[c] is a nonnegative integer, 69228060Sbapt len (keyword) is the keyword's length if !option[NOLENGTH], or 0 otherwise. 70228060Sbapt 71228060Sbapt Theorem 1: If all keywords are different, there is a set Pos such that 72228060Sbapt all tuples (keyword[i] : i in Pos) are different. 73228060Sbapt 74228060Sbapt Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there 75228060Sbapt are nonnegative integers alpha_inc[i] such that all multisets 76228060Sbapt {keyword[i] + alpha_inc[i] : i in Pos} are different. 77228060Sbapt 78228060Sbapt Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}. 79228060Sbapt 80228060Sbapt Theorem 3: If all multisets selchars[keyword] are different, there are 81228060Sbapt nonnegative integers asso_values[c] such that all hash values 82228060Sbapt sum (asso_values[c] : c in selchars[keyword]) are different. 83228060Sbapt 84228060Sbapt Based on these three facts, we find the hash function in three steps: 85228060Sbapt 86228060Sbapt Step 1 (Finding good byte positions): 87228060Sbapt Find a set Pos, as small as possible, such that all tuples 88228060Sbapt (keyword[i] : i in Pos) are different. 89228060Sbapt 90228060Sbapt Step 2 (Finding good alpha increments): 91228060Sbapt Find nonnegative integers alpha_inc[i], as many of them as possible being 92228060Sbapt zero, and the others being as small as possible, such that all multisets 93228060Sbapt {keyword[i] + alpha_inc[i] : i in Pos} are different. 94228060Sbapt 95228060Sbapt Step 3 (Finding good asso_values): 96228060Sbapt Find asso_values[c] such that all hash (keyword) are different. 97228060Sbapt 98228060Sbapt In other words, each step finds a projection that is injective on the 99228060Sbapt given finite set: 100228060Sbapt proj1 : String --> Map (Pos --> N) 101228060Sbapt proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos) 102228060Sbapt proj3 : Map (Pos --> N) / S(Pos) --> N 103228060Sbapt where 104228060Sbapt N denotes the set of nonnegative integers, 105228060Sbapt Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and 106228060Sbapt S(Pos) is the symmetric group over Pos. 107228060Sbapt 108228060Sbapt This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight 109228060Sbapt modifications apply: 110228060Sbapt proj1 : String --> Map (Pos --> N) x N 111228060Sbapt proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N 112228060Sbapt proj3 : Map (Pos --> N) / S(Pos) x N --> N 113228060Sbapt 114228060Sbapt For a case-insensitive hash function, the general form is 115228060Sbapt 116228060Sbapt hash (keyword) = 117228060Sbapt sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos) 118228060Sbapt + len (keyword) 119228060Sbapt 120228060Sbapt where alpha_unify[c] is chosen so that an upper/lower case change in 121228060Sbapt keyword[i] doesn't change alpha_unify[keyword[i] + alpha_inc[i]]. 122228060Sbapt */ 123228060Sbapt 124228060Sbapt/* ==================== Initialization and Preparation ===================== */ 125228060Sbapt 126228060SbaptSearch::Search (KeywordExt_List *list) 127228060Sbapt : _head (list) 128228060Sbapt{ 129228060Sbapt} 130228060Sbapt 131228060Sbaptvoid 132228060SbaptSearch::prepare () 133228060Sbapt{ 134228060Sbapt /* Compute the total number of keywords. */ 135228060Sbapt _total_keys = 0; 136228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 137228060Sbapt _total_keys++; 138228060Sbapt 139228060Sbapt /* Compute the minimum and maximum keyword length. */ 140228060Sbapt _max_key_len = INT_MIN; 141228060Sbapt _min_key_len = INT_MAX; 142228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 143228060Sbapt { 144228060Sbapt KeywordExt *keyword = temp->first(); 145228060Sbapt 146228060Sbapt if (_max_key_len < keyword->_allchars_length) 147228060Sbapt _max_key_len = keyword->_allchars_length; 148228060Sbapt if (_min_key_len > keyword->_allchars_length) 149228060Sbapt _min_key_len = keyword->_allchars_length; 150228060Sbapt } 151228060Sbapt 152228060Sbapt /* Exit program if an empty string is used as keyword, since the comparison 153228060Sbapt expressions don't work correctly for looking up an empty string. */ 154228060Sbapt if (_min_key_len == 0) 155228060Sbapt { 156228060Sbapt fprintf (stderr, "Empty input keyword is not allowed.\n" 157228060Sbapt "To recognize an empty input keyword, your code should check for\n" 158228060Sbapt "len == 0 before calling the gperf generated lookup function.\n"); 159228060Sbapt exit (1); 160228060Sbapt } 161228060Sbapt 162228060Sbapt /* Exit program if the characters in the keywords are not in the required 163228060Sbapt range. */ 164228060Sbapt if (option[SEVENBIT]) 165228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 166228060Sbapt { 167228060Sbapt KeywordExt *keyword = temp->first(); 168228060Sbapt 169228060Sbapt const char *k = keyword->_allchars; 170228060Sbapt for (int i = keyword->_allchars_length; i > 0; k++, i--) 171228060Sbapt if (!(static_cast<unsigned char>(*k) < 128)) 172228060Sbapt { 173228060Sbapt fprintf (stderr, "Option --seven-bit has been specified,\n" 174228060Sbapt "but keyword \"%.*s\" contains non-ASCII characters.\n" 175228060Sbapt "Try removing option --seven-bit.\n", 176228060Sbapt keyword->_allchars_length, keyword->_allchars); 177228060Sbapt exit (1); 178228060Sbapt } 179228060Sbapt } 180228060Sbapt} 181228060Sbapt 182228060Sbapt/* ====================== Finding good byte positions ====================== */ 183228060Sbapt 184228060Sbapt/* Computes the upper bound on the indices passed to asso_values[], 185228060Sbapt assuming no alpha_increments. */ 186228060Sbaptunsigned int 187228060SbaptSearch::compute_alpha_size () const 188228060Sbapt{ 189228060Sbapt return (option[SEVENBIT] ? 128 : 256); 190228060Sbapt} 191228060Sbapt 192228060Sbapt/* Computes the unification rules between different asso_values[c], 193228060Sbapt assuming no alpha_increments. */ 194228060Sbaptunsigned int * 195228060SbaptSearch::compute_alpha_unify () const 196228060Sbapt{ 197228060Sbapt if (option[UPPERLOWER]) 198228060Sbapt { 199228060Sbapt /* Uppercase to lowercase mapping. */ 200228060Sbapt unsigned int alpha_size = compute_alpha_size(); 201228060Sbapt unsigned int *alpha_unify = new unsigned int[alpha_size]; 202228060Sbapt for (unsigned int c = 0; c < alpha_size; c++) 203228060Sbapt alpha_unify[c] = c; 204228060Sbapt for (unsigned int c = 'A'; c <= 'Z'; c++) 205228060Sbapt alpha_unify[c] = c + ('a'-'A'); 206228060Sbapt return alpha_unify; 207228060Sbapt } 208228060Sbapt else 209228060Sbapt /* Identity mapping. */ 210228060Sbapt return NULL; 211228060Sbapt} 212228060Sbapt 213228060Sbapt/* Initializes each keyword's _selchars array. */ 214228060Sbaptvoid 215228060SbaptSearch::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const 216228060Sbapt{ 217228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 218228060Sbapt temp->first()->init_selchars_tuple(positions, alpha_unify); 219228060Sbapt} 220228060Sbapt 221228060Sbapt/* Deletes each keyword's _selchars array. */ 222228060Sbaptvoid 223228060SbaptSearch::delete_selchars () const 224228060Sbapt{ 225228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 226228060Sbapt temp->first()->delete_selchars(); 227228060Sbapt} 228228060Sbapt 229228060Sbapt/* Count the duplicate keywords that occur with a given set of positions. 230228060Sbapt In other words, it returns the difference 231228060Sbapt # K - # proj1 (K) 232228060Sbapt where K is the multiset of given keywords. */ 233228060Sbaptunsigned int 234228060SbaptSearch::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const 235228060Sbapt{ 236228060Sbapt /* Run through the keyword list and count the duplicates incrementally. 237228060Sbapt The result does not depend on the order of the keyword list, thanks to 238228060Sbapt the formula above. */ 239228060Sbapt init_selchars_tuple (positions, alpha_unify); 240228060Sbapt 241228060Sbapt unsigned int count = 0; 242228060Sbapt { 243228060Sbapt Hash_Table representatives (_total_keys, option[NOLENGTH]); 244228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 245228060Sbapt { 246228060Sbapt KeywordExt *keyword = temp->first(); 247228060Sbapt if (representatives.insert (keyword)) 248228060Sbapt count++; 249228060Sbapt } 250228060Sbapt } 251228060Sbapt 252228060Sbapt delete_selchars (); 253228060Sbapt 254228060Sbapt return count; 255228060Sbapt} 256228060Sbapt 257228060Sbapt/* Find good key positions. */ 258228060Sbapt 259228060Sbaptvoid 260228060SbaptSearch::find_positions () 261228060Sbapt{ 262228060Sbapt /* If the user gave the key positions, we use them. */ 263228060Sbapt if (option[POSITIONS]) 264228060Sbapt { 265228060Sbapt _key_positions = option.get_key_positions(); 266228060Sbapt return; 267228060Sbapt } 268228060Sbapt 269228060Sbapt /* Compute preliminary alpha_unify table. */ 270228060Sbapt unsigned int *alpha_unify = compute_alpha_unify (); 271228060Sbapt 272228060Sbapt /* 1. Find positions that must occur in order to distinguish duplicates. */ 273228060Sbapt Positions mandatory; 274228060Sbapt 275228060Sbapt if (!option[DUP]) 276228060Sbapt { 277228060Sbapt for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest()) 278228060Sbapt { 279228060Sbapt KeywordExt *keyword1 = l1->first(); 280228060Sbapt for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest()) 281228060Sbapt { 282228060Sbapt KeywordExt *keyword2 = l2->first(); 283228060Sbapt 284228060Sbapt /* If keyword1 and keyword2 have the same length and differ 285228060Sbapt in just one position, and it is not the last character, 286228060Sbapt this position is mandatory. */ 287228060Sbapt if (keyword1->_allchars_length == keyword2->_allchars_length) 288228060Sbapt { 289228060Sbapt int n = keyword1->_allchars_length; 290228060Sbapt int i; 291228060Sbapt for (i = 0; i < n - 1; i++) 292228060Sbapt { 293228060Sbapt unsigned char c1 = keyword1->_allchars[i]; 294228060Sbapt unsigned char c2 = keyword2->_allchars[i]; 295228060Sbapt if (option[UPPERLOWER]) 296228060Sbapt { 297228060Sbapt if (c1 >= 'A' && c1 <= 'Z') 298228060Sbapt c1 += 'a' - 'A'; 299228060Sbapt if (c2 >= 'A' && c2 <= 'Z') 300228060Sbapt c2 += 'a' - 'A'; 301228060Sbapt } 302228060Sbapt if (c1 != c2) 303228060Sbapt break; 304228060Sbapt } 305228060Sbapt if (i < n - 1) 306228060Sbapt { 307228060Sbapt int j; 308228060Sbapt for (j = i + 1; j < n; j++) 309228060Sbapt { 310228060Sbapt unsigned char c1 = keyword1->_allchars[j]; 311228060Sbapt unsigned char c2 = keyword2->_allchars[j]; 312228060Sbapt if (option[UPPERLOWER]) 313228060Sbapt { 314228060Sbapt if (c1 >= 'A' && c1 <= 'Z') 315228060Sbapt c1 += 'a' - 'A'; 316228060Sbapt if (c2 >= 'A' && c2 <= 'Z') 317228060Sbapt c2 += 'a' - 'A'; 318228060Sbapt } 319228060Sbapt if (c1 != c2) 320228060Sbapt break; 321228060Sbapt } 322228060Sbapt if (j >= n) 323228060Sbapt { 324228060Sbapt /* Position i is mandatory. */ 325228060Sbapt if (!mandatory.contains (i)) 326228060Sbapt mandatory.add (i); 327228060Sbapt } 328228060Sbapt } 329228060Sbapt } 330228060Sbapt } 331228060Sbapt } 332228060Sbapt } 333228060Sbapt 334228060Sbapt /* 2. Add positions, as long as this decreases the duplicates count. */ 335228060Sbapt int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1 336228060Sbapt ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1); 337228060Sbapt Positions current = mandatory; 338228060Sbapt unsigned int current_duplicates_count = 339228060Sbapt count_duplicates_tuple (current, alpha_unify); 340228060Sbapt for (;;) 341228060Sbapt { 342228060Sbapt Positions best; 343228060Sbapt unsigned int best_duplicates_count = UINT_MAX; 344228060Sbapt 345228060Sbapt for (int i = imax; i >= -1; i--) 346228060Sbapt if (!current.contains (i)) 347228060Sbapt { 348228060Sbapt Positions tryal = current; 349228060Sbapt tryal.add (i); 350228060Sbapt unsigned int try_duplicates_count = 351228060Sbapt count_duplicates_tuple (tryal, alpha_unify); 352228060Sbapt 353228060Sbapt /* We prefer 'try' to 'best' if it produces less duplicates, 354228060Sbapt or if it produces the same number of duplicates but with 355228060Sbapt a more efficient hash function. */ 356228060Sbapt if (try_duplicates_count < best_duplicates_count 357228060Sbapt || (try_duplicates_count == best_duplicates_count && i >= 0)) 358228060Sbapt { 359228060Sbapt best = tryal; 360228060Sbapt best_duplicates_count = try_duplicates_count; 361228060Sbapt } 362228060Sbapt } 363228060Sbapt 364228060Sbapt /* Stop adding positions when it gives no improvement. */ 365228060Sbapt if (best_duplicates_count >= current_duplicates_count) 366228060Sbapt break; 367228060Sbapt 368228060Sbapt current = best; 369228060Sbapt current_duplicates_count = best_duplicates_count; 370228060Sbapt } 371228060Sbapt 372228060Sbapt /* 3. Remove positions, as long as this doesn't increase the duplicates 373228060Sbapt count. */ 374228060Sbapt for (;;) 375228060Sbapt { 376228060Sbapt Positions best; 377228060Sbapt unsigned int best_duplicates_count = UINT_MAX; 378228060Sbapt 379228060Sbapt for (int i = imax; i >= -1; i--) 380228060Sbapt if (current.contains (i) && !mandatory.contains (i)) 381228060Sbapt { 382228060Sbapt Positions tryal = current; 383228060Sbapt tryal.remove (i); 384228060Sbapt unsigned int try_duplicates_count = 385228060Sbapt count_duplicates_tuple (tryal, alpha_unify); 386228060Sbapt 387228060Sbapt /* We prefer 'try' to 'best' if it produces less duplicates, 388228060Sbapt or if it produces the same number of duplicates but with 389228060Sbapt a more efficient hash function. */ 390228060Sbapt if (try_duplicates_count < best_duplicates_count 391228060Sbapt || (try_duplicates_count == best_duplicates_count && i == -1)) 392228060Sbapt { 393228060Sbapt best = tryal; 394228060Sbapt best_duplicates_count = try_duplicates_count; 395228060Sbapt } 396228060Sbapt } 397228060Sbapt 398228060Sbapt /* Stop removing positions when it gives no improvement. */ 399228060Sbapt if (best_duplicates_count > current_duplicates_count) 400228060Sbapt break; 401228060Sbapt 402228060Sbapt current = best; 403228060Sbapt current_duplicates_count = best_duplicates_count; 404228060Sbapt } 405228060Sbapt 406228060Sbapt /* 4. Replace two positions by one, as long as this doesn't increase the 407228060Sbapt duplicates count. */ 408228060Sbapt for (;;) 409228060Sbapt { 410228060Sbapt Positions best; 411228060Sbapt unsigned int best_duplicates_count = UINT_MAX; 412228060Sbapt 413228060Sbapt for (int i1 = imax; i1 >= -1; i1--) 414228060Sbapt if (current.contains (i1) && !mandatory.contains (i1)) 415228060Sbapt for (int i2 = imax; i2 >= -1; i2--) 416228060Sbapt if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1) 417228060Sbapt for (int i3 = imax; i3 >= 0; i3--) 418228060Sbapt if (!current.contains (i3)) 419228060Sbapt { 420228060Sbapt Positions tryal = current; 421228060Sbapt tryal.remove (i1); 422228060Sbapt tryal.remove (i2); 423228060Sbapt tryal.add (i3); 424228060Sbapt unsigned int try_duplicates_count = 425228060Sbapt count_duplicates_tuple (tryal, alpha_unify); 426228060Sbapt 427228060Sbapt /* We prefer 'try' to 'best' if it produces less duplicates, 428228060Sbapt or if it produces the same number of duplicates but with 429228060Sbapt a more efficient hash function. */ 430228060Sbapt if (try_duplicates_count < best_duplicates_count 431228060Sbapt || (try_duplicates_count == best_duplicates_count 432228060Sbapt && (i1 == -1 || i2 == -1 || i3 >= 0))) 433228060Sbapt { 434228060Sbapt best = tryal; 435228060Sbapt best_duplicates_count = try_duplicates_count; 436228060Sbapt } 437228060Sbapt } 438228060Sbapt 439228060Sbapt /* Stop removing positions when it gives no improvement. */ 440228060Sbapt if (best_duplicates_count > current_duplicates_count) 441228060Sbapt break; 442228060Sbapt 443228060Sbapt current = best; 444228060Sbapt current_duplicates_count = best_duplicates_count; 445228060Sbapt } 446228060Sbapt 447228060Sbapt /* That's it. Hope it's good enough. */ 448228060Sbapt _key_positions = current; 449228060Sbapt 450228060Sbapt if (option[DEBUG]) 451228060Sbapt { 452228060Sbapt /* Print the result. */ 453228060Sbapt fprintf (stderr, "\nComputed positions: "); 454228060Sbapt PositionReverseIterator iter = _key_positions.reviterator(); 455228060Sbapt bool seen_lastchar = false; 456228060Sbapt bool first = true; 457228060Sbapt for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; ) 458228060Sbapt { 459228060Sbapt if (!first) 460228060Sbapt fprintf (stderr, ", "); 461228060Sbapt if (i == Positions::LASTCHAR) 462228060Sbapt seen_lastchar = true; 463228060Sbapt else 464228060Sbapt { 465228060Sbapt fprintf (stderr, "%d", i + 1); 466228060Sbapt first = false; 467228060Sbapt } 468228060Sbapt } 469228060Sbapt if (seen_lastchar) 470228060Sbapt { 471228060Sbapt if (!first) 472228060Sbapt fprintf (stderr, ", "); 473228060Sbapt fprintf (stderr, "$"); 474228060Sbapt } 475228060Sbapt fprintf (stderr, "\n"); 476228060Sbapt } 477228060Sbapt 478228060Sbapt /* Free preliminary alpha_unify table. */ 479228060Sbapt delete[] alpha_unify; 480228060Sbapt} 481228060Sbapt 482228060Sbapt/* Count the duplicate keywords that occur with the found set of positions. 483228060Sbapt In other words, it returns the difference 484228060Sbapt # K - # proj1 (K) 485228060Sbapt where K is the multiset of given keywords. */ 486228060Sbaptunsigned int 487228060SbaptSearch::count_duplicates_tuple () const 488228060Sbapt{ 489228060Sbapt unsigned int *alpha_unify = compute_alpha_unify (); 490228060Sbapt unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify); 491228060Sbapt delete[] alpha_unify; 492228060Sbapt return count; 493228060Sbapt} 494228060Sbapt 495228060Sbapt/* ===================== Finding good alpha increments ===================== */ 496228060Sbapt 497228060Sbapt/* Computes the upper bound on the indices passed to asso_values[]. */ 498228060Sbaptunsigned int 499228060SbaptSearch::compute_alpha_size (const unsigned int *alpha_inc) const 500228060Sbapt{ 501228060Sbapt unsigned int max_alpha_inc = 0; 502228060Sbapt for (int i = 0; i < _max_key_len; i++) 503228060Sbapt if (max_alpha_inc < alpha_inc[i]) 504228060Sbapt max_alpha_inc = alpha_inc[i]; 505228060Sbapt return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc; 506228060Sbapt} 507228060Sbapt 508228060Sbapt/* Computes the unification rules between different asso_values[c]. */ 509228060Sbaptunsigned int * 510228060SbaptSearch::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const 511228060Sbapt{ 512228060Sbapt if (option[UPPERLOWER]) 513228060Sbapt { 514228060Sbapt /* Without alpha increments, we would simply unify 515228060Sbapt 'A' -> 'a', ..., 'Z' -> 'z'. 516228060Sbapt But when a keyword contains at position i a character c, 517228060Sbapt we have the constraint 518228060Sbapt asso_values[tolower(c) + alpha_inc[i]] == 519228060Sbapt asso_values[toupper(c) + alpha_inc[i]]. 520228060Sbapt This introduces a unification 521228060Sbapt toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i]. 522228060Sbapt Note that this unification can extend outside the range of 523228060Sbapt ASCII letters! But still every unified character pair is at 524228060Sbapt a distance of 'a'-'A' = 32, or (after chained unification) 525228060Sbapt at a multiple of 32. So in the end the alpha_unify vector has 526228060Sbapt the form c -> c + 32 * f(c) where f(c) is a nonnegative 527228060Sbapt integer. */ 528228060Sbapt unsigned int alpha_size = compute_alpha_size (alpha_inc); 529228060Sbapt 530228060Sbapt unsigned int *alpha_unify = new unsigned int[alpha_size]; 531228060Sbapt for (unsigned int c = 0; c < alpha_size; c++) 532228060Sbapt alpha_unify[c] = c; 533228060Sbapt 534228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 535228060Sbapt { 536228060Sbapt KeywordExt *keyword = temp->first(); 537228060Sbapt 538228060Sbapt /* Iterate through the selected character positions. */ 539228060Sbapt PositionIterator iter = positions.iterator(keyword->_allchars_length); 540228060Sbapt 541228060Sbapt for (int i; (i = iter.next ()) != PositionIterator::EOS; ) 542228060Sbapt { 543228060Sbapt unsigned int c; 544228060Sbapt if (i == Positions::LASTCHAR) 545228060Sbapt c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]); 546228060Sbapt else if (i < keyword->_allchars_length) 547228060Sbapt c = static_cast<unsigned char>(keyword->_allchars[i]); 548228060Sbapt else 549228060Sbapt abort (); 550228060Sbapt if (c >= 'A' && c <= 'Z') 551228060Sbapt c += 'a' - 'A'; 552228060Sbapt if (c >= 'a' && c <= 'z') 553228060Sbapt { 554228060Sbapt if (i != Positions::LASTCHAR) 555228060Sbapt c += alpha_inc[i]; 556228060Sbapt /* Unify c with c - ('a'-'A'). */ 557228060Sbapt unsigned int d = alpha_unify[c]; 558228060Sbapt unsigned int b = c - ('a'-'A'); 559228060Sbapt for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A')) 560228060Sbapt alpha_unify[a] = d; 561228060Sbapt } 562228060Sbapt } 563228060Sbapt } 564228060Sbapt return alpha_unify; 565228060Sbapt } 566228060Sbapt else 567228060Sbapt /* Identity mapping. */ 568228060Sbapt return NULL; 569228060Sbapt} 570228060Sbapt 571228060Sbapt/* Initializes each keyword's _selchars array. */ 572228060Sbaptvoid 573228060SbaptSearch::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const 574228060Sbapt{ 575228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 576228060Sbapt temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc); 577228060Sbapt} 578228060Sbapt 579228060Sbapt/* Count the duplicate keywords that occur with the given set of positions 580228060Sbapt and a given alpha_inc[] array. 581228060Sbapt In other words, it returns the difference 582228060Sbapt # K - # proj2 (proj1 (K)) 583228060Sbapt where K is the multiset of given keywords. */ 584228060Sbaptunsigned int 585228060SbaptSearch::count_duplicates_multiset (const unsigned int *alpha_inc) const 586228060Sbapt{ 587228060Sbapt /* Run through the keyword list and count the duplicates incrementally. 588228060Sbapt The result does not depend on the order of the keyword list, thanks to 589228060Sbapt the formula above. */ 590228060Sbapt unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc); 591228060Sbapt init_selchars_multiset (_key_positions, alpha_unify, alpha_inc); 592228060Sbapt 593228060Sbapt unsigned int count = 0; 594228060Sbapt { 595228060Sbapt Hash_Table representatives (_total_keys, option[NOLENGTH]); 596228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 597228060Sbapt { 598228060Sbapt KeywordExt *keyword = temp->first(); 599228060Sbapt if (representatives.insert (keyword)) 600228060Sbapt count++; 601228060Sbapt } 602228060Sbapt } 603228060Sbapt 604228060Sbapt delete_selchars (); 605228060Sbapt delete[] alpha_unify; 606228060Sbapt 607228060Sbapt return count; 608228060Sbapt} 609228060Sbapt 610228060Sbapt/* Find good _alpha_inc[]. */ 611228060Sbapt 612228060Sbaptvoid 613228060SbaptSearch::find_alpha_inc () 614228060Sbapt{ 615228060Sbapt /* The goal is to choose _alpha_inc[] such that it doesn't introduce 616228060Sbapt artificial duplicates. 617228060Sbapt In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */ 618228060Sbapt unsigned int duplicates_goal = count_duplicates_tuple (); 619228060Sbapt 620228060Sbapt /* Start with zero increments. This is sufficient in most cases. */ 621228060Sbapt unsigned int *current = new unsigned int [_max_key_len]; 622228060Sbapt for (int i = 0; i < _max_key_len; i++) 623228060Sbapt current[i] = 0; 624228060Sbapt unsigned int current_duplicates_count = count_duplicates_multiset (current); 625228060Sbapt 626228060Sbapt if (current_duplicates_count > duplicates_goal) 627228060Sbapt { 628228060Sbapt /* Look which _alpha_inc[i] we are free to increment. */ 629228060Sbapt unsigned int nindices; 630228060Sbapt { 631228060Sbapt nindices = 0; 632228060Sbapt PositionIterator iter = _key_positions.iterator(_max_key_len); 633228060Sbapt for (;;) 634228060Sbapt { 635228060Sbapt int key_pos = iter.next (); 636228060Sbapt if (key_pos == PositionIterator::EOS) 637228060Sbapt break; 638228060Sbapt if (key_pos != Positions::LASTCHAR) 639228060Sbapt nindices++; 640228060Sbapt } 641228060Sbapt } 642228060Sbapt 643228060Sbapt DYNAMIC_ARRAY (indices, unsigned int, nindices); 644228060Sbapt { 645228060Sbapt unsigned int j = 0; 646228060Sbapt PositionIterator iter = _key_positions.iterator(_max_key_len); 647228060Sbapt for (;;) 648228060Sbapt { 649228060Sbapt int key_pos = iter.next (); 650228060Sbapt if (key_pos == PositionIterator::EOS) 651228060Sbapt break; 652228060Sbapt if (key_pos != Positions::LASTCHAR) 653228060Sbapt indices[j++] = key_pos; 654228060Sbapt } 655228060Sbapt if (!(j == nindices)) 656228060Sbapt abort (); 657228060Sbapt } 658228060Sbapt 659228060Sbapt /* Perform several rounds of searching for a good alpha increment. 660228060Sbapt Each round reduces the number of artificial collisions by adding 661228060Sbapt an increment in a single key position. */ 662228060Sbapt DYNAMIC_ARRAY (best, unsigned int, _max_key_len); 663228060Sbapt DYNAMIC_ARRAY (tryal, unsigned int, _max_key_len); 664228060Sbapt do 665228060Sbapt { 666228060Sbapt /* An increment of 1 is not always enough. Try higher increments 667228060Sbapt also. */ 668228060Sbapt for (unsigned int inc = 1; ; inc++) 669228060Sbapt { 670228060Sbapt unsigned int best_duplicates_count = UINT_MAX; 671228060Sbapt 672228060Sbapt for (unsigned int j = 0; j < nindices; j++) 673228060Sbapt { 674228060Sbapt memcpy (tryal, current, _max_key_len * sizeof (unsigned int)); 675228060Sbapt tryal[indices[j]] += inc; 676228060Sbapt unsigned int try_duplicates_count = 677228060Sbapt count_duplicates_multiset (tryal); 678228060Sbapt 679228060Sbapt /* We prefer 'try' to 'best' if it produces less 680228060Sbapt duplicates. */ 681228060Sbapt if (try_duplicates_count < best_duplicates_count) 682228060Sbapt { 683228060Sbapt memcpy (best, tryal, _max_key_len * sizeof (unsigned int)); 684228060Sbapt best_duplicates_count = try_duplicates_count; 685228060Sbapt } 686228060Sbapt } 687228060Sbapt 688228060Sbapt /* Stop this round when we got an improvement. */ 689228060Sbapt if (best_duplicates_count < current_duplicates_count) 690228060Sbapt { 691228060Sbapt memcpy (current, best, _max_key_len * sizeof (unsigned int)); 692228060Sbapt current_duplicates_count = best_duplicates_count; 693228060Sbapt break; 694228060Sbapt } 695228060Sbapt } 696228060Sbapt } 697228060Sbapt while (current_duplicates_count > duplicates_goal); 698228060Sbapt FREE_DYNAMIC_ARRAY (tryal); 699228060Sbapt FREE_DYNAMIC_ARRAY (best); 700228060Sbapt 701228060Sbapt if (option[DEBUG]) 702228060Sbapt { 703228060Sbapt /* Print the result. */ 704228060Sbapt fprintf (stderr, "\nComputed alpha increments: "); 705228060Sbapt bool first = true; 706228060Sbapt for (unsigned int j = nindices; j-- > 0; ) 707228060Sbapt if (current[indices[j]] != 0) 708228060Sbapt { 709228060Sbapt if (!first) 710228060Sbapt fprintf (stderr, ", "); 711228060Sbapt fprintf (stderr, "%u:+%u", 712228060Sbapt indices[j] + 1, current[indices[j]]); 713228060Sbapt first = false; 714228060Sbapt } 715228060Sbapt fprintf (stderr, "\n"); 716228060Sbapt } 717228060Sbapt FREE_DYNAMIC_ARRAY (indices); 718228060Sbapt } 719228060Sbapt 720228060Sbapt _alpha_inc = current; 721228060Sbapt _alpha_size = compute_alpha_size (_alpha_inc); 722228060Sbapt _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc); 723228060Sbapt} 724228060Sbapt 725228060Sbapt/* ======================= Finding good asso_values ======================== */ 726228060Sbapt 727228060Sbapt/* Initializes the asso_values[] related parameters. */ 728228060Sbapt 729228060Sbaptvoid 730228060SbaptSearch::prepare_asso_values () 731228060Sbapt{ 732228060Sbapt KeywordExt_List *temp; 733228060Sbapt 734228060Sbapt /* Initialize each keyword's _selchars array. */ 735228060Sbapt init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc); 736228060Sbapt 737228060Sbapt /* Compute the maximum _selchars_length over all keywords. */ 738228060Sbapt _max_selchars_length = _key_positions.iterator(_max_key_len).remaining(); 739228060Sbapt 740228060Sbapt /* Check for duplicates, i.e. keywords with the same _selchars array 741228060Sbapt (and - if !option[NOLENGTH] - also the same length). 742228060Sbapt We deal with these by building an equivalence class, so that only 743228060Sbapt 1 keyword is representative of the entire collection. Only this 744228060Sbapt representative remains in the keyword list; the others are accessible 745228060Sbapt through the _duplicate_link chain, starting at the representative. 746228060Sbapt This *greatly* simplifies processing during later stages of the program. 747228060Sbapt Set _total_duplicates and _list_len = _total_keys - _total_duplicates. */ 748228060Sbapt { 749228060Sbapt _list_len = _total_keys; 750228060Sbapt _total_duplicates = 0; 751228060Sbapt /* Make hash table for efficiency. */ 752228060Sbapt Hash_Table representatives (_list_len, option[NOLENGTH]); 753228060Sbapt 754228060Sbapt KeywordExt_List *prev = NULL; /* list node before temp */ 755228060Sbapt for (temp = _head; temp; ) 756228060Sbapt { 757228060Sbapt KeywordExt *keyword = temp->first(); 758228060Sbapt KeywordExt *other_keyword = representatives.insert (keyword); 759228060Sbapt KeywordExt_List *garbage = NULL; 760228060Sbapt 761228060Sbapt if (other_keyword) 762228060Sbapt { 763228060Sbapt _total_duplicates++; 764228060Sbapt _list_len--; 765228060Sbapt /* Remove keyword from the main list. */ 766228060Sbapt prev->rest() = temp->rest(); 767228060Sbapt garbage = temp; 768228060Sbapt /* And insert it on other_keyword's duplicate list. */ 769228060Sbapt keyword->_duplicate_link = other_keyword->_duplicate_link; 770228060Sbapt other_keyword->_duplicate_link = keyword; 771228060Sbapt 772228060Sbapt /* Complain if user hasn't enabled the duplicate option. */ 773228060Sbapt if (!option[DUP] || option[DEBUG]) 774228060Sbapt { 775228060Sbapt fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"", 776228060Sbapt keyword->_allchars_length, keyword->_allchars, 777228060Sbapt other_keyword->_allchars_length, other_keyword->_allchars); 778228060Sbapt for (int j = 0; j < keyword->_selchars_length; j++) 779228060Sbapt putc (keyword->_selchars[j], stderr); 780228060Sbapt fprintf (stderr, "\".\n"); 781228060Sbapt } 782228060Sbapt } 783228060Sbapt else 784228060Sbapt { 785228060Sbapt keyword->_duplicate_link = NULL; 786228060Sbapt prev = temp; 787228060Sbapt } 788228060Sbapt temp = temp->rest(); 789228060Sbapt if (garbage) 790228060Sbapt delete garbage; 791228060Sbapt } 792228060Sbapt if (option[DEBUG]) 793228060Sbapt representatives.dump(); 794228060Sbapt } 795228060Sbapt 796228060Sbapt /* Exit program if duplicates exists and option[DUP] not set, since we 797228060Sbapt don't want to continue in this case. (We don't want to turn on 798228060Sbapt option[DUP] implicitly, because the generated code is usually much 799228060Sbapt slower. */ 800228060Sbapt if (_total_duplicates) 801228060Sbapt { 802228060Sbapt if (option[DUP]) 803228060Sbapt fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n", 804228060Sbapt _total_duplicates); 805228060Sbapt else 806228060Sbapt { 807228060Sbapt fprintf (stderr, "%d input keys have identical hash values,\n", 808228060Sbapt _total_duplicates); 809228060Sbapt if (option[POSITIONS]) 810228060Sbapt fprintf (stderr, "try different key positions or use option -D.\n"); 811228060Sbapt else 812228060Sbapt fprintf (stderr, "use option -D.\n"); 813228060Sbapt exit (1); 814228060Sbapt } 815228060Sbapt } 816228060Sbapt 817228060Sbapt /* Compute the occurrences of each character in the alphabet. */ 818228060Sbapt _occurrences = new int[_alpha_size]; 819228060Sbapt memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0])); 820228060Sbapt for (temp = _head; temp; temp = temp->rest()) 821228060Sbapt { 822228060Sbapt KeywordExt *keyword = temp->first(); 823228060Sbapt const unsigned int *ptr = keyword->_selchars; 824228060Sbapt for (int count = keyword->_selchars_length; count > 0; ptr++, count--) 825228060Sbapt _occurrences[*ptr]++; 826228060Sbapt } 827228060Sbapt 828228060Sbapt /* Memory allocation. */ 829228060Sbapt _asso_values = new int[_alpha_size]; 830228060Sbapt 831228060Sbapt int non_linked_length = _list_len; 832228060Sbapt unsigned int asso_value_max; 833228060Sbapt 834228060Sbapt asso_value_max = 835228060Sbapt static_cast<unsigned int>(non_linked_length * option.get_size_multiple()); 836228060Sbapt /* Round up to the next power of two. This makes it easy to ensure 837228060Sbapt an _asso_value[c] is >= 0 and < asso_value_max. Also, the jump value 838228060Sbapt being odd, it guarantees that Search::try_asso_value() will iterate 839228060Sbapt through different values for _asso_value[c]. */ 840228060Sbapt if (asso_value_max == 0) 841228060Sbapt asso_value_max = 1; 842228060Sbapt asso_value_max |= asso_value_max >> 1; 843228060Sbapt asso_value_max |= asso_value_max >> 2; 844228060Sbapt asso_value_max |= asso_value_max >> 4; 845228060Sbapt asso_value_max |= asso_value_max >> 8; 846228060Sbapt asso_value_max |= asso_value_max >> 16; 847228060Sbapt asso_value_max++; 848228060Sbapt _asso_value_max = asso_value_max; 849228060Sbapt 850228060Sbapt /* Given the bound for _asso_values[c], we have a bound for the possible 851228060Sbapt hash values, as computed in compute_hash(). */ 852228060Sbapt _max_hash_value = (option[NOLENGTH] ? 0 : _max_key_len) 853228060Sbapt + (_asso_value_max - 1) * _max_selchars_length; 854228060Sbapt /* Allocate a sparse bit vector for detection of collisions of hash 855228060Sbapt values. */ 856228060Sbapt _collision_detector = new Bool_Array (_max_hash_value + 1); 857228060Sbapt 858228060Sbapt if (option[DEBUG]) 859228060Sbapt { 860228060Sbapt fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d" 861228060Sbapt "\nmaximum size of generated hash table is %d\n", 862228060Sbapt non_linked_length, asso_value_max, _max_hash_value); 863228060Sbapt 864228060Sbapt int field_width; 865228060Sbapt 866228060Sbapt field_width = 0; 867228060Sbapt { 868228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 869228060Sbapt { 870228060Sbapt KeywordExt *keyword = temp->first(); 871228060Sbapt if (field_width < keyword->_selchars_length) 872228060Sbapt field_width = keyword->_selchars_length; 873228060Sbapt } 874228060Sbapt } 875228060Sbapt 876228060Sbapt fprintf (stderr, "\ndumping the keyword list without duplicates\n"); 877228060Sbapt fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig"); 878228060Sbapt int i = 0; 879228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 880228060Sbapt { 881228060Sbapt KeywordExt *keyword = temp->first(); 882228060Sbapt fprintf (stderr, "%9d, ", ++i); 883228060Sbapt if (field_width > keyword->_selchars_length) 884228060Sbapt fprintf (stderr, "%*s", field_width - keyword->_selchars_length, ""); 885228060Sbapt for (int j = 0; j < keyword->_selchars_length; j++) 886228060Sbapt putc (keyword->_selchars[j], stderr); 887228060Sbapt fprintf (stderr, ", %.*s\n", 888228060Sbapt keyword->_allchars_length, keyword->_allchars); 889228060Sbapt } 890228060Sbapt fprintf (stderr, "\nend of keyword list\n\n"); 891228060Sbapt } 892228060Sbapt 893228060Sbapt if (option[RANDOM] || option.get_jump () == 0) 894228060Sbapt /* We will use rand(), so initialize the random number generator. */ 895228060Sbapt srand (static_cast<long>(time (0))); 896228060Sbapt 897228060Sbapt _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ()); 898228060Sbapt _jump = option.get_jump (); 899228060Sbapt} 900228060Sbapt 901228060Sbapt/* Finds some _asso_values[] that fit. */ 902228060Sbapt 903228060Sbapt/* The idea is to choose the _asso_values[] one by one, in a way that 904228060Sbapt a choice that has been made never needs to be undone later. This 905228060Sbapt means that we split the work into several steps. Each step chooses 906228060Sbapt one or more _asso_values[c]. The result of choosing one or more 907228060Sbapt _asso_values[c] is that the partitioning of the keyword set gets 908228060Sbapt broader. 909228060Sbapt Look at this partitioning: After every step, the _asso_values[] of a 910228060Sbapt certain set C of characters are undetermined. (At the beginning, C 911228060Sbapt is the set of characters c with _occurrences[c] > 0. At the end, C 912228060Sbapt is empty.) To each keyword K, we associate the multiset of _selchars 913228060Sbapt for which the _asso_values[] are undetermined: 914228060Sbapt K --> K->_selchars intersect C. 915228060Sbapt Consider two keywords equivalent if their value under this mapping is 916228060Sbapt the same. This introduces an equivalence relation on the set of 917228060Sbapt keywords. The equivalence classes partition the keyword set. (At the 918228060Sbapt beginning, the partition is the finest possible: each K is an equivalence 919228060Sbapt class by itself, because all K have a different _selchars. At the end, 920228060Sbapt all K have been merged into a single equivalence class.) 921228060Sbapt The partition before a step is always a refinement of the partition 922228060Sbapt after the step. 923228060Sbapt We choose the steps in such a way that the partition really becomes 924228060Sbapt broader at each step. (A step that only chooses an _asso_values[c] 925228060Sbapt without changing the partition is better merged with the previous step, 926228060Sbapt to avoid useless backtracking.) */ 927228060Sbapt 928228060Sbaptstruct EquivalenceClass 929228060Sbapt{ 930228060Sbapt /* The keywords in this equivalence class. */ 931228060Sbapt KeywordExt_List * _keywords; 932228060Sbapt KeywordExt_List * _keywords_last; 933228060Sbapt /* The number of keywords in this equivalence class. */ 934228060Sbapt unsigned int _cardinality; 935228060Sbapt /* The undetermined selected characters for the keywords in this 936228060Sbapt equivalence class, as a canonically reordered multiset. */ 937228060Sbapt unsigned int * _undetermined_chars; 938228060Sbapt unsigned int _undetermined_chars_length; 939228060Sbapt 940228060Sbapt EquivalenceClass * _next; 941228060Sbapt}; 942228060Sbapt 943228060Sbaptstruct Step 944228060Sbapt{ 945228060Sbapt /* The characters whose values are being determined in this step. */ 946228060Sbapt unsigned int _changing_count; 947228060Sbapt unsigned int * _changing; 948228060Sbapt /* Exclusive upper bound for the _asso_values[c] of this step. 949228060Sbapt A power of 2. */ 950228060Sbapt unsigned int _asso_value_max; 951228060Sbapt /* The characters whose values will be determined after this step. */ 952228060Sbapt bool * _undetermined; 953228060Sbapt /* The keyword set partition after this step. */ 954228060Sbapt EquivalenceClass * _partition; 955228060Sbapt /* The expected number of iterations in this step. */ 956228060Sbapt double _expected_lower; 957228060Sbapt double _expected_upper; 958228060Sbapt 959228060Sbapt Step * _next; 960228060Sbapt}; 961228060Sbapt 962228060Sbaptstatic inline bool 963228060Sbaptequals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len) 964228060Sbapt{ 965228060Sbapt while (len > 0) 966228060Sbapt { 967228060Sbapt if (*ptr1 != *ptr2) 968228060Sbapt return false; 969228060Sbapt ptr1++; 970228060Sbapt ptr2++; 971228060Sbapt len--; 972228060Sbapt } 973228060Sbapt return true; 974228060Sbapt} 975228060Sbapt 976228060SbaptEquivalenceClass * 977228060SbaptSearch::compute_partition (bool *undetermined) const 978228060Sbapt{ 979228060Sbapt EquivalenceClass *partition = NULL; 980228060Sbapt EquivalenceClass *partition_last = NULL; 981228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 982228060Sbapt { 983228060Sbapt KeywordExt *keyword = temp->first(); 984228060Sbapt 985228060Sbapt /* Compute the undetermined characters for this keyword. */ 986228060Sbapt unsigned int *undetermined_chars = 987228060Sbapt new unsigned int[keyword->_selchars_length]; 988228060Sbapt unsigned int undetermined_chars_length = 0; 989228060Sbapt 990228060Sbapt for (int i = 0; i < keyword->_selchars_length; i++) 991228060Sbapt if (undetermined[keyword->_selchars[i]]) 992228060Sbapt undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i]; 993228060Sbapt 994228060Sbapt /* Look up the equivalence class to which this keyword belongs. */ 995228060Sbapt EquivalenceClass *equclass; 996228060Sbapt for (equclass = partition; equclass; equclass = equclass->_next) 997228060Sbapt if (equclass->_undetermined_chars_length == undetermined_chars_length 998228060Sbapt && equals (equclass->_undetermined_chars, undetermined_chars, 999228060Sbapt undetermined_chars_length)) 1000228060Sbapt break; 1001228060Sbapt if (equclass == NULL) 1002228060Sbapt { 1003228060Sbapt equclass = new EquivalenceClass(); 1004228060Sbapt equclass->_keywords = NULL; 1005228060Sbapt equclass->_keywords_last = NULL; 1006228060Sbapt equclass->_cardinality = 0; 1007228060Sbapt equclass->_undetermined_chars = undetermined_chars; 1008228060Sbapt equclass->_undetermined_chars_length = undetermined_chars_length; 1009228060Sbapt equclass->_next = NULL; 1010228060Sbapt if (partition) 1011228060Sbapt partition_last->_next = equclass; 1012228060Sbapt else 1013228060Sbapt partition = equclass; 1014228060Sbapt partition_last = equclass; 1015228060Sbapt } 1016228060Sbapt else 1017228060Sbapt delete[] undetermined_chars; 1018228060Sbapt 1019228060Sbapt /* Add the keyword to the equivalence class. */ 1020228060Sbapt KeywordExt_List *cons = new KeywordExt_List(keyword); 1021228060Sbapt if (equclass->_keywords) 1022228060Sbapt equclass->_keywords_last->rest() = cons; 1023228060Sbapt else 1024228060Sbapt equclass->_keywords = cons; 1025228060Sbapt equclass->_keywords_last = cons; 1026228060Sbapt equclass->_cardinality++; 1027228060Sbapt } 1028228060Sbapt 1029228060Sbapt /* Free some of the allocated memory. The caller doesn't need it. */ 1030228060Sbapt for (EquivalenceClass *cls = partition; cls; cls = cls->_next) 1031228060Sbapt delete[] cls->_undetermined_chars; 1032228060Sbapt 1033228060Sbapt return partition; 1034228060Sbapt} 1035228060Sbapt 1036228060Sbaptstatic void 1037228060Sbaptdelete_partition (EquivalenceClass *partition) 1038228060Sbapt{ 1039228060Sbapt while (partition != NULL) 1040228060Sbapt { 1041228060Sbapt EquivalenceClass *equclass = partition; 1042228060Sbapt partition = equclass->_next; 1043228060Sbapt delete_list (equclass->_keywords); 1044228060Sbapt //delete[] equclass->_undetermined_chars; // already freed above 1045228060Sbapt delete equclass; 1046228060Sbapt } 1047228060Sbapt} 1048228060Sbapt 1049228060Sbapt/* Compute the possible number of collisions when _asso_values[c] is 1050228060Sbapt chosen, leading to the given partition. */ 1051228060Sbaptunsigned int 1052228060SbaptSearch::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const 1053228060Sbapt{ 1054228060Sbapt /* Every equivalence class p is split according to the frequency of 1055228060Sbapt occurrence of c, leading to equivalence classes p1, p2, ... 1056228060Sbapt This leads to (|p|^2 - |p1|^2 - |p2|^2 - ...)/2 possible collisions. 1057228060Sbapt Return the sum of this expression over all equivalence classes. */ 1058228060Sbapt unsigned int sum = 0; 1059228060Sbapt unsigned int m = _max_selchars_length; 1060228060Sbapt DYNAMIC_ARRAY (split_cardinalities, unsigned int, m + 1); 1061228060Sbapt for (EquivalenceClass *cls = partition; cls; cls = cls->_next) 1062228060Sbapt { 1063228060Sbapt for (unsigned int i = 0; i <= m; i++) 1064228060Sbapt split_cardinalities[i] = 0; 1065228060Sbapt 1066228060Sbapt for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) 1067228060Sbapt { 1068228060Sbapt KeywordExt *keyword = temp->first(); 1069228060Sbapt 1070228060Sbapt unsigned int count = 0; 1071228060Sbapt for (int i = 0; i < keyword->_selchars_length; i++) 1072228060Sbapt if (keyword->_selchars[i] == c) 1073228060Sbapt count++; 1074228060Sbapt 1075228060Sbapt split_cardinalities[count]++; 1076228060Sbapt } 1077228060Sbapt 1078228060Sbapt sum += cls->_cardinality * cls->_cardinality; 1079228060Sbapt for (unsigned int i = 0; i <= m; i++) 1080228060Sbapt sum -= split_cardinalities[i] * split_cardinalities[i]; 1081228060Sbapt } 1082228060Sbapt FREE_DYNAMIC_ARRAY (split_cardinalities); 1083228060Sbapt return sum; 1084228060Sbapt} 1085228060Sbapt 1086228060Sbapt/* Test whether adding c to the undetermined characters changes the given 1087228060Sbapt partition. */ 1088228060Sbaptbool 1089228060SbaptSearch::unchanged_partition (EquivalenceClass *partition, unsigned int c) const 1090228060Sbapt{ 1091228060Sbapt for (EquivalenceClass *cls = partition; cls; cls = cls->_next) 1092228060Sbapt { 1093228060Sbapt unsigned int first_count = UINT_MAX; 1094228060Sbapt 1095228060Sbapt for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) 1096228060Sbapt { 1097228060Sbapt KeywordExt *keyword = temp->first(); 1098228060Sbapt 1099228060Sbapt unsigned int count = 0; 1100228060Sbapt for (int i = 0; i < keyword->_selchars_length; i++) 1101228060Sbapt if (keyword->_selchars[i] == c) 1102228060Sbapt count++; 1103228060Sbapt 1104228060Sbapt if (temp == cls->_keywords) 1105228060Sbapt first_count = count; 1106228060Sbapt else if (count != first_count) 1107228060Sbapt /* c would split this equivalence class. */ 1108228060Sbapt return false; 1109228060Sbapt } 1110228060Sbapt } 1111228060Sbapt return true; 1112228060Sbapt} 1113228060Sbapt 1114228060Sbaptvoid 1115228060SbaptSearch::find_asso_values () 1116228060Sbapt{ 1117228060Sbapt Step *steps; 1118228060Sbapt 1119228060Sbapt /* Determine the steps, starting with the last one. */ 1120228060Sbapt { 1121228060Sbapt bool *undetermined; 1122228060Sbapt bool *determined; 1123228060Sbapt 1124228060Sbapt steps = NULL; 1125228060Sbapt 1126228060Sbapt undetermined = new bool[_alpha_size]; 1127228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1128228060Sbapt undetermined[c] = false; 1129228060Sbapt 1130228060Sbapt determined = new bool[_alpha_size]; 1131228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1132228060Sbapt determined[c] = true; 1133228060Sbapt 1134228060Sbapt for (;;) 1135228060Sbapt { 1136228060Sbapt /* Compute the partition that needs to be refined. */ 1137228060Sbapt EquivalenceClass *partition = compute_partition (undetermined); 1138228060Sbapt 1139228060Sbapt /* Determine the main character to be chosen in this step. 1140228060Sbapt Choosing such a character c has the effect of splitting every 1141228060Sbapt equivalence class (according the the frequency of occurrence of c). 1142228060Sbapt We choose the c with the minimum number of possible collisions, 1143228060Sbapt so that characters which lead to a large number of collisions get 1144228060Sbapt handled early during the search. */ 1145228060Sbapt unsigned int chosen_c; 1146228060Sbapt unsigned int chosen_possible_collisions; 1147228060Sbapt { 1148228060Sbapt unsigned int best_c = 0; 1149228060Sbapt unsigned int best_possible_collisions = UINT_MAX; 1150228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1151228060Sbapt if (_occurrences[c] > 0 && determined[c]) 1152228060Sbapt { 1153228060Sbapt unsigned int possible_collisions = 1154228060Sbapt count_possible_collisions (partition, c); 1155228060Sbapt if (possible_collisions < best_possible_collisions) 1156228060Sbapt { 1157228060Sbapt best_c = c; 1158228060Sbapt best_possible_collisions = possible_collisions; 1159228060Sbapt } 1160228060Sbapt } 1161228060Sbapt if (best_possible_collisions == UINT_MAX) 1162228060Sbapt { 1163228060Sbapt /* All c with _occurrences[c] > 0 are undetermined. We are 1164228060Sbapt are the starting situation and don't need any more step. */ 1165228060Sbapt delete_partition (partition); 1166228060Sbapt break; 1167228060Sbapt } 1168228060Sbapt chosen_c = best_c; 1169228060Sbapt chosen_possible_collisions = best_possible_collisions; 1170228060Sbapt } 1171228060Sbapt 1172228060Sbapt /* We need one more step. */ 1173228060Sbapt Step *step = new Step(); 1174228060Sbapt 1175228060Sbapt step->_undetermined = new bool[_alpha_size]; 1176228060Sbapt memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool)); 1177228060Sbapt 1178228060Sbapt step->_partition = partition; 1179228060Sbapt 1180228060Sbapt /* Now determine how the equivalence classes will be before this 1181228060Sbapt step. */ 1182228060Sbapt undetermined[chosen_c] = true; 1183228060Sbapt partition = compute_partition (undetermined); 1184228060Sbapt 1185228060Sbapt /* Now determine which other characters should be determined in this 1186228060Sbapt step, because they will not change the equivalence classes at 1187228060Sbapt this point. It is the set of all c which, for all equivalence 1188228060Sbapt classes, have the same frequency of occurrence in every keyword 1189228060Sbapt of the equivalence class. */ 1190228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1191228060Sbapt if (_occurrences[c] > 0 && determined[c] 1192228060Sbapt && unchanged_partition (partition, c)) 1193228060Sbapt { 1194228060Sbapt undetermined[c] = true; 1195228060Sbapt determined[c] = false; 1196228060Sbapt } 1197228060Sbapt 1198228060Sbapt /* main_c must be one of these. */ 1199228060Sbapt if (determined[chosen_c]) 1200228060Sbapt abort (); 1201228060Sbapt 1202228060Sbapt /* Now the set of changing characters of this step. */ 1203228060Sbapt unsigned int changing_count; 1204228060Sbapt 1205228060Sbapt changing_count = 0; 1206228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1207228060Sbapt if (undetermined[c] && !step->_undetermined[c]) 1208228060Sbapt changing_count++; 1209228060Sbapt 1210228060Sbapt unsigned int *changing = new unsigned int[changing_count]; 1211228060Sbapt changing_count = 0; 1212228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1213228060Sbapt if (undetermined[c] && !step->_undetermined[c]) 1214228060Sbapt changing[changing_count++] = c; 1215228060Sbapt 1216228060Sbapt step->_changing = changing; 1217228060Sbapt step->_changing_count = changing_count; 1218228060Sbapt 1219228060Sbapt step->_asso_value_max = _asso_value_max; 1220228060Sbapt 1221228060Sbapt step->_expected_lower = 1222228060Sbapt exp (static_cast<double>(chosen_possible_collisions) 1223228060Sbapt / static_cast<double>(_max_hash_value)); 1224228060Sbapt step->_expected_upper = 1225228060Sbapt exp (static_cast<double>(chosen_possible_collisions) 1226228060Sbapt / static_cast<double>(_asso_value_max)); 1227228060Sbapt 1228228060Sbapt delete_partition (partition); 1229228060Sbapt 1230228060Sbapt step->_next = steps; 1231228060Sbapt steps = step; 1232228060Sbapt } 1233228060Sbapt 1234228060Sbapt delete[] determined; 1235228060Sbapt delete[] undetermined; 1236228060Sbapt } 1237228060Sbapt 1238228060Sbapt if (option[DEBUG]) 1239228060Sbapt { 1240228060Sbapt unsigned int stepno = 0; 1241228060Sbapt for (Step *step = steps; step; step = step->_next) 1242228060Sbapt { 1243228060Sbapt stepno++; 1244228060Sbapt fprintf (stderr, "Step %u chooses _asso_values[", stepno); 1245228060Sbapt for (unsigned int i = 0; i < step->_changing_count; i++) 1246228060Sbapt { 1247228060Sbapt if (i > 0) 1248228060Sbapt fprintf (stderr, ","); 1249228060Sbapt fprintf (stderr, "'%c'", step->_changing[i]); 1250228060Sbapt } 1251228060Sbapt fprintf (stderr, "], expected number of iterations between %g and %g.\n", 1252228060Sbapt step->_expected_lower, step->_expected_upper); 1253228060Sbapt fprintf (stderr, "Keyword equivalence classes:\n"); 1254228060Sbapt for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next) 1255228060Sbapt { 1256228060Sbapt fprintf (stderr, "\n"); 1257228060Sbapt for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) 1258228060Sbapt { 1259228060Sbapt KeywordExt *keyword = temp->first(); 1260228060Sbapt fprintf (stderr, " %.*s\n", 1261228060Sbapt keyword->_allchars_length, keyword->_allchars); 1262228060Sbapt } 1263228060Sbapt } 1264228060Sbapt fprintf (stderr, "\n"); 1265228060Sbapt } 1266228060Sbapt } 1267228060Sbapt 1268228060Sbapt /* Initialize _asso_values[]. (The value given here matters only 1269228060Sbapt for those c which occur in all keywords with equal multiplicity.) */ 1270228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1271228060Sbapt _asso_values[c] = 0; 1272228060Sbapt 1273228060Sbapt unsigned int stepno = 0; 1274228060Sbapt for (Step *step = steps; step; step = step->_next) 1275228060Sbapt { 1276228060Sbapt stepno++; 1277228060Sbapt 1278228060Sbapt /* Initialize the asso_values[]. */ 1279228060Sbapt unsigned int k = step->_changing_count; 1280228060Sbapt for (unsigned int i = 0; i < k; i++) 1281228060Sbapt { 1282228060Sbapt unsigned int c = step->_changing[i]; 1283228060Sbapt _asso_values[c] = 1284228060Sbapt (_initial_asso_value < 0 ? rand () : _initial_asso_value) 1285228060Sbapt & (step->_asso_value_max - 1); 1286228060Sbapt } 1287228060Sbapt 1288228060Sbapt unsigned int iterations = 0; 1289228060Sbapt DYNAMIC_ARRAY (iter, unsigned int, k); 1290228060Sbapt for (unsigned int i = 0; i < k; i++) 1291228060Sbapt iter[i] = 0; 1292228060Sbapt unsigned int ii = (_jump != 0 ? k - 1 : 0); 1293228060Sbapt 1294228060Sbapt for (;;) 1295228060Sbapt { 1296228060Sbapt /* Test whether these asso_values[] lead to collisions among 1297228060Sbapt the equivalence classes that should be collision-free. */ 1298228060Sbapt bool has_collision = false; 1299228060Sbapt for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next) 1300228060Sbapt { 1301228060Sbapt /* Iteration Number array is a win, O(1) initialization time! */ 1302228060Sbapt _collision_detector->clear (); 1303228060Sbapt 1304228060Sbapt for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest()) 1305228060Sbapt { 1306228060Sbapt KeywordExt *keyword = ptr->first(); 1307228060Sbapt 1308228060Sbapt /* Compute the new hash code for the keyword, leaving apart 1309228060Sbapt the yet undetermined asso_values[]. */ 1310228060Sbapt int hashcode; 1311228060Sbapt { 1312228060Sbapt int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length; 1313228060Sbapt const unsigned int *p = keyword->_selchars; 1314228060Sbapt int i = keyword->_selchars_length; 1315228060Sbapt for (; i > 0; p++, i--) 1316228060Sbapt if (!step->_undetermined[*p]) 1317228060Sbapt sum += _asso_values[*p]; 1318228060Sbapt hashcode = sum; 1319228060Sbapt } 1320228060Sbapt 1321228060Sbapt /* See whether it collides with another keyword's hash code, 1322228060Sbapt from the same equivalence class. */ 1323228060Sbapt if (_collision_detector->set_bit (hashcode)) 1324228060Sbapt { 1325228060Sbapt has_collision = true; 1326228060Sbapt break; 1327228060Sbapt } 1328228060Sbapt } 1329228060Sbapt 1330228060Sbapt /* Don't need to continue looking at the other equivalence 1331228060Sbapt classes if we already have found a collision. */ 1332228060Sbapt if (has_collision) 1333228060Sbapt break; 1334228060Sbapt } 1335228060Sbapt 1336228060Sbapt iterations++; 1337228060Sbapt if (!has_collision) 1338228060Sbapt break; 1339228060Sbapt 1340228060Sbapt /* Try other asso_values[]. */ 1341228060Sbapt if (_jump != 0) 1342228060Sbapt { 1343228060Sbapt /* The way we try various values for 1344228060Sbapt asso_values[step->_changing[0],...step->_changing[k-1]] 1345228060Sbapt is like this: 1346228060Sbapt for (bound = 0,1,...) 1347228060Sbapt for (ii = 0,...,k-1) 1348228060Sbapt iter[ii] := bound 1349228060Sbapt iter[0..ii-1] := values <= bound 1350228060Sbapt iter[ii+1..k-1] := values < bound 1351228060Sbapt and 1352228060Sbapt asso_values[step->_changing[i]] = 1353228060Sbapt _initial_asso_value + iter[i] * _jump. 1354228060Sbapt This makes it more likely to find small asso_values[]. 1355228060Sbapt */ 1356228060Sbapt unsigned int bound = iter[ii]; 1357228060Sbapt unsigned int i = 0; 1358228060Sbapt while (i < ii) 1359228060Sbapt { 1360228060Sbapt unsigned int c = step->_changing[i]; 1361228060Sbapt iter[i]++; 1362228060Sbapt _asso_values[c] = 1363228060Sbapt (_asso_values[c] + _jump) & (step->_asso_value_max - 1); 1364228060Sbapt if (iter[i] <= bound) 1365228060Sbapt goto found_next; 1366228060Sbapt _asso_values[c] = 1367228060Sbapt (_asso_values[c] - iter[i] * _jump) 1368228060Sbapt & (step->_asso_value_max - 1); 1369228060Sbapt iter[i] = 0; 1370228060Sbapt i++; 1371228060Sbapt } 1372228060Sbapt i = ii + 1; 1373228060Sbapt while (i < k) 1374228060Sbapt { 1375228060Sbapt unsigned int c = step->_changing[i]; 1376228060Sbapt iter[i]++; 1377228060Sbapt _asso_values[c] = 1378228060Sbapt (_asso_values[c] + _jump) & (step->_asso_value_max - 1); 1379228060Sbapt if (iter[i] < bound) 1380228060Sbapt goto found_next; 1381228060Sbapt _asso_values[c] = 1382228060Sbapt (_asso_values[c] - iter[i] * _jump) 1383228060Sbapt & (step->_asso_value_max - 1); 1384228060Sbapt iter[i] = 0; 1385228060Sbapt i++; 1386228060Sbapt } 1387228060Sbapt /* Switch from one ii to the next. */ 1388228060Sbapt { 1389228060Sbapt unsigned int c = step->_changing[ii]; 1390228060Sbapt _asso_values[c] = 1391228060Sbapt (_asso_values[c] - bound * _jump) 1392228060Sbapt & (step->_asso_value_max - 1); 1393228060Sbapt iter[ii] = 0; 1394228060Sbapt } 1395228060Sbapt /* Here all iter[i] == 0. */ 1396228060Sbapt ii++; 1397228060Sbapt if (ii == k) 1398228060Sbapt { 1399228060Sbapt ii = 0; 1400228060Sbapt bound++; 1401228060Sbapt if (bound == step->_asso_value_max) 1402228060Sbapt { 1403228060Sbapt /* Out of search space! We can either backtrack, or 1404228060Sbapt increase the available search space of this step. 1405228060Sbapt It seems simpler to choose the latter solution. */ 1406228060Sbapt step->_asso_value_max = 2 * step->_asso_value_max; 1407228060Sbapt if (step->_asso_value_max > _asso_value_max) 1408228060Sbapt { 1409228060Sbapt _asso_value_max = step->_asso_value_max; 1410228060Sbapt /* Reinitialize _max_hash_value. */ 1411228060Sbapt _max_hash_value = 1412228060Sbapt (option[NOLENGTH] ? 0 : _max_key_len) 1413228060Sbapt + (_asso_value_max - 1) * _max_selchars_length; 1414228060Sbapt /* Reinitialize _collision_detector. */ 1415228060Sbapt delete _collision_detector; 1416228060Sbapt _collision_detector = 1417228060Sbapt new Bool_Array (_max_hash_value + 1); 1418228060Sbapt } 1419228060Sbapt } 1420228060Sbapt } 1421228060Sbapt { 1422228060Sbapt unsigned int c = step->_changing[ii]; 1423228060Sbapt iter[ii] = bound; 1424228060Sbapt _asso_values[c] = 1425228060Sbapt (_asso_values[c] + bound * _jump) 1426228060Sbapt & (step->_asso_value_max - 1); 1427228060Sbapt } 1428228060Sbapt found_next: ; 1429228060Sbapt } 1430228060Sbapt else 1431228060Sbapt { 1432228060Sbapt /* Random. */ 1433228060Sbapt unsigned int c = step->_changing[ii]; 1434228060Sbapt _asso_values[c] = 1435228060Sbapt (_asso_values[c] + rand ()) & (step->_asso_value_max - 1); 1436228060Sbapt /* Next time, change the next c. */ 1437228060Sbapt ii++; 1438228060Sbapt if (ii == k) 1439228060Sbapt ii = 0; 1440228060Sbapt } 1441228060Sbapt } 1442228060Sbapt FREE_DYNAMIC_ARRAY (iter); 1443228060Sbapt 1444228060Sbapt if (option[DEBUG]) 1445228060Sbapt { 1446228060Sbapt fprintf (stderr, "Step %u chose _asso_values[", stepno); 1447228060Sbapt for (unsigned int i = 0; i < step->_changing_count; i++) 1448228060Sbapt { 1449228060Sbapt if (i > 0) 1450228060Sbapt fprintf (stderr, ","); 1451228060Sbapt fprintf (stderr, "'%c'", step->_changing[i]); 1452228060Sbapt } 1453228060Sbapt fprintf (stderr, "] in %u iterations.\n", iterations); 1454228060Sbapt } 1455228060Sbapt } 1456228060Sbapt 1457228060Sbapt /* Free allocated memory. */ 1458228060Sbapt while (steps != NULL) 1459228060Sbapt { 1460228060Sbapt Step *step = steps; 1461228060Sbapt steps = step->_next; 1462228060Sbapt delete[] step->_changing; 1463228060Sbapt delete[] step->_undetermined; 1464228060Sbapt delete_partition (step->_partition); 1465228060Sbapt delete step; 1466228060Sbapt } 1467228060Sbapt} 1468228060Sbapt 1469228060Sbapt/* Computes a keyword's hash value, relative to the current _asso_values[], 1470228060Sbapt and stores it in keyword->_hash_value. */ 1471228060Sbapt 1472228060Sbaptinline int 1473228060SbaptSearch::compute_hash (KeywordExt *keyword) const 1474228060Sbapt{ 1475228060Sbapt int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length; 1476228060Sbapt 1477228060Sbapt const unsigned int *p = keyword->_selchars; 1478228060Sbapt int i = keyword->_selchars_length; 1479228060Sbapt for (; i > 0; p++, i--) 1480228060Sbapt sum += _asso_values[*p]; 1481228060Sbapt 1482228060Sbapt return keyword->_hash_value = sum; 1483228060Sbapt} 1484228060Sbapt 1485228060Sbapt/* Finds good _asso_values[]. */ 1486228060Sbapt 1487228060Sbaptvoid 1488228060SbaptSearch::find_good_asso_values () 1489228060Sbapt{ 1490228060Sbapt prepare_asso_values (); 1491228060Sbapt 1492228060Sbapt /* Search for good _asso_values[]. */ 1493228060Sbapt int asso_iteration; 1494228060Sbapt if ((asso_iteration = option.get_asso_iterations ()) == 0) 1495228060Sbapt /* Try only the given _initial_asso_value and _jump. */ 1496228060Sbapt find_asso_values (); 1497228060Sbapt else 1498228060Sbapt { 1499228060Sbapt /* Try different pairs of _initial_asso_value and _jump, in the 1500228060Sbapt following order: 1501228060Sbapt (0, 1) 1502228060Sbapt (1, 1) 1503228060Sbapt (2, 1) (0, 3) 1504228060Sbapt (3, 1) (1, 3) 1505228060Sbapt (4, 1) (2, 3) (0, 5) 1506228060Sbapt (5, 1) (3, 3) (1, 5) 1507228060Sbapt ..... */ 1508228060Sbapt KeywordExt_List *saved_head = _head; 1509228060Sbapt int best_initial_asso_value = 0; 1510228060Sbapt int best_jump = 1; 1511228060Sbapt int *best_asso_values = new int[_alpha_size]; 1512228060Sbapt int best_collisions = INT_MAX; 1513228060Sbapt int best_max_hash_value = INT_MAX; 1514228060Sbapt 1515228060Sbapt _initial_asso_value = 0; _jump = 1; 1516228060Sbapt for (;;) 1517228060Sbapt { 1518228060Sbapt /* Restore the keyword list in its original order. */ 1519228060Sbapt _head = copy_list (saved_head); 1520228060Sbapt /* Find good _asso_values[]. */ 1521228060Sbapt find_asso_values (); 1522228060Sbapt /* Test whether it is the best solution so far. */ 1523228060Sbapt int collisions = 0; 1524228060Sbapt int max_hash_value = INT_MIN; 1525228060Sbapt _collision_detector->clear (); 1526228060Sbapt for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest()) 1527228060Sbapt { 1528228060Sbapt KeywordExt *keyword = ptr->first(); 1529228060Sbapt int hashcode = compute_hash (keyword); 1530228060Sbapt if (max_hash_value < hashcode) 1531228060Sbapt max_hash_value = hashcode; 1532228060Sbapt if (_collision_detector->set_bit (hashcode)) 1533228060Sbapt collisions++; 1534228060Sbapt } 1535228060Sbapt if (collisions < best_collisions 1536228060Sbapt || (collisions == best_collisions 1537228060Sbapt && max_hash_value < best_max_hash_value)) 1538228060Sbapt { 1539228060Sbapt memcpy (best_asso_values, _asso_values, 1540228060Sbapt _alpha_size * sizeof (_asso_values[0])); 1541228060Sbapt best_collisions = collisions; 1542228060Sbapt best_max_hash_value = max_hash_value; 1543228060Sbapt } 1544228060Sbapt /* Delete the copied keyword list. */ 1545228060Sbapt delete_list (_head); 1546228060Sbapt 1547228060Sbapt if (--asso_iteration == 0) 1548228060Sbapt break; 1549228060Sbapt /* Prepare for next iteration. */ 1550228060Sbapt if (_initial_asso_value >= 2) 1551228060Sbapt _initial_asso_value -= 2, _jump += 2; 1552228060Sbapt else 1553228060Sbapt _initial_asso_value += _jump, _jump = 1; 1554228060Sbapt } 1555228060Sbapt _head = saved_head; 1556228060Sbapt /* Install the best found asso_values. */ 1557228060Sbapt _initial_asso_value = best_initial_asso_value; 1558228060Sbapt _jump = best_jump; 1559228060Sbapt memcpy (_asso_values, best_asso_values, 1560228060Sbapt _alpha_size * sizeof (_asso_values[0])); 1561228060Sbapt delete[] best_asso_values; 1562228060Sbapt /* The keywords' _hash_value fields are recomputed below. */ 1563228060Sbapt } 1564228060Sbapt} 1565228060Sbapt 1566228060Sbapt/* ========================================================================= */ 1567228060Sbapt 1568228060Sbapt/* Comparison function for sorting by increasing _hash_value. */ 1569228060Sbaptstatic bool 1570228060Sbaptless_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2) 1571228060Sbapt{ 1572228060Sbapt return keyword1->_hash_value < keyword2->_hash_value; 1573228060Sbapt} 1574228060Sbapt 1575228060Sbapt/* Sorts the keyword list by hash value. */ 1576228060Sbapt 1577228060Sbaptvoid 1578228060SbaptSearch::sort () 1579228060Sbapt{ 1580228060Sbapt _head = mergesort_list (_head, less_by_hash_value); 1581228060Sbapt} 1582228060Sbapt 1583228060Sbaptvoid 1584228060SbaptSearch::optimize () 1585228060Sbapt{ 1586228060Sbapt /* Preparations. */ 1587228060Sbapt prepare (); 1588228060Sbapt 1589228060Sbapt /* Step 1: Finding good byte positions. */ 1590228060Sbapt find_positions (); 1591228060Sbapt 1592228060Sbapt /* Step 2: Finding good alpha increments. */ 1593228060Sbapt find_alpha_inc (); 1594228060Sbapt 1595228060Sbapt /* Step 3: Finding good asso_values. */ 1596228060Sbapt find_good_asso_values (); 1597228060Sbapt 1598228060Sbapt /* Make one final check, just to make sure nothing weird happened.... */ 1599228060Sbapt _collision_detector->clear (); 1600228060Sbapt for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest()) 1601228060Sbapt { 1602228060Sbapt KeywordExt *curr = curr_ptr->first(); 1603228060Sbapt unsigned int hashcode = compute_hash (curr); 1604228060Sbapt if (_collision_detector->set_bit (hashcode)) 1605228060Sbapt { 1606228060Sbapt /* This shouldn't happen. proj1, proj2, proj3 must have been 1607228060Sbapt computed to be injective on the given keyword set. */ 1608228060Sbapt fprintf (stderr, 1609228060Sbapt "\nInternal error, unexpected duplicate hash code\n"); 1610228060Sbapt if (option[POSITIONS]) 1611228060Sbapt fprintf (stderr, "try options -m or -r, or use new key positions.\n\n"); 1612228060Sbapt else 1613228060Sbapt fprintf (stderr, "try options -m or -r.\n\n"); 1614228060Sbapt exit (1); 1615228060Sbapt } 1616228060Sbapt } 1617228060Sbapt 1618228060Sbapt /* Sorts the keyword list by hash value. */ 1619228060Sbapt sort (); 1620228060Sbapt 1621228060Sbapt /* Set unused asso_values[c] to max_hash_value + 1. This is not absolutely 1622228060Sbapt necessary, but speeds up the lookup function in many cases of lookup 1623228060Sbapt failure: no string comparison is needed once the hash value of a string 1624228060Sbapt is larger than the hash value of any keyword. */ 1625228060Sbapt int max_hash_value; 1626228060Sbapt { 1627228060Sbapt KeywordExt_List *temp; 1628228060Sbapt for (temp = _head; temp->rest(); temp = temp->rest()) 1629228060Sbapt ; 1630228060Sbapt max_hash_value = temp->first()->_hash_value; 1631228060Sbapt } 1632228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1633228060Sbapt if (_occurrences[c] == 0) 1634228060Sbapt _asso_values[c] = max_hash_value + 1; 1635228060Sbapt 1636228060Sbapt /* Propagate unified asso_values. */ 1637228060Sbapt if (_alpha_unify) 1638228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1639228060Sbapt if (_alpha_unify[c] != c) 1640228060Sbapt _asso_values[c] = _asso_values[_alpha_unify[c]]; 1641228060Sbapt} 1642228060Sbapt 1643228060Sbapt/* Prints out some diagnostics upon completion. */ 1644228060Sbapt 1645228060SbaptSearch::~Search () 1646228060Sbapt{ 1647228060Sbapt delete _collision_detector; 1648228060Sbapt if (option[DEBUG]) 1649228060Sbapt { 1650228060Sbapt fprintf (stderr, "\ndumping occurrence and associated values tables\n"); 1651228060Sbapt 1652228060Sbapt for (unsigned int i = 0; i < _alpha_size; i++) 1653228060Sbapt if (_occurrences[i]) 1654228060Sbapt fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n", 1655228060Sbapt i, _asso_values[i], i, _occurrences[i]); 1656228060Sbapt 1657228060Sbapt fprintf (stderr, "end table dumping\n"); 1658228060Sbapt 1659228060Sbapt fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d" 1660228060Sbapt "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n", 1661228060Sbapt _list_len, _total_keys, _total_duplicates, _max_key_len); 1662228060Sbapt 1663228060Sbapt int field_width = _max_selchars_length; 1664228060Sbapt fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n", 1665228060Sbapt field_width, "selchars"); 1666228060Sbapt for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest()) 1667228060Sbapt { 1668228060Sbapt fprintf (stderr, "%11d,%11d,%6d, ", 1669228060Sbapt ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index); 1670228060Sbapt if (field_width > ptr->first()->_selchars_length) 1671228060Sbapt fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, ""); 1672228060Sbapt for (int j = 0; j < ptr->first()->_selchars_length; j++) 1673228060Sbapt putc (ptr->first()->_selchars[j], stderr); 1674228060Sbapt fprintf (stderr, ", %.*s\n", 1675228060Sbapt ptr->first()->_allchars_length, ptr->first()->_allchars); 1676228060Sbapt } 1677228060Sbapt 1678228060Sbapt fprintf (stderr, "End dumping list.\n\n"); 1679228060Sbapt } 1680228060Sbapt delete[] _asso_values; 1681228060Sbapt delete[] _occurrences; 1682228060Sbapt delete[] _alpha_unify; 1683228060Sbapt delete[] _alpha_inc; 1684228060Sbapt} 1685