1228060Sbapt/* Search algorithm. 2228060Sbapt Copyright (C) 1989-1998, 2000, 2002 Free Software Foundation, Inc. 3228060Sbapt Written by Douglas C. Schmidt <schmidt@ics.uci.edu> 4228060Sbapt and Bruno Haible <bruno@clisp.org>. 5228060Sbapt 6228060Sbapt This file is part of GNU GPERF. 7228060Sbapt 8228060Sbapt GNU GPERF is free software; you can redistribute it and/or modify 9228060Sbapt it under the terms of the GNU General Public License as published by 10228060Sbapt the Free Software Foundation; either version 2, or (at your option) 11228060Sbapt any later version. 12228060Sbapt 13228060Sbapt GNU GPERF is distributed in the hope that it will be useful, 14228060Sbapt but WITHOUT ANY WARRANTY; without even the implied warranty of 15228060Sbapt MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16228060Sbapt GNU General Public License for more details. 17228060Sbapt 18228060Sbapt You should have received a copy of the GNU General Public License 19228060Sbapt along with this program; see the file COPYING. 20228060Sbapt If not, write to the Free Software Foundation, Inc., 21228060Sbapt 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 22228060Sbapt 23228060Sbapt/* Specification. */ 24228060Sbapt#include "search.h" 25228060Sbapt 26228060Sbapt#include <stdio.h> 27228060Sbapt#include <stdlib.h> /* declares exit(), rand(), srand() */ 28228060Sbapt#include <string.h> /* declares memset(), memcmp() */ 29228060Sbapt#include <time.h> /* declares time() */ 30228060Sbapt#include <math.h> /* declares exp() */ 31228060Sbapt#include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */ 32228060Sbapt#include "options.h" 33228060Sbapt#include "hash-table.h" 34228060Sbapt#include "config.h" 35228060Sbapt 36228060Sbapt/* ============================== Portability ============================== */ 37228060Sbapt 38228060Sbapt/* Assume ISO C++ 'for' scoping rule. */ 39257085Ssbruno/* This code is used to work around scoping issues with visual studio 6 from 40257085Ssbruno * 1998. Comment it out here to queisce numerous -Wdangling-else warnings 41257085Ssbruno * from clang. 42257085Ssbruno#define for if (0) ; else for */ 43228060Sbapt 44228060Sbapt/* Dynamically allocated array with dynamic extent: 45228060Sbapt 46228060Sbapt Example: 47228060Sbapt DYNAMIC_ARRAY (my_array, int, n); 48228060Sbapt ... 49228060Sbapt FREE_DYNAMIC_ARRAY (my_array); 50228060Sbapt 51228060Sbapt Attention: depending on your implementation my_array is either the array 52228060Sbapt itself or a pointer to the array! Always use my_array only as expression! 53228060Sbapt */ 54228060Sbapt#if HAVE_DYNAMIC_ARRAY 55228060Sbapt #define DYNAMIC_ARRAY(var,eltype,size) eltype var[size] 56228060Sbapt #define FREE_DYNAMIC_ARRAY(var) 57228060Sbapt#else 58228060Sbapt #define DYNAMIC_ARRAY(var,eltype,size) eltype *var = new eltype[size] 59228060Sbapt #define FREE_DYNAMIC_ARRAY(var) delete[] var 60228060Sbapt#endif 61228060Sbapt 62228060Sbapt/* ================================ Theory ================================= */ 63228060Sbapt 64228060Sbapt/* The general form of the hash function is 65228060Sbapt 66228060Sbapt hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos) 67228060Sbapt + len (keyword) 68228060Sbapt 69228060Sbapt where Pos is a set of byte positions, 70228060Sbapt each alpha_inc[i] is a nonnegative integer, 71228060Sbapt each asso_values[c] is a nonnegative integer, 72228060Sbapt len (keyword) is the keyword's length if !option[NOLENGTH], or 0 otherwise. 73228060Sbapt 74228060Sbapt Theorem 1: If all keywords are different, there is a set Pos such that 75228060Sbapt all tuples (keyword[i] : i in Pos) are different. 76228060Sbapt 77228060Sbapt Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there 78228060Sbapt are nonnegative integers alpha_inc[i] such that all multisets 79228060Sbapt {keyword[i] + alpha_inc[i] : i in Pos} are different. 80228060Sbapt 81228060Sbapt Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}. 82228060Sbapt 83228060Sbapt Theorem 3: If all multisets selchars[keyword] are different, there are 84228060Sbapt nonnegative integers asso_values[c] such that all hash values 85228060Sbapt sum (asso_values[c] : c in selchars[keyword]) are different. 86228060Sbapt 87228060Sbapt Based on these three facts, we find the hash function in three steps: 88228060Sbapt 89228060Sbapt Step 1 (Finding good byte positions): 90228060Sbapt Find a set Pos, as small as possible, such that all tuples 91228060Sbapt (keyword[i] : i in Pos) are different. 92228060Sbapt 93228060Sbapt Step 2 (Finding good alpha increments): 94228060Sbapt Find nonnegative integers alpha_inc[i], as many of them as possible being 95228060Sbapt zero, and the others being as small as possible, such that all multisets 96228060Sbapt {keyword[i] + alpha_inc[i] : i in Pos} are different. 97228060Sbapt 98228060Sbapt Step 3 (Finding good asso_values): 99228060Sbapt Find asso_values[c] such that all hash (keyword) are different. 100228060Sbapt 101228060Sbapt In other words, each step finds a projection that is injective on the 102228060Sbapt given finite set: 103228060Sbapt proj1 : String --> Map (Pos --> N) 104228060Sbapt proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos) 105228060Sbapt proj3 : Map (Pos --> N) / S(Pos) --> N 106228060Sbapt where 107228060Sbapt N denotes the set of nonnegative integers, 108228060Sbapt Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and 109228060Sbapt S(Pos) is the symmetric group over Pos. 110228060Sbapt 111228060Sbapt This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight 112228060Sbapt modifications apply: 113228060Sbapt proj1 : String --> Map (Pos --> N) x N 114228060Sbapt proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N 115228060Sbapt proj3 : Map (Pos --> N) / S(Pos) x N --> N 116228060Sbapt 117228060Sbapt For a case-insensitive hash function, the general form is 118228060Sbapt 119228060Sbapt hash (keyword) = 120228060Sbapt sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos) 121228060Sbapt + len (keyword) 122228060Sbapt 123228060Sbapt where alpha_unify[c] is chosen so that an upper/lower case change in 124228060Sbapt keyword[i] doesn't change alpha_unify[keyword[i] + alpha_inc[i]]. 125228060Sbapt */ 126228060Sbapt 127228060Sbapt/* ==================== Initialization and Preparation ===================== */ 128228060Sbapt 129228060SbaptSearch::Search (KeywordExt_List *list) 130228060Sbapt : _head (list) 131228060Sbapt{ 132228060Sbapt} 133228060Sbapt 134228060Sbaptvoid 135228060SbaptSearch::prepare () 136228060Sbapt{ 137228060Sbapt /* Compute the total number of keywords. */ 138228060Sbapt _total_keys = 0; 139228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 140228060Sbapt _total_keys++; 141228060Sbapt 142228060Sbapt /* Compute the minimum and maximum keyword length. */ 143228060Sbapt _max_key_len = INT_MIN; 144228060Sbapt _min_key_len = INT_MAX; 145228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 146228060Sbapt { 147228060Sbapt KeywordExt *keyword = temp->first(); 148228060Sbapt 149228060Sbapt if (_max_key_len < keyword->_allchars_length) 150228060Sbapt _max_key_len = keyword->_allchars_length; 151228060Sbapt if (_min_key_len > keyword->_allchars_length) 152228060Sbapt _min_key_len = keyword->_allchars_length; 153228060Sbapt } 154228060Sbapt 155228060Sbapt /* Exit program if an empty string is used as keyword, since the comparison 156228060Sbapt expressions don't work correctly for looking up an empty string. */ 157228060Sbapt if (_min_key_len == 0) 158228060Sbapt { 159228060Sbapt fprintf (stderr, "Empty input keyword is not allowed.\n" 160228060Sbapt "To recognize an empty input keyword, your code should check for\n" 161228060Sbapt "len == 0 before calling the gperf generated lookup function.\n"); 162228060Sbapt exit (1); 163228060Sbapt } 164228060Sbapt 165228060Sbapt /* Exit program if the characters in the keywords are not in the required 166228060Sbapt range. */ 167228060Sbapt if (option[SEVENBIT]) 168228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 169228060Sbapt { 170228060Sbapt KeywordExt *keyword = temp->first(); 171228060Sbapt 172228060Sbapt const char *k = keyword->_allchars; 173228060Sbapt for (int i = keyword->_allchars_length; i > 0; k++, i--) 174228060Sbapt if (!(static_cast<unsigned char>(*k) < 128)) 175228060Sbapt { 176228060Sbapt fprintf (stderr, "Option --seven-bit has been specified,\n" 177228060Sbapt "but keyword \"%.*s\" contains non-ASCII characters.\n" 178228060Sbapt "Try removing option --seven-bit.\n", 179228060Sbapt keyword->_allchars_length, keyword->_allchars); 180228060Sbapt exit (1); 181228060Sbapt } 182228060Sbapt } 183228060Sbapt} 184228060Sbapt 185228060Sbapt/* ====================== Finding good byte positions ====================== */ 186228060Sbapt 187228060Sbapt/* Computes the upper bound on the indices passed to asso_values[], 188228060Sbapt assuming no alpha_increments. */ 189228060Sbaptunsigned int 190228060SbaptSearch::compute_alpha_size () const 191228060Sbapt{ 192228060Sbapt return (option[SEVENBIT] ? 128 : 256); 193228060Sbapt} 194228060Sbapt 195228060Sbapt/* Computes the unification rules between different asso_values[c], 196228060Sbapt assuming no alpha_increments. */ 197228060Sbaptunsigned int * 198228060SbaptSearch::compute_alpha_unify () const 199228060Sbapt{ 200228060Sbapt if (option[UPPERLOWER]) 201228060Sbapt { 202228060Sbapt /* Uppercase to lowercase mapping. */ 203228060Sbapt unsigned int alpha_size = compute_alpha_size(); 204228060Sbapt unsigned int *alpha_unify = new unsigned int[alpha_size]; 205228060Sbapt for (unsigned int c = 0; c < alpha_size; c++) 206228060Sbapt alpha_unify[c] = c; 207228060Sbapt for (unsigned int c = 'A'; c <= 'Z'; c++) 208228060Sbapt alpha_unify[c] = c + ('a'-'A'); 209228060Sbapt return alpha_unify; 210228060Sbapt } 211228060Sbapt else 212228060Sbapt /* Identity mapping. */ 213228060Sbapt return NULL; 214228060Sbapt} 215228060Sbapt 216228060Sbapt/* Initializes each keyword's _selchars array. */ 217228060Sbaptvoid 218228060SbaptSearch::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const 219228060Sbapt{ 220228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 221228060Sbapt temp->first()->init_selchars_tuple(positions, alpha_unify); 222228060Sbapt} 223228060Sbapt 224228060Sbapt/* Deletes each keyword's _selchars array. */ 225228060Sbaptvoid 226228060SbaptSearch::delete_selchars () const 227228060Sbapt{ 228228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 229228060Sbapt temp->first()->delete_selchars(); 230228060Sbapt} 231228060Sbapt 232228060Sbapt/* Count the duplicate keywords that occur with a given set of positions. 233228060Sbapt In other words, it returns the difference 234228060Sbapt # K - # proj1 (K) 235228060Sbapt where K is the multiset of given keywords. */ 236228060Sbaptunsigned int 237228060SbaptSearch::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const 238228060Sbapt{ 239228060Sbapt /* Run through the keyword list and count the duplicates incrementally. 240228060Sbapt The result does not depend on the order of the keyword list, thanks to 241228060Sbapt the formula above. */ 242228060Sbapt init_selchars_tuple (positions, alpha_unify); 243228060Sbapt 244228060Sbapt unsigned int count = 0; 245228060Sbapt { 246228060Sbapt Hash_Table representatives (_total_keys, option[NOLENGTH]); 247228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 248228060Sbapt { 249228060Sbapt KeywordExt *keyword = temp->first(); 250228060Sbapt if (representatives.insert (keyword)) 251228060Sbapt count++; 252228060Sbapt } 253228060Sbapt } 254228060Sbapt 255228060Sbapt delete_selchars (); 256228060Sbapt 257228060Sbapt return count; 258228060Sbapt} 259228060Sbapt 260228060Sbapt/* Find good key positions. */ 261228060Sbapt 262228060Sbaptvoid 263228060SbaptSearch::find_positions () 264228060Sbapt{ 265228060Sbapt /* If the user gave the key positions, we use them. */ 266228060Sbapt if (option[POSITIONS]) 267228060Sbapt { 268228060Sbapt _key_positions = option.get_key_positions(); 269228060Sbapt return; 270228060Sbapt } 271228060Sbapt 272228060Sbapt /* Compute preliminary alpha_unify table. */ 273228060Sbapt unsigned int *alpha_unify = compute_alpha_unify (); 274228060Sbapt 275228060Sbapt /* 1. Find positions that must occur in order to distinguish duplicates. */ 276228060Sbapt Positions mandatory; 277228060Sbapt 278228060Sbapt if (!option[DUP]) 279228060Sbapt { 280228060Sbapt for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest()) 281228060Sbapt { 282228060Sbapt KeywordExt *keyword1 = l1->first(); 283228060Sbapt for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest()) 284228060Sbapt { 285228060Sbapt KeywordExt *keyword2 = l2->first(); 286228060Sbapt 287228060Sbapt /* If keyword1 and keyword2 have the same length and differ 288228060Sbapt in just one position, and it is not the last character, 289228060Sbapt this position is mandatory. */ 290228060Sbapt if (keyword1->_allchars_length == keyword2->_allchars_length) 291228060Sbapt { 292228060Sbapt int n = keyword1->_allchars_length; 293228060Sbapt int i; 294228060Sbapt for (i = 0; i < n - 1; i++) 295228060Sbapt { 296228060Sbapt unsigned char c1 = keyword1->_allchars[i]; 297228060Sbapt unsigned char c2 = keyword2->_allchars[i]; 298228060Sbapt if (option[UPPERLOWER]) 299228060Sbapt { 300228060Sbapt if (c1 >= 'A' && c1 <= 'Z') 301228060Sbapt c1 += 'a' - 'A'; 302228060Sbapt if (c2 >= 'A' && c2 <= 'Z') 303228060Sbapt c2 += 'a' - 'A'; 304228060Sbapt } 305228060Sbapt if (c1 != c2) 306228060Sbapt break; 307228060Sbapt } 308228060Sbapt if (i < n - 1) 309228060Sbapt { 310228060Sbapt int j; 311228060Sbapt for (j = i + 1; j < n; j++) 312228060Sbapt { 313228060Sbapt unsigned char c1 = keyword1->_allchars[j]; 314228060Sbapt unsigned char c2 = keyword2->_allchars[j]; 315228060Sbapt if (option[UPPERLOWER]) 316228060Sbapt { 317228060Sbapt if (c1 >= 'A' && c1 <= 'Z') 318228060Sbapt c1 += 'a' - 'A'; 319228060Sbapt if (c2 >= 'A' && c2 <= 'Z') 320228060Sbapt c2 += 'a' - 'A'; 321228060Sbapt } 322228060Sbapt if (c1 != c2) 323228060Sbapt break; 324228060Sbapt } 325228060Sbapt if (j >= n) 326228060Sbapt { 327228060Sbapt /* Position i is mandatory. */ 328228060Sbapt if (!mandatory.contains (i)) 329228060Sbapt mandatory.add (i); 330228060Sbapt } 331228060Sbapt } 332228060Sbapt } 333228060Sbapt } 334228060Sbapt } 335228060Sbapt } 336228060Sbapt 337228060Sbapt /* 2. Add positions, as long as this decreases the duplicates count. */ 338228060Sbapt int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1 339228060Sbapt ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1); 340228060Sbapt Positions current = mandatory; 341228060Sbapt unsigned int current_duplicates_count = 342228060Sbapt count_duplicates_tuple (current, alpha_unify); 343228060Sbapt for (;;) 344228060Sbapt { 345228060Sbapt Positions best; 346228060Sbapt unsigned int best_duplicates_count = UINT_MAX; 347228060Sbapt 348228060Sbapt for (int i = imax; i >= -1; i--) 349228060Sbapt if (!current.contains (i)) 350228060Sbapt { 351228060Sbapt Positions tryal = current; 352228060Sbapt tryal.add (i); 353228060Sbapt unsigned int try_duplicates_count = 354228060Sbapt count_duplicates_tuple (tryal, alpha_unify); 355228060Sbapt 356228060Sbapt /* We prefer 'try' to 'best' if it produces less duplicates, 357228060Sbapt or if it produces the same number of duplicates but with 358228060Sbapt a more efficient hash function. */ 359228060Sbapt if (try_duplicates_count < best_duplicates_count 360228060Sbapt || (try_duplicates_count == best_duplicates_count && i >= 0)) 361228060Sbapt { 362228060Sbapt best = tryal; 363228060Sbapt best_duplicates_count = try_duplicates_count; 364228060Sbapt } 365228060Sbapt } 366228060Sbapt 367228060Sbapt /* Stop adding positions when it gives no improvement. */ 368228060Sbapt if (best_duplicates_count >= current_duplicates_count) 369228060Sbapt break; 370228060Sbapt 371228060Sbapt current = best; 372228060Sbapt current_duplicates_count = best_duplicates_count; 373228060Sbapt } 374228060Sbapt 375228060Sbapt /* 3. Remove positions, as long as this doesn't increase the duplicates 376228060Sbapt count. */ 377228060Sbapt for (;;) 378228060Sbapt { 379228060Sbapt Positions best; 380228060Sbapt unsigned int best_duplicates_count = UINT_MAX; 381228060Sbapt 382228060Sbapt for (int i = imax; i >= -1; i--) 383228060Sbapt if (current.contains (i) && !mandatory.contains (i)) 384228060Sbapt { 385228060Sbapt Positions tryal = current; 386228060Sbapt tryal.remove (i); 387228060Sbapt unsigned int try_duplicates_count = 388228060Sbapt count_duplicates_tuple (tryal, alpha_unify); 389228060Sbapt 390228060Sbapt /* We prefer 'try' to 'best' if it produces less duplicates, 391228060Sbapt or if it produces the same number of duplicates but with 392228060Sbapt a more efficient hash function. */ 393228060Sbapt if (try_duplicates_count < best_duplicates_count 394228060Sbapt || (try_duplicates_count == best_duplicates_count && i == -1)) 395228060Sbapt { 396228060Sbapt best = tryal; 397228060Sbapt best_duplicates_count = try_duplicates_count; 398228060Sbapt } 399228060Sbapt } 400228060Sbapt 401228060Sbapt /* Stop removing positions when it gives no improvement. */ 402228060Sbapt if (best_duplicates_count > current_duplicates_count) 403228060Sbapt break; 404228060Sbapt 405228060Sbapt current = best; 406228060Sbapt current_duplicates_count = best_duplicates_count; 407228060Sbapt } 408228060Sbapt 409228060Sbapt /* 4. Replace two positions by one, as long as this doesn't increase the 410228060Sbapt duplicates count. */ 411228060Sbapt for (;;) 412228060Sbapt { 413228060Sbapt Positions best; 414228060Sbapt unsigned int best_duplicates_count = UINT_MAX; 415228060Sbapt 416228060Sbapt for (int i1 = imax; i1 >= -1; i1--) 417228060Sbapt if (current.contains (i1) && !mandatory.contains (i1)) 418228060Sbapt for (int i2 = imax; i2 >= -1; i2--) 419228060Sbapt if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1) 420228060Sbapt for (int i3 = imax; i3 >= 0; i3--) 421228060Sbapt if (!current.contains (i3)) 422228060Sbapt { 423228060Sbapt Positions tryal = current; 424228060Sbapt tryal.remove (i1); 425228060Sbapt tryal.remove (i2); 426228060Sbapt tryal.add (i3); 427228060Sbapt unsigned int try_duplicates_count = 428228060Sbapt count_duplicates_tuple (tryal, alpha_unify); 429228060Sbapt 430228060Sbapt /* We prefer 'try' to 'best' if it produces less duplicates, 431228060Sbapt or if it produces the same number of duplicates but with 432228060Sbapt a more efficient hash function. */ 433228060Sbapt if (try_duplicates_count < best_duplicates_count 434228060Sbapt || (try_duplicates_count == best_duplicates_count 435228060Sbapt && (i1 == -1 || i2 == -1 || i3 >= 0))) 436228060Sbapt { 437228060Sbapt best = tryal; 438228060Sbapt best_duplicates_count = try_duplicates_count; 439228060Sbapt } 440228060Sbapt } 441228060Sbapt 442228060Sbapt /* Stop removing positions when it gives no improvement. */ 443228060Sbapt if (best_duplicates_count > current_duplicates_count) 444228060Sbapt break; 445228060Sbapt 446228060Sbapt current = best; 447228060Sbapt current_duplicates_count = best_duplicates_count; 448228060Sbapt } 449228060Sbapt 450228060Sbapt /* That's it. Hope it's good enough. */ 451228060Sbapt _key_positions = current; 452228060Sbapt 453228060Sbapt if (option[DEBUG]) 454228060Sbapt { 455228060Sbapt /* Print the result. */ 456228060Sbapt fprintf (stderr, "\nComputed positions: "); 457228060Sbapt PositionReverseIterator iter = _key_positions.reviterator(); 458228060Sbapt bool seen_lastchar = false; 459228060Sbapt bool first = true; 460228060Sbapt for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; ) 461228060Sbapt { 462228060Sbapt if (!first) 463228060Sbapt fprintf (stderr, ", "); 464228060Sbapt if (i == Positions::LASTCHAR) 465228060Sbapt seen_lastchar = true; 466228060Sbapt else 467228060Sbapt { 468228060Sbapt fprintf (stderr, "%d", i + 1); 469228060Sbapt first = false; 470228060Sbapt } 471228060Sbapt } 472228060Sbapt if (seen_lastchar) 473228060Sbapt { 474228060Sbapt if (!first) 475228060Sbapt fprintf (stderr, ", "); 476228060Sbapt fprintf (stderr, "$"); 477228060Sbapt } 478228060Sbapt fprintf (stderr, "\n"); 479228060Sbapt } 480228060Sbapt 481228060Sbapt /* Free preliminary alpha_unify table. */ 482228060Sbapt delete[] alpha_unify; 483228060Sbapt} 484228060Sbapt 485228060Sbapt/* Count the duplicate keywords that occur with the found set of positions. 486228060Sbapt In other words, it returns the difference 487228060Sbapt # K - # proj1 (K) 488228060Sbapt where K is the multiset of given keywords. */ 489228060Sbaptunsigned int 490228060SbaptSearch::count_duplicates_tuple () const 491228060Sbapt{ 492228060Sbapt unsigned int *alpha_unify = compute_alpha_unify (); 493228060Sbapt unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify); 494228060Sbapt delete[] alpha_unify; 495228060Sbapt return count; 496228060Sbapt} 497228060Sbapt 498228060Sbapt/* ===================== Finding good alpha increments ===================== */ 499228060Sbapt 500228060Sbapt/* Computes the upper bound on the indices passed to asso_values[]. */ 501228060Sbaptunsigned int 502228060SbaptSearch::compute_alpha_size (const unsigned int *alpha_inc) const 503228060Sbapt{ 504228060Sbapt unsigned int max_alpha_inc = 0; 505228060Sbapt for (int i = 0; i < _max_key_len; i++) 506228060Sbapt if (max_alpha_inc < alpha_inc[i]) 507228060Sbapt max_alpha_inc = alpha_inc[i]; 508228060Sbapt return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc; 509228060Sbapt} 510228060Sbapt 511228060Sbapt/* Computes the unification rules between different asso_values[c]. */ 512228060Sbaptunsigned int * 513228060SbaptSearch::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const 514228060Sbapt{ 515228060Sbapt if (option[UPPERLOWER]) 516228060Sbapt { 517228060Sbapt /* Without alpha increments, we would simply unify 518228060Sbapt 'A' -> 'a', ..., 'Z' -> 'z'. 519228060Sbapt But when a keyword contains at position i a character c, 520228060Sbapt we have the constraint 521228060Sbapt asso_values[tolower(c) + alpha_inc[i]] == 522228060Sbapt asso_values[toupper(c) + alpha_inc[i]]. 523228060Sbapt This introduces a unification 524228060Sbapt toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i]. 525228060Sbapt Note that this unification can extend outside the range of 526228060Sbapt ASCII letters! But still every unified character pair is at 527228060Sbapt a distance of 'a'-'A' = 32, or (after chained unification) 528228060Sbapt at a multiple of 32. So in the end the alpha_unify vector has 529228060Sbapt the form c -> c + 32 * f(c) where f(c) is a nonnegative 530228060Sbapt integer. */ 531228060Sbapt unsigned int alpha_size = compute_alpha_size (alpha_inc); 532228060Sbapt 533228060Sbapt unsigned int *alpha_unify = new unsigned int[alpha_size]; 534228060Sbapt for (unsigned int c = 0; c < alpha_size; c++) 535228060Sbapt alpha_unify[c] = c; 536228060Sbapt 537228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 538228060Sbapt { 539228060Sbapt KeywordExt *keyword = temp->first(); 540228060Sbapt 541228060Sbapt /* Iterate through the selected character positions. */ 542228060Sbapt PositionIterator iter = positions.iterator(keyword->_allchars_length); 543228060Sbapt 544228060Sbapt for (int i; (i = iter.next ()) != PositionIterator::EOS; ) 545228060Sbapt { 546228060Sbapt unsigned int c; 547228060Sbapt if (i == Positions::LASTCHAR) 548228060Sbapt c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]); 549228060Sbapt else if (i < keyword->_allchars_length) 550228060Sbapt c = static_cast<unsigned char>(keyword->_allchars[i]); 551228060Sbapt else 552228060Sbapt abort (); 553228060Sbapt if (c >= 'A' && c <= 'Z') 554228060Sbapt c += 'a' - 'A'; 555228060Sbapt if (c >= 'a' && c <= 'z') 556228060Sbapt { 557228060Sbapt if (i != Positions::LASTCHAR) 558228060Sbapt c += alpha_inc[i]; 559228060Sbapt /* Unify c with c - ('a'-'A'). */ 560228060Sbapt unsigned int d = alpha_unify[c]; 561228060Sbapt unsigned int b = c - ('a'-'A'); 562228060Sbapt for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A')) 563228060Sbapt alpha_unify[a] = d; 564228060Sbapt } 565228060Sbapt } 566228060Sbapt } 567228060Sbapt return alpha_unify; 568228060Sbapt } 569228060Sbapt else 570228060Sbapt /* Identity mapping. */ 571228060Sbapt return NULL; 572228060Sbapt} 573228060Sbapt 574228060Sbapt/* Initializes each keyword's _selchars array. */ 575228060Sbaptvoid 576228060SbaptSearch::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const 577228060Sbapt{ 578228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 579228060Sbapt temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc); 580228060Sbapt} 581228060Sbapt 582228060Sbapt/* Count the duplicate keywords that occur with the given set of positions 583228060Sbapt and a given alpha_inc[] array. 584228060Sbapt In other words, it returns the difference 585228060Sbapt # K - # proj2 (proj1 (K)) 586228060Sbapt where K is the multiset of given keywords. */ 587228060Sbaptunsigned int 588228060SbaptSearch::count_duplicates_multiset (const unsigned int *alpha_inc) const 589228060Sbapt{ 590228060Sbapt /* Run through the keyword list and count the duplicates incrementally. 591228060Sbapt The result does not depend on the order of the keyword list, thanks to 592228060Sbapt the formula above. */ 593228060Sbapt unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc); 594228060Sbapt init_selchars_multiset (_key_positions, alpha_unify, alpha_inc); 595228060Sbapt 596228060Sbapt unsigned int count = 0; 597228060Sbapt { 598228060Sbapt Hash_Table representatives (_total_keys, option[NOLENGTH]); 599228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 600228060Sbapt { 601228060Sbapt KeywordExt *keyword = temp->first(); 602228060Sbapt if (representatives.insert (keyword)) 603228060Sbapt count++; 604228060Sbapt } 605228060Sbapt } 606228060Sbapt 607228060Sbapt delete_selchars (); 608228060Sbapt delete[] alpha_unify; 609228060Sbapt 610228060Sbapt return count; 611228060Sbapt} 612228060Sbapt 613228060Sbapt/* Find good _alpha_inc[]. */ 614228060Sbapt 615228060Sbaptvoid 616228060SbaptSearch::find_alpha_inc () 617228060Sbapt{ 618228060Sbapt /* The goal is to choose _alpha_inc[] such that it doesn't introduce 619228060Sbapt artificial duplicates. 620228060Sbapt In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */ 621228060Sbapt unsigned int duplicates_goal = count_duplicates_tuple (); 622228060Sbapt 623228060Sbapt /* Start with zero increments. This is sufficient in most cases. */ 624228060Sbapt unsigned int *current = new unsigned int [_max_key_len]; 625228060Sbapt for (int i = 0; i < _max_key_len; i++) 626228060Sbapt current[i] = 0; 627228060Sbapt unsigned int current_duplicates_count = count_duplicates_multiset (current); 628228060Sbapt 629228060Sbapt if (current_duplicates_count > duplicates_goal) 630228060Sbapt { 631228060Sbapt /* Look which _alpha_inc[i] we are free to increment. */ 632228060Sbapt unsigned int nindices; 633228060Sbapt { 634228060Sbapt nindices = 0; 635228060Sbapt PositionIterator iter = _key_positions.iterator(_max_key_len); 636228060Sbapt for (;;) 637228060Sbapt { 638228060Sbapt int key_pos = iter.next (); 639228060Sbapt if (key_pos == PositionIterator::EOS) 640228060Sbapt break; 641228060Sbapt if (key_pos != Positions::LASTCHAR) 642228060Sbapt nindices++; 643228060Sbapt } 644228060Sbapt } 645228060Sbapt 646228060Sbapt DYNAMIC_ARRAY (indices, unsigned int, nindices); 647228060Sbapt { 648228060Sbapt unsigned int j = 0; 649228060Sbapt PositionIterator iter = _key_positions.iterator(_max_key_len); 650228060Sbapt for (;;) 651228060Sbapt { 652228060Sbapt int key_pos = iter.next (); 653228060Sbapt if (key_pos == PositionIterator::EOS) 654228060Sbapt break; 655228060Sbapt if (key_pos != Positions::LASTCHAR) 656228060Sbapt indices[j++] = key_pos; 657228060Sbapt } 658228060Sbapt if (!(j == nindices)) 659228060Sbapt abort (); 660228060Sbapt } 661228060Sbapt 662228060Sbapt /* Perform several rounds of searching for a good alpha increment. 663228060Sbapt Each round reduces the number of artificial collisions by adding 664228060Sbapt an increment in a single key position. */ 665228060Sbapt DYNAMIC_ARRAY (best, unsigned int, _max_key_len); 666228060Sbapt DYNAMIC_ARRAY (tryal, unsigned int, _max_key_len); 667228060Sbapt do 668228060Sbapt { 669228060Sbapt /* An increment of 1 is not always enough. Try higher increments 670228060Sbapt also. */ 671228060Sbapt for (unsigned int inc = 1; ; inc++) 672228060Sbapt { 673228060Sbapt unsigned int best_duplicates_count = UINT_MAX; 674228060Sbapt 675228060Sbapt for (unsigned int j = 0; j < nindices; j++) 676228060Sbapt { 677228060Sbapt memcpy (tryal, current, _max_key_len * sizeof (unsigned int)); 678228060Sbapt tryal[indices[j]] += inc; 679228060Sbapt unsigned int try_duplicates_count = 680228060Sbapt count_duplicates_multiset (tryal); 681228060Sbapt 682228060Sbapt /* We prefer 'try' to 'best' if it produces less 683228060Sbapt duplicates. */ 684228060Sbapt if (try_duplicates_count < best_duplicates_count) 685228060Sbapt { 686228060Sbapt memcpy (best, tryal, _max_key_len * sizeof (unsigned int)); 687228060Sbapt best_duplicates_count = try_duplicates_count; 688228060Sbapt } 689228060Sbapt } 690228060Sbapt 691228060Sbapt /* Stop this round when we got an improvement. */ 692228060Sbapt if (best_duplicates_count < current_duplicates_count) 693228060Sbapt { 694228060Sbapt memcpy (current, best, _max_key_len * sizeof (unsigned int)); 695228060Sbapt current_duplicates_count = best_duplicates_count; 696228060Sbapt break; 697228060Sbapt } 698228060Sbapt } 699228060Sbapt } 700228060Sbapt while (current_duplicates_count > duplicates_goal); 701228060Sbapt FREE_DYNAMIC_ARRAY (tryal); 702228060Sbapt FREE_DYNAMIC_ARRAY (best); 703228060Sbapt 704228060Sbapt if (option[DEBUG]) 705228060Sbapt { 706228060Sbapt /* Print the result. */ 707228060Sbapt fprintf (stderr, "\nComputed alpha increments: "); 708228060Sbapt bool first = true; 709228060Sbapt for (unsigned int j = nindices; j-- > 0; ) 710228060Sbapt if (current[indices[j]] != 0) 711228060Sbapt { 712228060Sbapt if (!first) 713228060Sbapt fprintf (stderr, ", "); 714228060Sbapt fprintf (stderr, "%u:+%u", 715228060Sbapt indices[j] + 1, current[indices[j]]); 716228060Sbapt first = false; 717228060Sbapt } 718228060Sbapt fprintf (stderr, "\n"); 719228060Sbapt } 720228060Sbapt FREE_DYNAMIC_ARRAY (indices); 721228060Sbapt } 722228060Sbapt 723228060Sbapt _alpha_inc = current; 724228060Sbapt _alpha_size = compute_alpha_size (_alpha_inc); 725228060Sbapt _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc); 726228060Sbapt} 727228060Sbapt 728228060Sbapt/* ======================= Finding good asso_values ======================== */ 729228060Sbapt 730228060Sbapt/* Initializes the asso_values[] related parameters. */ 731228060Sbapt 732228060Sbaptvoid 733228060SbaptSearch::prepare_asso_values () 734228060Sbapt{ 735228060Sbapt KeywordExt_List *temp; 736228060Sbapt 737228060Sbapt /* Initialize each keyword's _selchars array. */ 738228060Sbapt init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc); 739228060Sbapt 740228060Sbapt /* Compute the maximum _selchars_length over all keywords. */ 741228060Sbapt _max_selchars_length = _key_positions.iterator(_max_key_len).remaining(); 742228060Sbapt 743228060Sbapt /* Check for duplicates, i.e. keywords with the same _selchars array 744228060Sbapt (and - if !option[NOLENGTH] - also the same length). 745228060Sbapt We deal with these by building an equivalence class, so that only 746228060Sbapt 1 keyword is representative of the entire collection. Only this 747228060Sbapt representative remains in the keyword list; the others are accessible 748228060Sbapt through the _duplicate_link chain, starting at the representative. 749228060Sbapt This *greatly* simplifies processing during later stages of the program. 750228060Sbapt Set _total_duplicates and _list_len = _total_keys - _total_duplicates. */ 751228060Sbapt { 752228060Sbapt _list_len = _total_keys; 753228060Sbapt _total_duplicates = 0; 754228060Sbapt /* Make hash table for efficiency. */ 755228060Sbapt Hash_Table representatives (_list_len, option[NOLENGTH]); 756228060Sbapt 757228060Sbapt KeywordExt_List *prev = NULL; /* list node before temp */ 758228060Sbapt for (temp = _head; temp; ) 759228060Sbapt { 760228060Sbapt KeywordExt *keyword = temp->first(); 761228060Sbapt KeywordExt *other_keyword = representatives.insert (keyword); 762228060Sbapt KeywordExt_List *garbage = NULL; 763228060Sbapt 764228060Sbapt if (other_keyword) 765228060Sbapt { 766228060Sbapt _total_duplicates++; 767228060Sbapt _list_len--; 768228060Sbapt /* Remove keyword from the main list. */ 769228060Sbapt prev->rest() = temp->rest(); 770228060Sbapt garbage = temp; 771228060Sbapt /* And insert it on other_keyword's duplicate list. */ 772228060Sbapt keyword->_duplicate_link = other_keyword->_duplicate_link; 773228060Sbapt other_keyword->_duplicate_link = keyword; 774228060Sbapt 775228060Sbapt /* Complain if user hasn't enabled the duplicate option. */ 776228060Sbapt if (!option[DUP] || option[DEBUG]) 777228060Sbapt { 778228060Sbapt fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"", 779228060Sbapt keyword->_allchars_length, keyword->_allchars, 780228060Sbapt other_keyword->_allchars_length, other_keyword->_allchars); 781228060Sbapt for (int j = 0; j < keyword->_selchars_length; j++) 782228060Sbapt putc (keyword->_selchars[j], stderr); 783228060Sbapt fprintf (stderr, "\".\n"); 784228060Sbapt } 785228060Sbapt } 786228060Sbapt else 787228060Sbapt { 788228060Sbapt keyword->_duplicate_link = NULL; 789228060Sbapt prev = temp; 790228060Sbapt } 791228060Sbapt temp = temp->rest(); 792228060Sbapt if (garbage) 793228060Sbapt delete garbage; 794228060Sbapt } 795228060Sbapt if (option[DEBUG]) 796228060Sbapt representatives.dump(); 797228060Sbapt } 798228060Sbapt 799228060Sbapt /* Exit program if duplicates exists and option[DUP] not set, since we 800228060Sbapt don't want to continue in this case. (We don't want to turn on 801228060Sbapt option[DUP] implicitly, because the generated code is usually much 802228060Sbapt slower. */ 803228060Sbapt if (_total_duplicates) 804228060Sbapt { 805228060Sbapt if (option[DUP]) 806228060Sbapt fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n", 807228060Sbapt _total_duplicates); 808228060Sbapt else 809228060Sbapt { 810228060Sbapt fprintf (stderr, "%d input keys have identical hash values,\n", 811228060Sbapt _total_duplicates); 812228060Sbapt if (option[POSITIONS]) 813228060Sbapt fprintf (stderr, "try different key positions or use option -D.\n"); 814228060Sbapt else 815228060Sbapt fprintf (stderr, "use option -D.\n"); 816228060Sbapt exit (1); 817228060Sbapt } 818228060Sbapt } 819228060Sbapt 820228060Sbapt /* Compute the occurrences of each character in the alphabet. */ 821228060Sbapt _occurrences = new int[_alpha_size]; 822228060Sbapt memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0])); 823228060Sbapt for (temp = _head; temp; temp = temp->rest()) 824228060Sbapt { 825228060Sbapt KeywordExt *keyword = temp->first(); 826228060Sbapt const unsigned int *ptr = keyword->_selchars; 827228060Sbapt for (int count = keyword->_selchars_length; count > 0; ptr++, count--) 828228060Sbapt _occurrences[*ptr]++; 829228060Sbapt } 830228060Sbapt 831228060Sbapt /* Memory allocation. */ 832228060Sbapt _asso_values = new int[_alpha_size]; 833228060Sbapt 834228060Sbapt int non_linked_length = _list_len; 835228060Sbapt unsigned int asso_value_max; 836228060Sbapt 837228060Sbapt asso_value_max = 838228060Sbapt static_cast<unsigned int>(non_linked_length * option.get_size_multiple()); 839228060Sbapt /* Round up to the next power of two. This makes it easy to ensure 840228060Sbapt an _asso_value[c] is >= 0 and < asso_value_max. Also, the jump value 841228060Sbapt being odd, it guarantees that Search::try_asso_value() will iterate 842228060Sbapt through different values for _asso_value[c]. */ 843228060Sbapt if (asso_value_max == 0) 844228060Sbapt asso_value_max = 1; 845228060Sbapt asso_value_max |= asso_value_max >> 1; 846228060Sbapt asso_value_max |= asso_value_max >> 2; 847228060Sbapt asso_value_max |= asso_value_max >> 4; 848228060Sbapt asso_value_max |= asso_value_max >> 8; 849228060Sbapt asso_value_max |= asso_value_max >> 16; 850228060Sbapt asso_value_max++; 851228060Sbapt _asso_value_max = asso_value_max; 852228060Sbapt 853228060Sbapt /* Given the bound for _asso_values[c], we have a bound for the possible 854228060Sbapt hash values, as computed in compute_hash(). */ 855228060Sbapt _max_hash_value = (option[NOLENGTH] ? 0 : _max_key_len) 856228060Sbapt + (_asso_value_max - 1) * _max_selchars_length; 857228060Sbapt /* Allocate a sparse bit vector for detection of collisions of hash 858228060Sbapt values. */ 859228060Sbapt _collision_detector = new Bool_Array (_max_hash_value + 1); 860228060Sbapt 861228060Sbapt if (option[DEBUG]) 862228060Sbapt { 863228060Sbapt fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d" 864228060Sbapt "\nmaximum size of generated hash table is %d\n", 865228060Sbapt non_linked_length, asso_value_max, _max_hash_value); 866228060Sbapt 867228060Sbapt int field_width; 868228060Sbapt 869228060Sbapt field_width = 0; 870228060Sbapt { 871228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 872228060Sbapt { 873228060Sbapt KeywordExt *keyword = temp->first(); 874228060Sbapt if (field_width < keyword->_selchars_length) 875228060Sbapt field_width = keyword->_selchars_length; 876228060Sbapt } 877228060Sbapt } 878228060Sbapt 879228060Sbapt fprintf (stderr, "\ndumping the keyword list without duplicates\n"); 880228060Sbapt fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig"); 881228060Sbapt int i = 0; 882228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 883228060Sbapt { 884228060Sbapt KeywordExt *keyword = temp->first(); 885228060Sbapt fprintf (stderr, "%9d, ", ++i); 886228060Sbapt if (field_width > keyword->_selchars_length) 887228060Sbapt fprintf (stderr, "%*s", field_width - keyword->_selchars_length, ""); 888228060Sbapt for (int j = 0; j < keyword->_selchars_length; j++) 889228060Sbapt putc (keyword->_selchars[j], stderr); 890228060Sbapt fprintf (stderr, ", %.*s\n", 891228060Sbapt keyword->_allchars_length, keyword->_allchars); 892228060Sbapt } 893228060Sbapt fprintf (stderr, "\nend of keyword list\n\n"); 894228060Sbapt } 895228060Sbapt 896228060Sbapt if (option[RANDOM] || option.get_jump () == 0) 897228060Sbapt /* We will use rand(), so initialize the random number generator. */ 898228060Sbapt srand (static_cast<long>(time (0))); 899228060Sbapt 900228060Sbapt _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ()); 901228060Sbapt _jump = option.get_jump (); 902228060Sbapt} 903228060Sbapt 904228060Sbapt/* Finds some _asso_values[] that fit. */ 905228060Sbapt 906228060Sbapt/* The idea is to choose the _asso_values[] one by one, in a way that 907228060Sbapt a choice that has been made never needs to be undone later. This 908228060Sbapt means that we split the work into several steps. Each step chooses 909228060Sbapt one or more _asso_values[c]. The result of choosing one or more 910228060Sbapt _asso_values[c] is that the partitioning of the keyword set gets 911228060Sbapt broader. 912228060Sbapt Look at this partitioning: After every step, the _asso_values[] of a 913228060Sbapt certain set C of characters are undetermined. (At the beginning, C 914228060Sbapt is the set of characters c with _occurrences[c] > 0. At the end, C 915228060Sbapt is empty.) To each keyword K, we associate the multiset of _selchars 916228060Sbapt for which the _asso_values[] are undetermined: 917228060Sbapt K --> K->_selchars intersect C. 918228060Sbapt Consider two keywords equivalent if their value under this mapping is 919228060Sbapt the same. This introduces an equivalence relation on the set of 920228060Sbapt keywords. The equivalence classes partition the keyword set. (At the 921228060Sbapt beginning, the partition is the finest possible: each K is an equivalence 922228060Sbapt class by itself, because all K have a different _selchars. At the end, 923228060Sbapt all K have been merged into a single equivalence class.) 924228060Sbapt The partition before a step is always a refinement of the partition 925228060Sbapt after the step. 926228060Sbapt We choose the steps in such a way that the partition really becomes 927228060Sbapt broader at each step. (A step that only chooses an _asso_values[c] 928228060Sbapt without changing the partition is better merged with the previous step, 929228060Sbapt to avoid useless backtracking.) */ 930228060Sbapt 931228060Sbaptstruct EquivalenceClass 932228060Sbapt{ 933228060Sbapt /* The keywords in this equivalence class. */ 934228060Sbapt KeywordExt_List * _keywords; 935228060Sbapt KeywordExt_List * _keywords_last; 936228060Sbapt /* The number of keywords in this equivalence class. */ 937228060Sbapt unsigned int _cardinality; 938228060Sbapt /* The undetermined selected characters for the keywords in this 939228060Sbapt equivalence class, as a canonically reordered multiset. */ 940228060Sbapt unsigned int * _undetermined_chars; 941228060Sbapt unsigned int _undetermined_chars_length; 942228060Sbapt 943228060Sbapt EquivalenceClass * _next; 944228060Sbapt}; 945228060Sbapt 946228060Sbaptstruct Step 947228060Sbapt{ 948228060Sbapt /* The characters whose values are being determined in this step. */ 949228060Sbapt unsigned int _changing_count; 950228060Sbapt unsigned int * _changing; 951228060Sbapt /* Exclusive upper bound for the _asso_values[c] of this step. 952228060Sbapt A power of 2. */ 953228060Sbapt unsigned int _asso_value_max; 954228060Sbapt /* The characters whose values will be determined after this step. */ 955228060Sbapt bool * _undetermined; 956228060Sbapt /* The keyword set partition after this step. */ 957228060Sbapt EquivalenceClass * _partition; 958228060Sbapt /* The expected number of iterations in this step. */ 959228060Sbapt double _expected_lower; 960228060Sbapt double _expected_upper; 961228060Sbapt 962228060Sbapt Step * _next; 963228060Sbapt}; 964228060Sbapt 965228060Sbaptstatic inline bool 966228060Sbaptequals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len) 967228060Sbapt{ 968228060Sbapt while (len > 0) 969228060Sbapt { 970228060Sbapt if (*ptr1 != *ptr2) 971228060Sbapt return false; 972228060Sbapt ptr1++; 973228060Sbapt ptr2++; 974228060Sbapt len--; 975228060Sbapt } 976228060Sbapt return true; 977228060Sbapt} 978228060Sbapt 979228060SbaptEquivalenceClass * 980228060SbaptSearch::compute_partition (bool *undetermined) const 981228060Sbapt{ 982228060Sbapt EquivalenceClass *partition = NULL; 983228060Sbapt EquivalenceClass *partition_last = NULL; 984228060Sbapt for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 985228060Sbapt { 986228060Sbapt KeywordExt *keyword = temp->first(); 987228060Sbapt 988228060Sbapt /* Compute the undetermined characters for this keyword. */ 989228060Sbapt unsigned int *undetermined_chars = 990228060Sbapt new unsigned int[keyword->_selchars_length]; 991228060Sbapt unsigned int undetermined_chars_length = 0; 992228060Sbapt 993228060Sbapt for (int i = 0; i < keyword->_selchars_length; i++) 994228060Sbapt if (undetermined[keyword->_selchars[i]]) 995228060Sbapt undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i]; 996228060Sbapt 997228060Sbapt /* Look up the equivalence class to which this keyword belongs. */ 998228060Sbapt EquivalenceClass *equclass; 999228060Sbapt for (equclass = partition; equclass; equclass = equclass->_next) 1000228060Sbapt if (equclass->_undetermined_chars_length == undetermined_chars_length 1001228060Sbapt && equals (equclass->_undetermined_chars, undetermined_chars, 1002228060Sbapt undetermined_chars_length)) 1003228060Sbapt break; 1004228060Sbapt if (equclass == NULL) 1005228060Sbapt { 1006228060Sbapt equclass = new EquivalenceClass(); 1007228060Sbapt equclass->_keywords = NULL; 1008228060Sbapt equclass->_keywords_last = NULL; 1009228060Sbapt equclass->_cardinality = 0; 1010228060Sbapt equclass->_undetermined_chars = undetermined_chars; 1011228060Sbapt equclass->_undetermined_chars_length = undetermined_chars_length; 1012228060Sbapt equclass->_next = NULL; 1013228060Sbapt if (partition) 1014228060Sbapt partition_last->_next = equclass; 1015228060Sbapt else 1016228060Sbapt partition = equclass; 1017228060Sbapt partition_last = equclass; 1018228060Sbapt } 1019228060Sbapt else 1020228060Sbapt delete[] undetermined_chars; 1021228060Sbapt 1022228060Sbapt /* Add the keyword to the equivalence class. */ 1023228060Sbapt KeywordExt_List *cons = new KeywordExt_List(keyword); 1024228060Sbapt if (equclass->_keywords) 1025228060Sbapt equclass->_keywords_last->rest() = cons; 1026228060Sbapt else 1027228060Sbapt equclass->_keywords = cons; 1028228060Sbapt equclass->_keywords_last = cons; 1029228060Sbapt equclass->_cardinality++; 1030228060Sbapt } 1031228060Sbapt 1032228060Sbapt /* Free some of the allocated memory. The caller doesn't need it. */ 1033228060Sbapt for (EquivalenceClass *cls = partition; cls; cls = cls->_next) 1034228060Sbapt delete[] cls->_undetermined_chars; 1035228060Sbapt 1036228060Sbapt return partition; 1037228060Sbapt} 1038228060Sbapt 1039228060Sbaptstatic void 1040228060Sbaptdelete_partition (EquivalenceClass *partition) 1041228060Sbapt{ 1042228060Sbapt while (partition != NULL) 1043228060Sbapt { 1044228060Sbapt EquivalenceClass *equclass = partition; 1045228060Sbapt partition = equclass->_next; 1046228060Sbapt delete_list (equclass->_keywords); 1047228060Sbapt //delete[] equclass->_undetermined_chars; // already freed above 1048228060Sbapt delete equclass; 1049228060Sbapt } 1050228060Sbapt} 1051228060Sbapt 1052228060Sbapt/* Compute the possible number of collisions when _asso_values[c] is 1053228060Sbapt chosen, leading to the given partition. */ 1054228060Sbaptunsigned int 1055228060SbaptSearch::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const 1056228060Sbapt{ 1057228060Sbapt /* Every equivalence class p is split according to the frequency of 1058228060Sbapt occurrence of c, leading to equivalence classes p1, p2, ... 1059228060Sbapt This leads to (|p|^2 - |p1|^2 - |p2|^2 - ...)/2 possible collisions. 1060228060Sbapt Return the sum of this expression over all equivalence classes. */ 1061228060Sbapt unsigned int sum = 0; 1062228060Sbapt unsigned int m = _max_selchars_length; 1063228060Sbapt DYNAMIC_ARRAY (split_cardinalities, unsigned int, m + 1); 1064228060Sbapt for (EquivalenceClass *cls = partition; cls; cls = cls->_next) 1065228060Sbapt { 1066228060Sbapt for (unsigned int i = 0; i <= m; i++) 1067228060Sbapt split_cardinalities[i] = 0; 1068228060Sbapt 1069228060Sbapt for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) 1070228060Sbapt { 1071228060Sbapt KeywordExt *keyword = temp->first(); 1072228060Sbapt 1073228060Sbapt unsigned int count = 0; 1074228060Sbapt for (int i = 0; i < keyword->_selchars_length; i++) 1075228060Sbapt if (keyword->_selchars[i] == c) 1076228060Sbapt count++; 1077228060Sbapt 1078228060Sbapt split_cardinalities[count]++; 1079228060Sbapt } 1080228060Sbapt 1081228060Sbapt sum += cls->_cardinality * cls->_cardinality; 1082228060Sbapt for (unsigned int i = 0; i <= m; i++) 1083228060Sbapt sum -= split_cardinalities[i] * split_cardinalities[i]; 1084228060Sbapt } 1085228060Sbapt FREE_DYNAMIC_ARRAY (split_cardinalities); 1086228060Sbapt return sum; 1087228060Sbapt} 1088228060Sbapt 1089228060Sbapt/* Test whether adding c to the undetermined characters changes the given 1090228060Sbapt partition. */ 1091228060Sbaptbool 1092228060SbaptSearch::unchanged_partition (EquivalenceClass *partition, unsigned int c) const 1093228060Sbapt{ 1094228060Sbapt for (EquivalenceClass *cls = partition; cls; cls = cls->_next) 1095228060Sbapt { 1096228060Sbapt unsigned int first_count = UINT_MAX; 1097228060Sbapt 1098228060Sbapt for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) 1099228060Sbapt { 1100228060Sbapt KeywordExt *keyword = temp->first(); 1101228060Sbapt 1102228060Sbapt unsigned int count = 0; 1103228060Sbapt for (int i = 0; i < keyword->_selchars_length; i++) 1104228060Sbapt if (keyword->_selchars[i] == c) 1105228060Sbapt count++; 1106228060Sbapt 1107228060Sbapt if (temp == cls->_keywords) 1108228060Sbapt first_count = count; 1109228060Sbapt else if (count != first_count) 1110228060Sbapt /* c would split this equivalence class. */ 1111228060Sbapt return false; 1112228060Sbapt } 1113228060Sbapt } 1114228060Sbapt return true; 1115228060Sbapt} 1116228060Sbapt 1117228060Sbaptvoid 1118228060SbaptSearch::find_asso_values () 1119228060Sbapt{ 1120228060Sbapt Step *steps; 1121228060Sbapt 1122228060Sbapt /* Determine the steps, starting with the last one. */ 1123228060Sbapt { 1124228060Sbapt bool *undetermined; 1125228060Sbapt bool *determined; 1126228060Sbapt 1127228060Sbapt steps = NULL; 1128228060Sbapt 1129228060Sbapt undetermined = new bool[_alpha_size]; 1130228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1131228060Sbapt undetermined[c] = false; 1132228060Sbapt 1133228060Sbapt determined = new bool[_alpha_size]; 1134228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1135228060Sbapt determined[c] = true; 1136228060Sbapt 1137228060Sbapt for (;;) 1138228060Sbapt { 1139228060Sbapt /* Compute the partition that needs to be refined. */ 1140228060Sbapt EquivalenceClass *partition = compute_partition (undetermined); 1141228060Sbapt 1142228060Sbapt /* Determine the main character to be chosen in this step. 1143228060Sbapt Choosing such a character c has the effect of splitting every 1144228060Sbapt equivalence class (according the the frequency of occurrence of c). 1145228060Sbapt We choose the c with the minimum number of possible collisions, 1146228060Sbapt so that characters which lead to a large number of collisions get 1147228060Sbapt handled early during the search. */ 1148228060Sbapt unsigned int chosen_c; 1149228060Sbapt unsigned int chosen_possible_collisions; 1150228060Sbapt { 1151228060Sbapt unsigned int best_c = 0; 1152228060Sbapt unsigned int best_possible_collisions = UINT_MAX; 1153228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1154228060Sbapt if (_occurrences[c] > 0 && determined[c]) 1155228060Sbapt { 1156228060Sbapt unsigned int possible_collisions = 1157228060Sbapt count_possible_collisions (partition, c); 1158228060Sbapt if (possible_collisions < best_possible_collisions) 1159228060Sbapt { 1160228060Sbapt best_c = c; 1161228060Sbapt best_possible_collisions = possible_collisions; 1162228060Sbapt } 1163228060Sbapt } 1164228060Sbapt if (best_possible_collisions == UINT_MAX) 1165228060Sbapt { 1166228060Sbapt /* All c with _occurrences[c] > 0 are undetermined. We are 1167228060Sbapt are the starting situation and don't need any more step. */ 1168228060Sbapt delete_partition (partition); 1169228060Sbapt break; 1170228060Sbapt } 1171228060Sbapt chosen_c = best_c; 1172228060Sbapt chosen_possible_collisions = best_possible_collisions; 1173228060Sbapt } 1174228060Sbapt 1175228060Sbapt /* We need one more step. */ 1176228060Sbapt Step *step = new Step(); 1177228060Sbapt 1178228060Sbapt step->_undetermined = new bool[_alpha_size]; 1179228060Sbapt memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool)); 1180228060Sbapt 1181228060Sbapt step->_partition = partition; 1182228060Sbapt 1183228060Sbapt /* Now determine how the equivalence classes will be before this 1184228060Sbapt step. */ 1185228060Sbapt undetermined[chosen_c] = true; 1186228060Sbapt partition = compute_partition (undetermined); 1187228060Sbapt 1188228060Sbapt /* Now determine which other characters should be determined in this 1189228060Sbapt step, because they will not change the equivalence classes at 1190228060Sbapt this point. It is the set of all c which, for all equivalence 1191228060Sbapt classes, have the same frequency of occurrence in every keyword 1192228060Sbapt of the equivalence class. */ 1193228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1194228060Sbapt if (_occurrences[c] > 0 && determined[c] 1195228060Sbapt && unchanged_partition (partition, c)) 1196228060Sbapt { 1197228060Sbapt undetermined[c] = true; 1198228060Sbapt determined[c] = false; 1199228060Sbapt } 1200228060Sbapt 1201228060Sbapt /* main_c must be one of these. */ 1202228060Sbapt if (determined[chosen_c]) 1203228060Sbapt abort (); 1204228060Sbapt 1205228060Sbapt /* Now the set of changing characters of this step. */ 1206228060Sbapt unsigned int changing_count; 1207228060Sbapt 1208228060Sbapt changing_count = 0; 1209228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1210228060Sbapt if (undetermined[c] && !step->_undetermined[c]) 1211228060Sbapt changing_count++; 1212228060Sbapt 1213228060Sbapt unsigned int *changing = new unsigned int[changing_count]; 1214228060Sbapt changing_count = 0; 1215228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1216228060Sbapt if (undetermined[c] && !step->_undetermined[c]) 1217228060Sbapt changing[changing_count++] = c; 1218228060Sbapt 1219228060Sbapt step->_changing = changing; 1220228060Sbapt step->_changing_count = changing_count; 1221228060Sbapt 1222228060Sbapt step->_asso_value_max = _asso_value_max; 1223228060Sbapt 1224228060Sbapt step->_expected_lower = 1225228060Sbapt exp (static_cast<double>(chosen_possible_collisions) 1226228060Sbapt / static_cast<double>(_max_hash_value)); 1227228060Sbapt step->_expected_upper = 1228228060Sbapt exp (static_cast<double>(chosen_possible_collisions) 1229228060Sbapt / static_cast<double>(_asso_value_max)); 1230228060Sbapt 1231228060Sbapt delete_partition (partition); 1232228060Sbapt 1233228060Sbapt step->_next = steps; 1234228060Sbapt steps = step; 1235228060Sbapt } 1236228060Sbapt 1237228060Sbapt delete[] determined; 1238228060Sbapt delete[] undetermined; 1239228060Sbapt } 1240228060Sbapt 1241228060Sbapt if (option[DEBUG]) 1242228060Sbapt { 1243228060Sbapt unsigned int stepno = 0; 1244228060Sbapt for (Step *step = steps; step; step = step->_next) 1245228060Sbapt { 1246228060Sbapt stepno++; 1247228060Sbapt fprintf (stderr, "Step %u chooses _asso_values[", stepno); 1248228060Sbapt for (unsigned int i = 0; i < step->_changing_count; i++) 1249228060Sbapt { 1250228060Sbapt if (i > 0) 1251228060Sbapt fprintf (stderr, ","); 1252228060Sbapt fprintf (stderr, "'%c'", step->_changing[i]); 1253228060Sbapt } 1254228060Sbapt fprintf (stderr, "], expected number of iterations between %g and %g.\n", 1255228060Sbapt step->_expected_lower, step->_expected_upper); 1256228060Sbapt fprintf (stderr, "Keyword equivalence classes:\n"); 1257228060Sbapt for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next) 1258228060Sbapt { 1259228060Sbapt fprintf (stderr, "\n"); 1260228060Sbapt for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) 1261228060Sbapt { 1262228060Sbapt KeywordExt *keyword = temp->first(); 1263228060Sbapt fprintf (stderr, " %.*s\n", 1264228060Sbapt keyword->_allchars_length, keyword->_allchars); 1265228060Sbapt } 1266228060Sbapt } 1267228060Sbapt fprintf (stderr, "\n"); 1268228060Sbapt } 1269228060Sbapt } 1270228060Sbapt 1271228060Sbapt /* Initialize _asso_values[]. (The value given here matters only 1272228060Sbapt for those c which occur in all keywords with equal multiplicity.) */ 1273228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1274228060Sbapt _asso_values[c] = 0; 1275228060Sbapt 1276228060Sbapt unsigned int stepno = 0; 1277228060Sbapt for (Step *step = steps; step; step = step->_next) 1278228060Sbapt { 1279228060Sbapt stepno++; 1280228060Sbapt 1281228060Sbapt /* Initialize the asso_values[]. */ 1282228060Sbapt unsigned int k = step->_changing_count; 1283228060Sbapt for (unsigned int i = 0; i < k; i++) 1284228060Sbapt { 1285228060Sbapt unsigned int c = step->_changing[i]; 1286228060Sbapt _asso_values[c] = 1287228060Sbapt (_initial_asso_value < 0 ? rand () : _initial_asso_value) 1288228060Sbapt & (step->_asso_value_max - 1); 1289228060Sbapt } 1290228060Sbapt 1291228060Sbapt unsigned int iterations = 0; 1292228060Sbapt DYNAMIC_ARRAY (iter, unsigned int, k); 1293228060Sbapt for (unsigned int i = 0; i < k; i++) 1294228060Sbapt iter[i] = 0; 1295228060Sbapt unsigned int ii = (_jump != 0 ? k - 1 : 0); 1296228060Sbapt 1297228060Sbapt for (;;) 1298228060Sbapt { 1299228060Sbapt /* Test whether these asso_values[] lead to collisions among 1300228060Sbapt the equivalence classes that should be collision-free. */ 1301228060Sbapt bool has_collision = false; 1302228060Sbapt for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next) 1303228060Sbapt { 1304228060Sbapt /* Iteration Number array is a win, O(1) initialization time! */ 1305228060Sbapt _collision_detector->clear (); 1306228060Sbapt 1307228060Sbapt for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest()) 1308228060Sbapt { 1309228060Sbapt KeywordExt *keyword = ptr->first(); 1310228060Sbapt 1311228060Sbapt /* Compute the new hash code for the keyword, leaving apart 1312228060Sbapt the yet undetermined asso_values[]. */ 1313228060Sbapt int hashcode; 1314228060Sbapt { 1315228060Sbapt int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length; 1316228060Sbapt const unsigned int *p = keyword->_selchars; 1317228060Sbapt int i = keyword->_selchars_length; 1318228060Sbapt for (; i > 0; p++, i--) 1319228060Sbapt if (!step->_undetermined[*p]) 1320228060Sbapt sum += _asso_values[*p]; 1321228060Sbapt hashcode = sum; 1322228060Sbapt } 1323228060Sbapt 1324228060Sbapt /* See whether it collides with another keyword's hash code, 1325228060Sbapt from the same equivalence class. */ 1326228060Sbapt if (_collision_detector->set_bit (hashcode)) 1327228060Sbapt { 1328228060Sbapt has_collision = true; 1329228060Sbapt break; 1330228060Sbapt } 1331228060Sbapt } 1332228060Sbapt 1333228060Sbapt /* Don't need to continue looking at the other equivalence 1334228060Sbapt classes if we already have found a collision. */ 1335228060Sbapt if (has_collision) 1336228060Sbapt break; 1337228060Sbapt } 1338228060Sbapt 1339228060Sbapt iterations++; 1340228060Sbapt if (!has_collision) 1341228060Sbapt break; 1342228060Sbapt 1343228060Sbapt /* Try other asso_values[]. */ 1344228060Sbapt if (_jump != 0) 1345228060Sbapt { 1346228060Sbapt /* The way we try various values for 1347228060Sbapt asso_values[step->_changing[0],...step->_changing[k-1]] 1348228060Sbapt is like this: 1349228060Sbapt for (bound = 0,1,...) 1350228060Sbapt for (ii = 0,...,k-1) 1351228060Sbapt iter[ii] := bound 1352228060Sbapt iter[0..ii-1] := values <= bound 1353228060Sbapt iter[ii+1..k-1] := values < bound 1354228060Sbapt and 1355228060Sbapt asso_values[step->_changing[i]] = 1356228060Sbapt _initial_asso_value + iter[i] * _jump. 1357228060Sbapt This makes it more likely to find small asso_values[]. 1358228060Sbapt */ 1359228060Sbapt unsigned int bound = iter[ii]; 1360228060Sbapt unsigned int i = 0; 1361228060Sbapt while (i < ii) 1362228060Sbapt { 1363228060Sbapt unsigned int c = step->_changing[i]; 1364228060Sbapt iter[i]++; 1365228060Sbapt _asso_values[c] = 1366228060Sbapt (_asso_values[c] + _jump) & (step->_asso_value_max - 1); 1367228060Sbapt if (iter[i] <= bound) 1368228060Sbapt goto found_next; 1369228060Sbapt _asso_values[c] = 1370228060Sbapt (_asso_values[c] - iter[i] * _jump) 1371228060Sbapt & (step->_asso_value_max - 1); 1372228060Sbapt iter[i] = 0; 1373228060Sbapt i++; 1374228060Sbapt } 1375228060Sbapt i = ii + 1; 1376228060Sbapt while (i < k) 1377228060Sbapt { 1378228060Sbapt unsigned int c = step->_changing[i]; 1379228060Sbapt iter[i]++; 1380228060Sbapt _asso_values[c] = 1381228060Sbapt (_asso_values[c] + _jump) & (step->_asso_value_max - 1); 1382228060Sbapt if (iter[i] < bound) 1383228060Sbapt goto found_next; 1384228060Sbapt _asso_values[c] = 1385228060Sbapt (_asso_values[c] - iter[i] * _jump) 1386228060Sbapt & (step->_asso_value_max - 1); 1387228060Sbapt iter[i] = 0; 1388228060Sbapt i++; 1389228060Sbapt } 1390228060Sbapt /* Switch from one ii to the next. */ 1391228060Sbapt { 1392228060Sbapt unsigned int c = step->_changing[ii]; 1393228060Sbapt _asso_values[c] = 1394228060Sbapt (_asso_values[c] - bound * _jump) 1395228060Sbapt & (step->_asso_value_max - 1); 1396228060Sbapt iter[ii] = 0; 1397228060Sbapt } 1398228060Sbapt /* Here all iter[i] == 0. */ 1399228060Sbapt ii++; 1400228060Sbapt if (ii == k) 1401228060Sbapt { 1402228060Sbapt ii = 0; 1403228060Sbapt bound++; 1404228060Sbapt if (bound == step->_asso_value_max) 1405228060Sbapt { 1406228060Sbapt /* Out of search space! We can either backtrack, or 1407228060Sbapt increase the available search space of this step. 1408228060Sbapt It seems simpler to choose the latter solution. */ 1409228060Sbapt step->_asso_value_max = 2 * step->_asso_value_max; 1410228060Sbapt if (step->_asso_value_max > _asso_value_max) 1411228060Sbapt { 1412228060Sbapt _asso_value_max = step->_asso_value_max; 1413228060Sbapt /* Reinitialize _max_hash_value. */ 1414228060Sbapt _max_hash_value = 1415228060Sbapt (option[NOLENGTH] ? 0 : _max_key_len) 1416228060Sbapt + (_asso_value_max - 1) * _max_selchars_length; 1417228060Sbapt /* Reinitialize _collision_detector. */ 1418228060Sbapt delete _collision_detector; 1419228060Sbapt _collision_detector = 1420228060Sbapt new Bool_Array (_max_hash_value + 1); 1421228060Sbapt } 1422228060Sbapt } 1423228060Sbapt } 1424228060Sbapt { 1425228060Sbapt unsigned int c = step->_changing[ii]; 1426228060Sbapt iter[ii] = bound; 1427228060Sbapt _asso_values[c] = 1428228060Sbapt (_asso_values[c] + bound * _jump) 1429228060Sbapt & (step->_asso_value_max - 1); 1430228060Sbapt } 1431228060Sbapt found_next: ; 1432228060Sbapt } 1433228060Sbapt else 1434228060Sbapt { 1435228060Sbapt /* Random. */ 1436228060Sbapt unsigned int c = step->_changing[ii]; 1437228060Sbapt _asso_values[c] = 1438228060Sbapt (_asso_values[c] + rand ()) & (step->_asso_value_max - 1); 1439228060Sbapt /* Next time, change the next c. */ 1440228060Sbapt ii++; 1441228060Sbapt if (ii == k) 1442228060Sbapt ii = 0; 1443228060Sbapt } 1444228060Sbapt } 1445228060Sbapt FREE_DYNAMIC_ARRAY (iter); 1446228060Sbapt 1447228060Sbapt if (option[DEBUG]) 1448228060Sbapt { 1449228060Sbapt fprintf (stderr, "Step %u chose _asso_values[", stepno); 1450228060Sbapt for (unsigned int i = 0; i < step->_changing_count; i++) 1451228060Sbapt { 1452228060Sbapt if (i > 0) 1453228060Sbapt fprintf (stderr, ","); 1454228060Sbapt fprintf (stderr, "'%c'", step->_changing[i]); 1455228060Sbapt } 1456228060Sbapt fprintf (stderr, "] in %u iterations.\n", iterations); 1457228060Sbapt } 1458228060Sbapt } 1459228060Sbapt 1460228060Sbapt /* Free allocated memory. */ 1461228060Sbapt while (steps != NULL) 1462228060Sbapt { 1463228060Sbapt Step *step = steps; 1464228060Sbapt steps = step->_next; 1465228060Sbapt delete[] step->_changing; 1466228060Sbapt delete[] step->_undetermined; 1467228060Sbapt delete_partition (step->_partition); 1468228060Sbapt delete step; 1469228060Sbapt } 1470228060Sbapt} 1471228060Sbapt 1472228060Sbapt/* Computes a keyword's hash value, relative to the current _asso_values[], 1473228060Sbapt and stores it in keyword->_hash_value. */ 1474228060Sbapt 1475228060Sbaptinline int 1476228060SbaptSearch::compute_hash (KeywordExt *keyword) const 1477228060Sbapt{ 1478228060Sbapt int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length; 1479228060Sbapt 1480228060Sbapt const unsigned int *p = keyword->_selchars; 1481228060Sbapt int i = keyword->_selchars_length; 1482228060Sbapt for (; i > 0; p++, i--) 1483228060Sbapt sum += _asso_values[*p]; 1484228060Sbapt 1485228060Sbapt return keyword->_hash_value = sum; 1486228060Sbapt} 1487228060Sbapt 1488228060Sbapt/* Finds good _asso_values[]. */ 1489228060Sbapt 1490228060Sbaptvoid 1491228060SbaptSearch::find_good_asso_values () 1492228060Sbapt{ 1493228060Sbapt prepare_asso_values (); 1494228060Sbapt 1495228060Sbapt /* Search for good _asso_values[]. */ 1496228060Sbapt int asso_iteration; 1497228060Sbapt if ((asso_iteration = option.get_asso_iterations ()) == 0) 1498228060Sbapt /* Try only the given _initial_asso_value and _jump. */ 1499228060Sbapt find_asso_values (); 1500228060Sbapt else 1501228060Sbapt { 1502228060Sbapt /* Try different pairs of _initial_asso_value and _jump, in the 1503228060Sbapt following order: 1504228060Sbapt (0, 1) 1505228060Sbapt (1, 1) 1506228060Sbapt (2, 1) (0, 3) 1507228060Sbapt (3, 1) (1, 3) 1508228060Sbapt (4, 1) (2, 3) (0, 5) 1509228060Sbapt (5, 1) (3, 3) (1, 5) 1510228060Sbapt ..... */ 1511228060Sbapt KeywordExt_List *saved_head = _head; 1512228060Sbapt int best_initial_asso_value = 0; 1513228060Sbapt int best_jump = 1; 1514228060Sbapt int *best_asso_values = new int[_alpha_size]; 1515228060Sbapt int best_collisions = INT_MAX; 1516228060Sbapt int best_max_hash_value = INT_MAX; 1517228060Sbapt 1518228060Sbapt _initial_asso_value = 0; _jump = 1; 1519228060Sbapt for (;;) 1520228060Sbapt { 1521228060Sbapt /* Restore the keyword list in its original order. */ 1522228060Sbapt _head = copy_list (saved_head); 1523228060Sbapt /* Find good _asso_values[]. */ 1524228060Sbapt find_asso_values (); 1525228060Sbapt /* Test whether it is the best solution so far. */ 1526228060Sbapt int collisions = 0; 1527228060Sbapt int max_hash_value = INT_MIN; 1528228060Sbapt _collision_detector->clear (); 1529228060Sbapt for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest()) 1530228060Sbapt { 1531228060Sbapt KeywordExt *keyword = ptr->first(); 1532228060Sbapt int hashcode = compute_hash (keyword); 1533228060Sbapt if (max_hash_value < hashcode) 1534228060Sbapt max_hash_value = hashcode; 1535228060Sbapt if (_collision_detector->set_bit (hashcode)) 1536228060Sbapt collisions++; 1537228060Sbapt } 1538228060Sbapt if (collisions < best_collisions 1539228060Sbapt || (collisions == best_collisions 1540228060Sbapt && max_hash_value < best_max_hash_value)) 1541228060Sbapt { 1542228060Sbapt memcpy (best_asso_values, _asso_values, 1543228060Sbapt _alpha_size * sizeof (_asso_values[0])); 1544228060Sbapt best_collisions = collisions; 1545228060Sbapt best_max_hash_value = max_hash_value; 1546228060Sbapt } 1547228060Sbapt /* Delete the copied keyword list. */ 1548228060Sbapt delete_list (_head); 1549228060Sbapt 1550228060Sbapt if (--asso_iteration == 0) 1551228060Sbapt break; 1552228060Sbapt /* Prepare for next iteration. */ 1553228060Sbapt if (_initial_asso_value >= 2) 1554228060Sbapt _initial_asso_value -= 2, _jump += 2; 1555228060Sbapt else 1556228060Sbapt _initial_asso_value += _jump, _jump = 1; 1557228060Sbapt } 1558228060Sbapt _head = saved_head; 1559228060Sbapt /* Install the best found asso_values. */ 1560228060Sbapt _initial_asso_value = best_initial_asso_value; 1561228060Sbapt _jump = best_jump; 1562228060Sbapt memcpy (_asso_values, best_asso_values, 1563228060Sbapt _alpha_size * sizeof (_asso_values[0])); 1564228060Sbapt delete[] best_asso_values; 1565228060Sbapt /* The keywords' _hash_value fields are recomputed below. */ 1566228060Sbapt } 1567228060Sbapt} 1568228060Sbapt 1569228060Sbapt/* ========================================================================= */ 1570228060Sbapt 1571228060Sbapt/* Comparison function for sorting by increasing _hash_value. */ 1572228060Sbaptstatic bool 1573228060Sbaptless_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2) 1574228060Sbapt{ 1575228060Sbapt return keyword1->_hash_value < keyword2->_hash_value; 1576228060Sbapt} 1577228060Sbapt 1578228060Sbapt/* Sorts the keyword list by hash value. */ 1579228060Sbapt 1580228060Sbaptvoid 1581228060SbaptSearch::sort () 1582228060Sbapt{ 1583228060Sbapt _head = mergesort_list (_head, less_by_hash_value); 1584228060Sbapt} 1585228060Sbapt 1586228060Sbaptvoid 1587228060SbaptSearch::optimize () 1588228060Sbapt{ 1589228060Sbapt /* Preparations. */ 1590228060Sbapt prepare (); 1591228060Sbapt 1592228060Sbapt /* Step 1: Finding good byte positions. */ 1593228060Sbapt find_positions (); 1594228060Sbapt 1595228060Sbapt /* Step 2: Finding good alpha increments. */ 1596228060Sbapt find_alpha_inc (); 1597228060Sbapt 1598228060Sbapt /* Step 3: Finding good asso_values. */ 1599228060Sbapt find_good_asso_values (); 1600228060Sbapt 1601228060Sbapt /* Make one final check, just to make sure nothing weird happened.... */ 1602228060Sbapt _collision_detector->clear (); 1603228060Sbapt for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest()) 1604228060Sbapt { 1605228060Sbapt KeywordExt *curr = curr_ptr->first(); 1606228060Sbapt unsigned int hashcode = compute_hash (curr); 1607228060Sbapt if (_collision_detector->set_bit (hashcode)) 1608228060Sbapt { 1609228060Sbapt /* This shouldn't happen. proj1, proj2, proj3 must have been 1610228060Sbapt computed to be injective on the given keyword set. */ 1611228060Sbapt fprintf (stderr, 1612228060Sbapt "\nInternal error, unexpected duplicate hash code\n"); 1613228060Sbapt if (option[POSITIONS]) 1614228060Sbapt fprintf (stderr, "try options -m or -r, or use new key positions.\n\n"); 1615228060Sbapt else 1616228060Sbapt fprintf (stderr, "try options -m or -r.\n\n"); 1617228060Sbapt exit (1); 1618228060Sbapt } 1619228060Sbapt } 1620228060Sbapt 1621228060Sbapt /* Sorts the keyword list by hash value. */ 1622228060Sbapt sort (); 1623228060Sbapt 1624228060Sbapt /* Set unused asso_values[c] to max_hash_value + 1. This is not absolutely 1625228060Sbapt necessary, but speeds up the lookup function in many cases of lookup 1626228060Sbapt failure: no string comparison is needed once the hash value of a string 1627228060Sbapt is larger than the hash value of any keyword. */ 1628228060Sbapt int max_hash_value; 1629228060Sbapt { 1630228060Sbapt KeywordExt_List *temp; 1631228060Sbapt for (temp = _head; temp->rest(); temp = temp->rest()) 1632228060Sbapt ; 1633228060Sbapt max_hash_value = temp->first()->_hash_value; 1634228060Sbapt } 1635228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1636228060Sbapt if (_occurrences[c] == 0) 1637228060Sbapt _asso_values[c] = max_hash_value + 1; 1638228060Sbapt 1639228060Sbapt /* Propagate unified asso_values. */ 1640228060Sbapt if (_alpha_unify) 1641228060Sbapt for (unsigned int c = 0; c < _alpha_size; c++) 1642228060Sbapt if (_alpha_unify[c] != c) 1643228060Sbapt _asso_values[c] = _asso_values[_alpha_unify[c]]; 1644228060Sbapt} 1645228060Sbapt 1646228060Sbapt/* Prints out some diagnostics upon completion. */ 1647228060Sbapt 1648228060SbaptSearch::~Search () 1649228060Sbapt{ 1650228060Sbapt delete _collision_detector; 1651228060Sbapt if (option[DEBUG]) 1652228060Sbapt { 1653228060Sbapt fprintf (stderr, "\ndumping occurrence and associated values tables\n"); 1654228060Sbapt 1655228060Sbapt for (unsigned int i = 0; i < _alpha_size; i++) 1656228060Sbapt if (_occurrences[i]) 1657228060Sbapt fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n", 1658228060Sbapt i, _asso_values[i], i, _occurrences[i]); 1659228060Sbapt 1660228060Sbapt fprintf (stderr, "end table dumping\n"); 1661228060Sbapt 1662228060Sbapt fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d" 1663228060Sbapt "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n", 1664228060Sbapt _list_len, _total_keys, _total_duplicates, _max_key_len); 1665228060Sbapt 1666228060Sbapt int field_width = _max_selchars_length; 1667228060Sbapt fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n", 1668228060Sbapt field_width, "selchars"); 1669228060Sbapt for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest()) 1670228060Sbapt { 1671228060Sbapt fprintf (stderr, "%11d,%11d,%6d, ", 1672228060Sbapt ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index); 1673228060Sbapt if (field_width > ptr->first()->_selchars_length) 1674228060Sbapt fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, ""); 1675228060Sbapt for (int j = 0; j < ptr->first()->_selchars_length; j++) 1676228060Sbapt putc (ptr->first()->_selchars[j], stderr); 1677228060Sbapt fprintf (stderr, ", %.*s\n", 1678228060Sbapt ptr->first()->_allchars_length, ptr->first()->_allchars); 1679228060Sbapt } 1680228060Sbapt 1681228060Sbapt fprintf (stderr, "End dumping list.\n\n"); 1682228060Sbapt } 1683228060Sbapt delete[] _asso_values; 1684228060Sbapt delete[] _occurrences; 1685228060Sbapt delete[] _alpha_unify; 1686228060Sbapt delete[] _alpha_inc; 1687228060Sbapt} 1688