1/* Search algorithm. 2 Copyright (C) 1989-1998, 2000, 2002 Free Software Foundation, Inc. 3 Written by Douglas C. Schmidt <schmidt@ics.uci.edu> 4 and Bruno Haible <bruno@clisp.org>. 5 6 This file is part of GNU GPERF. 7 8 GNU GPERF is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 2, or (at your option) 11 any later version. 12 13 GNU GPERF is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; see the file COPYING. 20 If not, write to the Free Software Foundation, Inc., 21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 22 23/* Specification. */ 24#include "search.h" 25 26#include <stdio.h> 27#include <stdlib.h> /* declares exit(), rand(), srand() */ 28#include <string.h> /* declares memset(), memcmp() */ 29#include <time.h> /* declares time() */ 30#include <math.h> /* declares exp() */ 31#include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */ 32#include "options.h" 33#include "hash-table.h" 34#include "config.h" 35 36/* ============================== Portability ============================== */ 37 38/* Assume ISO C++ 'for' scoping rule. */ 39/* This code is used to work around scoping issues with visual studio 6 from 40 * 1998. Comment it out here to queisce numerous -Wdangling-else warnings 41 * from clang. 42#define for if (0) ; else for */ 43 44/* Dynamically allocated array with dynamic extent: 45 46 Example: 47 DYNAMIC_ARRAY (my_array, int, n); 48 ... 49 FREE_DYNAMIC_ARRAY (my_array); 50 51 Attention: depending on your implementation my_array is either the array 52 itself or a pointer to the array! Always use my_array only as expression! 53 */ 54#if HAVE_DYNAMIC_ARRAY 55 #define DYNAMIC_ARRAY(var,eltype,size) eltype var[size] 56 #define FREE_DYNAMIC_ARRAY(var) 57#else 58 #define DYNAMIC_ARRAY(var,eltype,size) eltype *var = new eltype[size] 59 #define FREE_DYNAMIC_ARRAY(var) delete[] var 60#endif 61 62/* ================================ Theory ================================= */ 63 64/* The general form of the hash function is 65 66 hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos) 67 + len (keyword) 68 69 where Pos is a set of byte positions, 70 each alpha_inc[i] is a nonnegative integer, 71 each asso_values[c] is a nonnegative integer, 72 len (keyword) is the keyword's length if !option[NOLENGTH], or 0 otherwise. 73 74 Theorem 1: If all keywords are different, there is a set Pos such that 75 all tuples (keyword[i] : i in Pos) are different. 76 77 Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there 78 are nonnegative integers alpha_inc[i] such that all multisets 79 {keyword[i] + alpha_inc[i] : i in Pos} are different. 80 81 Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}. 82 83 Theorem 3: If all multisets selchars[keyword] are different, there are 84 nonnegative integers asso_values[c] such that all hash values 85 sum (asso_values[c] : c in selchars[keyword]) are different. 86 87 Based on these three facts, we find the hash function in three steps: 88 89 Step 1 (Finding good byte positions): 90 Find a set Pos, as small as possible, such that all tuples 91 (keyword[i] : i in Pos) are different. 92 93 Step 2 (Finding good alpha increments): 94 Find nonnegative integers alpha_inc[i], as many of them as possible being 95 zero, and the others being as small as possible, such that all multisets 96 {keyword[i] + alpha_inc[i] : i in Pos} are different. 97 98 Step 3 (Finding good asso_values): 99 Find asso_values[c] such that all hash (keyword) are different. 100 101 In other words, each step finds a projection that is injective on the 102 given finite set: 103 proj1 : String --> Map (Pos --> N) 104 proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos) 105 proj3 : Map (Pos --> N) / S(Pos) --> N 106 where 107 N denotes the set of nonnegative integers, 108 Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and 109 S(Pos) is the symmetric group over Pos. 110 111 This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight 112 modifications apply: 113 proj1 : String --> Map (Pos --> N) x N 114 proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N 115 proj3 : Map (Pos --> N) / S(Pos) x N --> N 116 117 For a case-insensitive hash function, the general form is 118 119 hash (keyword) = 120 sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos) 121 + len (keyword) 122 123 where alpha_unify[c] is chosen so that an upper/lower case change in 124 keyword[i] doesn't change alpha_unify[keyword[i] + alpha_inc[i]]. 125 */ 126 127/* ==================== Initialization and Preparation ===================== */ 128 129Search::Search (KeywordExt_List *list) 130 : _head (list) 131{ 132} 133 134void 135Search::prepare () 136{ 137 /* Compute the total number of keywords. */ 138 _total_keys = 0; 139 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 140 _total_keys++; 141 142 /* Compute the minimum and maximum keyword length. */ 143 _max_key_len = INT_MIN; 144 _min_key_len = INT_MAX; 145 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 146 { 147 KeywordExt *keyword = temp->first(); 148 149 if (_max_key_len < keyword->_allchars_length) 150 _max_key_len = keyword->_allchars_length; 151 if (_min_key_len > keyword->_allchars_length) 152 _min_key_len = keyword->_allchars_length; 153 } 154 155 /* Exit program if an empty string is used as keyword, since the comparison 156 expressions don't work correctly for looking up an empty string. */ 157 if (_min_key_len == 0) 158 { 159 fprintf (stderr, "Empty input keyword is not allowed.\n" 160 "To recognize an empty input keyword, your code should check for\n" 161 "len == 0 before calling the gperf generated lookup function.\n"); 162 exit (1); 163 } 164 165 /* Exit program if the characters in the keywords are not in the required 166 range. */ 167 if (option[SEVENBIT]) 168 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 169 { 170 KeywordExt *keyword = temp->first(); 171 172 const char *k = keyword->_allchars; 173 for (int i = keyword->_allchars_length; i > 0; k++, i--) 174 if (!(static_cast<unsigned char>(*k) < 128)) 175 { 176 fprintf (stderr, "Option --seven-bit has been specified,\n" 177 "but keyword \"%.*s\" contains non-ASCII characters.\n" 178 "Try removing option --seven-bit.\n", 179 keyword->_allchars_length, keyword->_allchars); 180 exit (1); 181 } 182 } 183} 184 185/* ====================== Finding good byte positions ====================== */ 186 187/* Computes the upper bound on the indices passed to asso_values[], 188 assuming no alpha_increments. */ 189unsigned int 190Search::compute_alpha_size () const 191{ 192 return (option[SEVENBIT] ? 128 : 256); 193} 194 195/* Computes the unification rules between different asso_values[c], 196 assuming no alpha_increments. */ 197unsigned int * 198Search::compute_alpha_unify () const 199{ 200 if (option[UPPERLOWER]) 201 { 202 /* Uppercase to lowercase mapping. */ 203 unsigned int alpha_size = compute_alpha_size(); 204 unsigned int *alpha_unify = new unsigned int[alpha_size]; 205 for (unsigned int c = 0; c < alpha_size; c++) 206 alpha_unify[c] = c; 207 for (unsigned int c = 'A'; c <= 'Z'; c++) 208 alpha_unify[c] = c + ('a'-'A'); 209 return alpha_unify; 210 } 211 else 212 /* Identity mapping. */ 213 return NULL; 214} 215 216/* Initializes each keyword's _selchars array. */ 217void 218Search::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const 219{ 220 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 221 temp->first()->init_selchars_tuple(positions, alpha_unify); 222} 223 224/* Deletes each keyword's _selchars array. */ 225void 226Search::delete_selchars () const 227{ 228 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 229 temp->first()->delete_selchars(); 230} 231 232/* Count the duplicate keywords that occur with a given set of positions. 233 In other words, it returns the difference 234 # K - # proj1 (K) 235 where K is the multiset of given keywords. */ 236unsigned int 237Search::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const 238{ 239 /* Run through the keyword list and count the duplicates incrementally. 240 The result does not depend on the order of the keyword list, thanks to 241 the formula above. */ 242 init_selchars_tuple (positions, alpha_unify); 243 244 unsigned int count = 0; 245 { 246 Hash_Table representatives (_total_keys, option[NOLENGTH]); 247 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 248 { 249 KeywordExt *keyword = temp->first(); 250 if (representatives.insert (keyword)) 251 count++; 252 } 253 } 254 255 delete_selchars (); 256 257 return count; 258} 259 260/* Find good key positions. */ 261 262void 263Search::find_positions () 264{ 265 /* If the user gave the key positions, we use them. */ 266 if (option[POSITIONS]) 267 { 268 _key_positions = option.get_key_positions(); 269 return; 270 } 271 272 /* Compute preliminary alpha_unify table. */ 273 unsigned int *alpha_unify = compute_alpha_unify (); 274 275 /* 1. Find positions that must occur in order to distinguish duplicates. */ 276 Positions mandatory; 277 278 if (!option[DUP]) 279 { 280 for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest()) 281 { 282 KeywordExt *keyword1 = l1->first(); 283 for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest()) 284 { 285 KeywordExt *keyword2 = l2->first(); 286 287 /* If keyword1 and keyword2 have the same length and differ 288 in just one position, and it is not the last character, 289 this position is mandatory. */ 290 if (keyword1->_allchars_length == keyword2->_allchars_length) 291 { 292 int n = keyword1->_allchars_length; 293 int i; 294 for (i = 0; i < n - 1; i++) 295 { 296 unsigned char c1 = keyword1->_allchars[i]; 297 unsigned char c2 = keyword2->_allchars[i]; 298 if (option[UPPERLOWER]) 299 { 300 if (c1 >= 'A' && c1 <= 'Z') 301 c1 += 'a' - 'A'; 302 if (c2 >= 'A' && c2 <= 'Z') 303 c2 += 'a' - 'A'; 304 } 305 if (c1 != c2) 306 break; 307 } 308 if (i < n - 1) 309 { 310 int j; 311 for (j = i + 1; j < n; j++) 312 { 313 unsigned char c1 = keyword1->_allchars[j]; 314 unsigned char c2 = keyword2->_allchars[j]; 315 if (option[UPPERLOWER]) 316 { 317 if (c1 >= 'A' && c1 <= 'Z') 318 c1 += 'a' - 'A'; 319 if (c2 >= 'A' && c2 <= 'Z') 320 c2 += 'a' - 'A'; 321 } 322 if (c1 != c2) 323 break; 324 } 325 if (j >= n) 326 { 327 /* Position i is mandatory. */ 328 if (!mandatory.contains (i)) 329 mandatory.add (i); 330 } 331 } 332 } 333 } 334 } 335 } 336 337 /* 2. Add positions, as long as this decreases the duplicates count. */ 338 int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1 339 ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1); 340 Positions current = mandatory; 341 unsigned int current_duplicates_count = 342 count_duplicates_tuple (current, alpha_unify); 343 for (;;) 344 { 345 Positions best; 346 unsigned int best_duplicates_count = UINT_MAX; 347 348 for (int i = imax; i >= -1; i--) 349 if (!current.contains (i)) 350 { 351 Positions tryal = current; 352 tryal.add (i); 353 unsigned int try_duplicates_count = 354 count_duplicates_tuple (tryal, alpha_unify); 355 356 /* We prefer 'try' to 'best' if it produces less duplicates, 357 or if it produces the same number of duplicates but with 358 a more efficient hash function. */ 359 if (try_duplicates_count < best_duplicates_count 360 || (try_duplicates_count == best_duplicates_count && i >= 0)) 361 { 362 best = tryal; 363 best_duplicates_count = try_duplicates_count; 364 } 365 } 366 367 /* Stop adding positions when it gives no improvement. */ 368 if (best_duplicates_count >= current_duplicates_count) 369 break; 370 371 current = best; 372 current_duplicates_count = best_duplicates_count; 373 } 374 375 /* 3. Remove positions, as long as this doesn't increase the duplicates 376 count. */ 377 for (;;) 378 { 379 Positions best; 380 unsigned int best_duplicates_count = UINT_MAX; 381 382 for (int i = imax; i >= -1; i--) 383 if (current.contains (i) && !mandatory.contains (i)) 384 { 385 Positions tryal = current; 386 tryal.remove (i); 387 unsigned int try_duplicates_count = 388 count_duplicates_tuple (tryal, alpha_unify); 389 390 /* We prefer 'try' to 'best' if it produces less duplicates, 391 or if it produces the same number of duplicates but with 392 a more efficient hash function. */ 393 if (try_duplicates_count < best_duplicates_count 394 || (try_duplicates_count == best_duplicates_count && i == -1)) 395 { 396 best = tryal; 397 best_duplicates_count = try_duplicates_count; 398 } 399 } 400 401 /* Stop removing positions when it gives no improvement. */ 402 if (best_duplicates_count > current_duplicates_count) 403 break; 404 405 current = best; 406 current_duplicates_count = best_duplicates_count; 407 } 408 409 /* 4. Replace two positions by one, as long as this doesn't increase the 410 duplicates count. */ 411 for (;;) 412 { 413 Positions best; 414 unsigned int best_duplicates_count = UINT_MAX; 415 416 for (int i1 = imax; i1 >= -1; i1--) 417 if (current.contains (i1) && !mandatory.contains (i1)) 418 for (int i2 = imax; i2 >= -1; i2--) 419 if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1) 420 for (int i3 = imax; i3 >= 0; i3--) 421 if (!current.contains (i3)) 422 { 423 Positions tryal = current; 424 tryal.remove (i1); 425 tryal.remove (i2); 426 tryal.add (i3); 427 unsigned int try_duplicates_count = 428 count_duplicates_tuple (tryal, alpha_unify); 429 430 /* We prefer 'try' to 'best' if it produces less duplicates, 431 or if it produces the same number of duplicates but with 432 a more efficient hash function. */ 433 if (try_duplicates_count < best_duplicates_count 434 || (try_duplicates_count == best_duplicates_count 435 && (i1 == -1 || i2 == -1 || i3 >= 0))) 436 { 437 best = tryal; 438 best_duplicates_count = try_duplicates_count; 439 } 440 } 441 442 /* Stop removing positions when it gives no improvement. */ 443 if (best_duplicates_count > current_duplicates_count) 444 break; 445 446 current = best; 447 current_duplicates_count = best_duplicates_count; 448 } 449 450 /* That's it. Hope it's good enough. */ 451 _key_positions = current; 452 453 if (option[DEBUG]) 454 { 455 /* Print the result. */ 456 fprintf (stderr, "\nComputed positions: "); 457 PositionReverseIterator iter = _key_positions.reviterator(); 458 bool seen_lastchar = false; 459 bool first = true; 460 for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; ) 461 { 462 if (!first) 463 fprintf (stderr, ", "); 464 if (i == Positions::LASTCHAR) 465 seen_lastchar = true; 466 else 467 { 468 fprintf (stderr, "%d", i + 1); 469 first = false; 470 } 471 } 472 if (seen_lastchar) 473 { 474 if (!first) 475 fprintf (stderr, ", "); 476 fprintf (stderr, "$"); 477 } 478 fprintf (stderr, "\n"); 479 } 480 481 /* Free preliminary alpha_unify table. */ 482 delete[] alpha_unify; 483} 484 485/* Count the duplicate keywords that occur with the found set of positions. 486 In other words, it returns the difference 487 # K - # proj1 (K) 488 where K is the multiset of given keywords. */ 489unsigned int 490Search::count_duplicates_tuple () const 491{ 492 unsigned int *alpha_unify = compute_alpha_unify (); 493 unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify); 494 delete[] alpha_unify; 495 return count; 496} 497 498/* ===================== Finding good alpha increments ===================== */ 499 500/* Computes the upper bound on the indices passed to asso_values[]. */ 501unsigned int 502Search::compute_alpha_size (const unsigned int *alpha_inc) const 503{ 504 unsigned int max_alpha_inc = 0; 505 for (int i = 0; i < _max_key_len; i++) 506 if (max_alpha_inc < alpha_inc[i]) 507 max_alpha_inc = alpha_inc[i]; 508 return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc; 509} 510 511/* Computes the unification rules between different asso_values[c]. */ 512unsigned int * 513Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const 514{ 515 if (option[UPPERLOWER]) 516 { 517 /* Without alpha increments, we would simply unify 518 'A' -> 'a', ..., 'Z' -> 'z'. 519 But when a keyword contains at position i a character c, 520 we have the constraint 521 asso_values[tolower(c) + alpha_inc[i]] == 522 asso_values[toupper(c) + alpha_inc[i]]. 523 This introduces a unification 524 toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i]. 525 Note that this unification can extend outside the range of 526 ASCII letters! But still every unified character pair is at 527 a distance of 'a'-'A' = 32, or (after chained unification) 528 at a multiple of 32. So in the end the alpha_unify vector has 529 the form c -> c + 32 * f(c) where f(c) is a nonnegative 530 integer. */ 531 unsigned int alpha_size = compute_alpha_size (alpha_inc); 532 533 unsigned int *alpha_unify = new unsigned int[alpha_size]; 534 for (unsigned int c = 0; c < alpha_size; c++) 535 alpha_unify[c] = c; 536 537 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 538 { 539 KeywordExt *keyword = temp->first(); 540 541 /* Iterate through the selected character positions. */ 542 PositionIterator iter = positions.iterator(keyword->_allchars_length); 543 544 for (int i; (i = iter.next ()) != PositionIterator::EOS; ) 545 { 546 unsigned int c; 547 if (i == Positions::LASTCHAR) 548 c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]); 549 else if (i < keyword->_allchars_length) 550 c = static_cast<unsigned char>(keyword->_allchars[i]); 551 else 552 abort (); 553 if (c >= 'A' && c <= 'Z') 554 c += 'a' - 'A'; 555 if (c >= 'a' && c <= 'z') 556 { 557 if (i != Positions::LASTCHAR) 558 c += alpha_inc[i]; 559 /* Unify c with c - ('a'-'A'). */ 560 unsigned int d = alpha_unify[c]; 561 unsigned int b = c - ('a'-'A'); 562 for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A')) 563 alpha_unify[a] = d; 564 } 565 } 566 } 567 return alpha_unify; 568 } 569 else 570 /* Identity mapping. */ 571 return NULL; 572} 573 574/* Initializes each keyword's _selchars array. */ 575void 576Search::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const 577{ 578 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 579 temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc); 580} 581 582/* Count the duplicate keywords that occur with the given set of positions 583 and a given alpha_inc[] array. 584 In other words, it returns the difference 585 # K - # proj2 (proj1 (K)) 586 where K is the multiset of given keywords. */ 587unsigned int 588Search::count_duplicates_multiset (const unsigned int *alpha_inc) const 589{ 590 /* Run through the keyword list and count the duplicates incrementally. 591 The result does not depend on the order of the keyword list, thanks to 592 the formula above. */ 593 unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc); 594 init_selchars_multiset (_key_positions, alpha_unify, alpha_inc); 595 596 unsigned int count = 0; 597 { 598 Hash_Table representatives (_total_keys, option[NOLENGTH]); 599 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 600 { 601 KeywordExt *keyword = temp->first(); 602 if (representatives.insert (keyword)) 603 count++; 604 } 605 } 606 607 delete_selchars (); 608 delete[] alpha_unify; 609 610 return count; 611} 612 613/* Find good _alpha_inc[]. */ 614 615void 616Search::find_alpha_inc () 617{ 618 /* The goal is to choose _alpha_inc[] such that it doesn't introduce 619 artificial duplicates. 620 In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */ 621 unsigned int duplicates_goal = count_duplicates_tuple (); 622 623 /* Start with zero increments. This is sufficient in most cases. */ 624 unsigned int *current = new unsigned int [_max_key_len]; 625 for (int i = 0; i < _max_key_len; i++) 626 current[i] = 0; 627 unsigned int current_duplicates_count = count_duplicates_multiset (current); 628 629 if (current_duplicates_count > duplicates_goal) 630 { 631 /* Look which _alpha_inc[i] we are free to increment. */ 632 unsigned int nindices; 633 { 634 nindices = 0; 635 PositionIterator iter = _key_positions.iterator(_max_key_len); 636 for (;;) 637 { 638 int key_pos = iter.next (); 639 if (key_pos == PositionIterator::EOS) 640 break; 641 if (key_pos != Positions::LASTCHAR) 642 nindices++; 643 } 644 } 645 646 DYNAMIC_ARRAY (indices, unsigned int, nindices); 647 { 648 unsigned int j = 0; 649 PositionIterator iter = _key_positions.iterator(_max_key_len); 650 for (;;) 651 { 652 int key_pos = iter.next (); 653 if (key_pos == PositionIterator::EOS) 654 break; 655 if (key_pos != Positions::LASTCHAR) 656 indices[j++] = key_pos; 657 } 658 if (!(j == nindices)) 659 abort (); 660 } 661 662 /* Perform several rounds of searching for a good alpha increment. 663 Each round reduces the number of artificial collisions by adding 664 an increment in a single key position. */ 665 DYNAMIC_ARRAY (best, unsigned int, _max_key_len); 666 DYNAMIC_ARRAY (tryal, unsigned int, _max_key_len); 667 do 668 { 669 /* An increment of 1 is not always enough. Try higher increments 670 also. */ 671 for (unsigned int inc = 1; ; inc++) 672 { 673 unsigned int best_duplicates_count = UINT_MAX; 674 675 for (unsigned int j = 0; j < nindices; j++) 676 { 677 memcpy (tryal, current, _max_key_len * sizeof (unsigned int)); 678 tryal[indices[j]] += inc; 679 unsigned int try_duplicates_count = 680 count_duplicates_multiset (tryal); 681 682 /* We prefer 'try' to 'best' if it produces less 683 duplicates. */ 684 if (try_duplicates_count < best_duplicates_count) 685 { 686 memcpy (best, tryal, _max_key_len * sizeof (unsigned int)); 687 best_duplicates_count = try_duplicates_count; 688 } 689 } 690 691 /* Stop this round when we got an improvement. */ 692 if (best_duplicates_count < current_duplicates_count) 693 { 694 memcpy (current, best, _max_key_len * sizeof (unsigned int)); 695 current_duplicates_count = best_duplicates_count; 696 break; 697 } 698 } 699 } 700 while (current_duplicates_count > duplicates_goal); 701 FREE_DYNAMIC_ARRAY (tryal); 702 FREE_DYNAMIC_ARRAY (best); 703 704 if (option[DEBUG]) 705 { 706 /* Print the result. */ 707 fprintf (stderr, "\nComputed alpha increments: "); 708 bool first = true; 709 for (unsigned int j = nindices; j-- > 0; ) 710 if (current[indices[j]] != 0) 711 { 712 if (!first) 713 fprintf (stderr, ", "); 714 fprintf (stderr, "%u:+%u", 715 indices[j] + 1, current[indices[j]]); 716 first = false; 717 } 718 fprintf (stderr, "\n"); 719 } 720 FREE_DYNAMIC_ARRAY (indices); 721 } 722 723 _alpha_inc = current; 724 _alpha_size = compute_alpha_size (_alpha_inc); 725 _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc); 726} 727 728/* ======================= Finding good asso_values ======================== */ 729 730/* Initializes the asso_values[] related parameters. */ 731 732void 733Search::prepare_asso_values () 734{ 735 KeywordExt_List *temp; 736 737 /* Initialize each keyword's _selchars array. */ 738 init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc); 739 740 /* Compute the maximum _selchars_length over all keywords. */ 741 _max_selchars_length = _key_positions.iterator(_max_key_len).remaining(); 742 743 /* Check for duplicates, i.e. keywords with the same _selchars array 744 (and - if !option[NOLENGTH] - also the same length). 745 We deal with these by building an equivalence class, so that only 746 1 keyword is representative of the entire collection. Only this 747 representative remains in the keyword list; the others are accessible 748 through the _duplicate_link chain, starting at the representative. 749 This *greatly* simplifies processing during later stages of the program. 750 Set _total_duplicates and _list_len = _total_keys - _total_duplicates. */ 751 { 752 _list_len = _total_keys; 753 _total_duplicates = 0; 754 /* Make hash table for efficiency. */ 755 Hash_Table representatives (_list_len, option[NOLENGTH]); 756 757 KeywordExt_List *prev = NULL; /* list node before temp */ 758 for (temp = _head; temp; ) 759 { 760 KeywordExt *keyword = temp->first(); 761 KeywordExt *other_keyword = representatives.insert (keyword); 762 KeywordExt_List *garbage = NULL; 763 764 if (other_keyword) 765 { 766 _total_duplicates++; 767 _list_len--; 768 /* Remove keyword from the main list. */ 769 prev->rest() = temp->rest(); 770 garbage = temp; 771 /* And insert it on other_keyword's duplicate list. */ 772 keyword->_duplicate_link = other_keyword->_duplicate_link; 773 other_keyword->_duplicate_link = keyword; 774 775 /* Complain if user hasn't enabled the duplicate option. */ 776 if (!option[DUP] || option[DEBUG]) 777 { 778 fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"", 779 keyword->_allchars_length, keyword->_allchars, 780 other_keyword->_allchars_length, other_keyword->_allchars); 781 for (int j = 0; j < keyword->_selchars_length; j++) 782 putc (keyword->_selchars[j], stderr); 783 fprintf (stderr, "\".\n"); 784 } 785 } 786 else 787 { 788 keyword->_duplicate_link = NULL; 789 prev = temp; 790 } 791 temp = temp->rest(); 792 if (garbage) 793 delete garbage; 794 } 795 if (option[DEBUG]) 796 representatives.dump(); 797 } 798 799 /* Exit program if duplicates exists and option[DUP] not set, since we 800 don't want to continue in this case. (We don't want to turn on 801 option[DUP] implicitly, because the generated code is usually much 802 slower. */ 803 if (_total_duplicates) 804 { 805 if (option[DUP]) 806 fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n", 807 _total_duplicates); 808 else 809 { 810 fprintf (stderr, "%d input keys have identical hash values,\n", 811 _total_duplicates); 812 if (option[POSITIONS]) 813 fprintf (stderr, "try different key positions or use option -D.\n"); 814 else 815 fprintf (stderr, "use option -D.\n"); 816 exit (1); 817 } 818 } 819 820 /* Compute the occurrences of each character in the alphabet. */ 821 _occurrences = new int[_alpha_size]; 822 memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0])); 823 for (temp = _head; temp; temp = temp->rest()) 824 { 825 KeywordExt *keyword = temp->first(); 826 const unsigned int *ptr = keyword->_selchars; 827 for (int count = keyword->_selchars_length; count > 0; ptr++, count--) 828 _occurrences[*ptr]++; 829 } 830 831 /* Memory allocation. */ 832 _asso_values = new int[_alpha_size]; 833 834 int non_linked_length = _list_len; 835 unsigned int asso_value_max; 836 837 asso_value_max = 838 static_cast<unsigned int>(non_linked_length * option.get_size_multiple()); 839 /* Round up to the next power of two. This makes it easy to ensure 840 an _asso_value[c] is >= 0 and < asso_value_max. Also, the jump value 841 being odd, it guarantees that Search::try_asso_value() will iterate 842 through different values for _asso_value[c]. */ 843 if (asso_value_max == 0) 844 asso_value_max = 1; 845 asso_value_max |= asso_value_max >> 1; 846 asso_value_max |= asso_value_max >> 2; 847 asso_value_max |= asso_value_max >> 4; 848 asso_value_max |= asso_value_max >> 8; 849 asso_value_max |= asso_value_max >> 16; 850 asso_value_max++; 851 _asso_value_max = asso_value_max; 852 853 /* Given the bound for _asso_values[c], we have a bound for the possible 854 hash values, as computed in compute_hash(). */ 855 _max_hash_value = (option[NOLENGTH] ? 0 : _max_key_len) 856 + (_asso_value_max - 1) * _max_selchars_length; 857 /* Allocate a sparse bit vector for detection of collisions of hash 858 values. */ 859 _collision_detector = new Bool_Array (_max_hash_value + 1); 860 861 if (option[DEBUG]) 862 { 863 fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d" 864 "\nmaximum size of generated hash table is %d\n", 865 non_linked_length, asso_value_max, _max_hash_value); 866 867 int field_width; 868 869 field_width = 0; 870 { 871 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 872 { 873 KeywordExt *keyword = temp->first(); 874 if (field_width < keyword->_selchars_length) 875 field_width = keyword->_selchars_length; 876 } 877 } 878 879 fprintf (stderr, "\ndumping the keyword list without duplicates\n"); 880 fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig"); 881 int i = 0; 882 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 883 { 884 KeywordExt *keyword = temp->first(); 885 fprintf (stderr, "%9d, ", ++i); 886 if (field_width > keyword->_selchars_length) 887 fprintf (stderr, "%*s", field_width - keyword->_selchars_length, ""); 888 for (int j = 0; j < keyword->_selchars_length; j++) 889 putc (keyword->_selchars[j], stderr); 890 fprintf (stderr, ", %.*s\n", 891 keyword->_allchars_length, keyword->_allchars); 892 } 893 fprintf (stderr, "\nend of keyword list\n\n"); 894 } 895 896 if (option[RANDOM] || option.get_jump () == 0) 897 /* We will use rand(), so initialize the random number generator. */ 898 srand (static_cast<long>(time (0))); 899 900 _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ()); 901 _jump = option.get_jump (); 902} 903 904/* Finds some _asso_values[] that fit. */ 905 906/* The idea is to choose the _asso_values[] one by one, in a way that 907 a choice that has been made never needs to be undone later. This 908 means that we split the work into several steps. Each step chooses 909 one or more _asso_values[c]. The result of choosing one or more 910 _asso_values[c] is that the partitioning of the keyword set gets 911 broader. 912 Look at this partitioning: After every step, the _asso_values[] of a 913 certain set C of characters are undetermined. (At the beginning, C 914 is the set of characters c with _occurrences[c] > 0. At the end, C 915 is empty.) To each keyword K, we associate the multiset of _selchars 916 for which the _asso_values[] are undetermined: 917 K --> K->_selchars intersect C. 918 Consider two keywords equivalent if their value under this mapping is 919 the same. This introduces an equivalence relation on the set of 920 keywords. The equivalence classes partition the keyword set. (At the 921 beginning, the partition is the finest possible: each K is an equivalence 922 class by itself, because all K have a different _selchars. At the end, 923 all K have been merged into a single equivalence class.) 924 The partition before a step is always a refinement of the partition 925 after the step. 926 We choose the steps in such a way that the partition really becomes 927 broader at each step. (A step that only chooses an _asso_values[c] 928 without changing the partition is better merged with the previous step, 929 to avoid useless backtracking.) */ 930 931struct EquivalenceClass 932{ 933 /* The keywords in this equivalence class. */ 934 KeywordExt_List * _keywords; 935 KeywordExt_List * _keywords_last; 936 /* The number of keywords in this equivalence class. */ 937 unsigned int _cardinality; 938 /* The undetermined selected characters for the keywords in this 939 equivalence class, as a canonically reordered multiset. */ 940 unsigned int * _undetermined_chars; 941 unsigned int _undetermined_chars_length; 942 943 EquivalenceClass * _next; 944}; 945 946struct Step 947{ 948 /* The characters whose values are being determined in this step. */ 949 unsigned int _changing_count; 950 unsigned int * _changing; 951 /* Exclusive upper bound for the _asso_values[c] of this step. 952 A power of 2. */ 953 unsigned int _asso_value_max; 954 /* The characters whose values will be determined after this step. */ 955 bool * _undetermined; 956 /* The keyword set partition after this step. */ 957 EquivalenceClass * _partition; 958 /* The expected number of iterations in this step. */ 959 double _expected_lower; 960 double _expected_upper; 961 962 Step * _next; 963}; 964 965static inline bool 966equals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len) 967{ 968 while (len > 0) 969 { 970 if (*ptr1 != *ptr2) 971 return false; 972 ptr1++; 973 ptr2++; 974 len--; 975 } 976 return true; 977} 978 979EquivalenceClass * 980Search::compute_partition (bool *undetermined) const 981{ 982 EquivalenceClass *partition = NULL; 983 EquivalenceClass *partition_last = NULL; 984 for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) 985 { 986 KeywordExt *keyword = temp->first(); 987 988 /* Compute the undetermined characters for this keyword. */ 989 unsigned int *undetermined_chars = 990 new unsigned int[keyword->_selchars_length]; 991 unsigned int undetermined_chars_length = 0; 992 993 for (int i = 0; i < keyword->_selchars_length; i++) 994 if (undetermined[keyword->_selchars[i]]) 995 undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i]; 996 997 /* Look up the equivalence class to which this keyword belongs. */ 998 EquivalenceClass *equclass; 999 for (equclass = partition; equclass; equclass = equclass->_next) 1000 if (equclass->_undetermined_chars_length == undetermined_chars_length 1001 && equals (equclass->_undetermined_chars, undetermined_chars, 1002 undetermined_chars_length)) 1003 break; 1004 if (equclass == NULL) 1005 { 1006 equclass = new EquivalenceClass(); 1007 equclass->_keywords = NULL; 1008 equclass->_keywords_last = NULL; 1009 equclass->_cardinality = 0; 1010 equclass->_undetermined_chars = undetermined_chars; 1011 equclass->_undetermined_chars_length = undetermined_chars_length; 1012 equclass->_next = NULL; 1013 if (partition) 1014 partition_last->_next = equclass; 1015 else 1016 partition = equclass; 1017 partition_last = equclass; 1018 } 1019 else 1020 delete[] undetermined_chars; 1021 1022 /* Add the keyword to the equivalence class. */ 1023 KeywordExt_List *cons = new KeywordExt_List(keyword); 1024 if (equclass->_keywords) 1025 equclass->_keywords_last->rest() = cons; 1026 else 1027 equclass->_keywords = cons; 1028 equclass->_keywords_last = cons; 1029 equclass->_cardinality++; 1030 } 1031 1032 /* Free some of the allocated memory. The caller doesn't need it. */ 1033 for (EquivalenceClass *cls = partition; cls; cls = cls->_next) 1034 delete[] cls->_undetermined_chars; 1035 1036 return partition; 1037} 1038 1039static void 1040delete_partition (EquivalenceClass *partition) 1041{ 1042 while (partition != NULL) 1043 { 1044 EquivalenceClass *equclass = partition; 1045 partition = equclass->_next; 1046 delete_list (equclass->_keywords); 1047 //delete[] equclass->_undetermined_chars; // already freed above 1048 delete equclass; 1049 } 1050} 1051 1052/* Compute the possible number of collisions when _asso_values[c] is 1053 chosen, leading to the given partition. */ 1054unsigned int 1055Search::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const 1056{ 1057 /* Every equivalence class p is split according to the frequency of 1058 occurrence of c, leading to equivalence classes p1, p2, ... 1059 This leads to (|p|^2 - |p1|^2 - |p2|^2 - ...)/2 possible collisions. 1060 Return the sum of this expression over all equivalence classes. */ 1061 unsigned int sum = 0; 1062 unsigned int m = _max_selchars_length; 1063 DYNAMIC_ARRAY (split_cardinalities, unsigned int, m + 1); 1064 for (EquivalenceClass *cls = partition; cls; cls = cls->_next) 1065 { 1066 for (unsigned int i = 0; i <= m; i++) 1067 split_cardinalities[i] = 0; 1068 1069 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) 1070 { 1071 KeywordExt *keyword = temp->first(); 1072 1073 unsigned int count = 0; 1074 for (int i = 0; i < keyword->_selchars_length; i++) 1075 if (keyword->_selchars[i] == c) 1076 count++; 1077 1078 split_cardinalities[count]++; 1079 } 1080 1081 sum += cls->_cardinality * cls->_cardinality; 1082 for (unsigned int i = 0; i <= m; i++) 1083 sum -= split_cardinalities[i] * split_cardinalities[i]; 1084 } 1085 FREE_DYNAMIC_ARRAY (split_cardinalities); 1086 return sum; 1087} 1088 1089/* Test whether adding c to the undetermined characters changes the given 1090 partition. */ 1091bool 1092Search::unchanged_partition (EquivalenceClass *partition, unsigned int c) const 1093{ 1094 for (EquivalenceClass *cls = partition; cls; cls = cls->_next) 1095 { 1096 unsigned int first_count = UINT_MAX; 1097 1098 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) 1099 { 1100 KeywordExt *keyword = temp->first(); 1101 1102 unsigned int count = 0; 1103 for (int i = 0; i < keyword->_selchars_length; i++) 1104 if (keyword->_selchars[i] == c) 1105 count++; 1106 1107 if (temp == cls->_keywords) 1108 first_count = count; 1109 else if (count != first_count) 1110 /* c would split this equivalence class. */ 1111 return false; 1112 } 1113 } 1114 return true; 1115} 1116 1117void 1118Search::find_asso_values () 1119{ 1120 Step *steps; 1121 1122 /* Determine the steps, starting with the last one. */ 1123 { 1124 bool *undetermined; 1125 bool *determined; 1126 1127 steps = NULL; 1128 1129 undetermined = new bool[_alpha_size]; 1130 for (unsigned int c = 0; c < _alpha_size; c++) 1131 undetermined[c] = false; 1132 1133 determined = new bool[_alpha_size]; 1134 for (unsigned int c = 0; c < _alpha_size; c++) 1135 determined[c] = true; 1136 1137 for (;;) 1138 { 1139 /* Compute the partition that needs to be refined. */ 1140 EquivalenceClass *partition = compute_partition (undetermined); 1141 1142 /* Determine the main character to be chosen in this step. 1143 Choosing such a character c has the effect of splitting every 1144 equivalence class (according the the frequency of occurrence of c). 1145 We choose the c with the minimum number of possible collisions, 1146 so that characters which lead to a large number of collisions get 1147 handled early during the search. */ 1148 unsigned int chosen_c; 1149 unsigned int chosen_possible_collisions; 1150 { 1151 unsigned int best_c = 0; 1152 unsigned int best_possible_collisions = UINT_MAX; 1153 for (unsigned int c = 0; c < _alpha_size; c++) 1154 if (_occurrences[c] > 0 && determined[c]) 1155 { 1156 unsigned int possible_collisions = 1157 count_possible_collisions (partition, c); 1158 if (possible_collisions < best_possible_collisions) 1159 { 1160 best_c = c; 1161 best_possible_collisions = possible_collisions; 1162 } 1163 } 1164 if (best_possible_collisions == UINT_MAX) 1165 { 1166 /* All c with _occurrences[c] > 0 are undetermined. We are 1167 are the starting situation and don't need any more step. */ 1168 delete_partition (partition); 1169 break; 1170 } 1171 chosen_c = best_c; 1172 chosen_possible_collisions = best_possible_collisions; 1173 } 1174 1175 /* We need one more step. */ 1176 Step *step = new Step(); 1177 1178 step->_undetermined = new bool[_alpha_size]; 1179 memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool)); 1180 1181 step->_partition = partition; 1182 1183 /* Now determine how the equivalence classes will be before this 1184 step. */ 1185 undetermined[chosen_c] = true; 1186 partition = compute_partition (undetermined); 1187 1188 /* Now determine which other characters should be determined in this 1189 step, because they will not change the equivalence classes at 1190 this point. It is the set of all c which, for all equivalence 1191 classes, have the same frequency of occurrence in every keyword 1192 of the equivalence class. */ 1193 for (unsigned int c = 0; c < _alpha_size; c++) 1194 if (_occurrences[c] > 0 && determined[c] 1195 && unchanged_partition (partition, c)) 1196 { 1197 undetermined[c] = true; 1198 determined[c] = false; 1199 } 1200 1201 /* main_c must be one of these. */ 1202 if (determined[chosen_c]) 1203 abort (); 1204 1205 /* Now the set of changing characters of this step. */ 1206 unsigned int changing_count; 1207 1208 changing_count = 0; 1209 for (unsigned int c = 0; c < _alpha_size; c++) 1210 if (undetermined[c] && !step->_undetermined[c]) 1211 changing_count++; 1212 1213 unsigned int *changing = new unsigned int[changing_count]; 1214 changing_count = 0; 1215 for (unsigned int c = 0; c < _alpha_size; c++) 1216 if (undetermined[c] && !step->_undetermined[c]) 1217 changing[changing_count++] = c; 1218 1219 step->_changing = changing; 1220 step->_changing_count = changing_count; 1221 1222 step->_asso_value_max = _asso_value_max; 1223 1224 step->_expected_lower = 1225 exp (static_cast<double>(chosen_possible_collisions) 1226 / static_cast<double>(_max_hash_value)); 1227 step->_expected_upper = 1228 exp (static_cast<double>(chosen_possible_collisions) 1229 / static_cast<double>(_asso_value_max)); 1230 1231 delete_partition (partition); 1232 1233 step->_next = steps; 1234 steps = step; 1235 } 1236 1237 delete[] determined; 1238 delete[] undetermined; 1239 } 1240 1241 if (option[DEBUG]) 1242 { 1243 unsigned int stepno = 0; 1244 for (Step *step = steps; step; step = step->_next) 1245 { 1246 stepno++; 1247 fprintf (stderr, "Step %u chooses _asso_values[", stepno); 1248 for (unsigned int i = 0; i < step->_changing_count; i++) 1249 { 1250 if (i > 0) 1251 fprintf (stderr, ","); 1252 fprintf (stderr, "'%c'", step->_changing[i]); 1253 } 1254 fprintf (stderr, "], expected number of iterations between %g and %g.\n", 1255 step->_expected_lower, step->_expected_upper); 1256 fprintf (stderr, "Keyword equivalence classes:\n"); 1257 for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next) 1258 { 1259 fprintf (stderr, "\n"); 1260 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) 1261 { 1262 KeywordExt *keyword = temp->first(); 1263 fprintf (stderr, " %.*s\n", 1264 keyword->_allchars_length, keyword->_allchars); 1265 } 1266 } 1267 fprintf (stderr, "\n"); 1268 } 1269 } 1270 1271 /* Initialize _asso_values[]. (The value given here matters only 1272 for those c which occur in all keywords with equal multiplicity.) */ 1273 for (unsigned int c = 0; c < _alpha_size; c++) 1274 _asso_values[c] = 0; 1275 1276 unsigned int stepno = 0; 1277 for (Step *step = steps; step; step = step->_next) 1278 { 1279 stepno++; 1280 1281 /* Initialize the asso_values[]. */ 1282 unsigned int k = step->_changing_count; 1283 for (unsigned int i = 0; i < k; i++) 1284 { 1285 unsigned int c = step->_changing[i]; 1286 _asso_values[c] = 1287 (_initial_asso_value < 0 ? rand () : _initial_asso_value) 1288 & (step->_asso_value_max - 1); 1289 } 1290 1291 unsigned int iterations = 0; 1292 DYNAMIC_ARRAY (iter, unsigned int, k); 1293 for (unsigned int i = 0; i < k; i++) 1294 iter[i] = 0; 1295 unsigned int ii = (_jump != 0 ? k - 1 : 0); 1296 1297 for (;;) 1298 { 1299 /* Test whether these asso_values[] lead to collisions among 1300 the equivalence classes that should be collision-free. */ 1301 bool has_collision = false; 1302 for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next) 1303 { 1304 /* Iteration Number array is a win, O(1) initialization time! */ 1305 _collision_detector->clear (); 1306 1307 for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest()) 1308 { 1309 KeywordExt *keyword = ptr->first(); 1310 1311 /* Compute the new hash code for the keyword, leaving apart 1312 the yet undetermined asso_values[]. */ 1313 int hashcode; 1314 { 1315 int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length; 1316 const unsigned int *p = keyword->_selchars; 1317 int i = keyword->_selchars_length; 1318 for (; i > 0; p++, i--) 1319 if (!step->_undetermined[*p]) 1320 sum += _asso_values[*p]; 1321 hashcode = sum; 1322 } 1323 1324 /* See whether it collides with another keyword's hash code, 1325 from the same equivalence class. */ 1326 if (_collision_detector->set_bit (hashcode)) 1327 { 1328 has_collision = true; 1329 break; 1330 } 1331 } 1332 1333 /* Don't need to continue looking at the other equivalence 1334 classes if we already have found a collision. */ 1335 if (has_collision) 1336 break; 1337 } 1338 1339 iterations++; 1340 if (!has_collision) 1341 break; 1342 1343 /* Try other asso_values[]. */ 1344 if (_jump != 0) 1345 { 1346 /* The way we try various values for 1347 asso_values[step->_changing[0],...step->_changing[k-1]] 1348 is like this: 1349 for (bound = 0,1,...) 1350 for (ii = 0,...,k-1) 1351 iter[ii] := bound 1352 iter[0..ii-1] := values <= bound 1353 iter[ii+1..k-1] := values < bound 1354 and 1355 asso_values[step->_changing[i]] = 1356 _initial_asso_value + iter[i] * _jump. 1357 This makes it more likely to find small asso_values[]. 1358 */ 1359 unsigned int bound = iter[ii]; 1360 unsigned int i = 0; 1361 while (i < ii) 1362 { 1363 unsigned int c = step->_changing[i]; 1364 iter[i]++; 1365 _asso_values[c] = 1366 (_asso_values[c] + _jump) & (step->_asso_value_max - 1); 1367 if (iter[i] <= bound) 1368 goto found_next; 1369 _asso_values[c] = 1370 (_asso_values[c] - iter[i] * _jump) 1371 & (step->_asso_value_max - 1); 1372 iter[i] = 0; 1373 i++; 1374 } 1375 i = ii + 1; 1376 while (i < k) 1377 { 1378 unsigned int c = step->_changing[i]; 1379 iter[i]++; 1380 _asso_values[c] = 1381 (_asso_values[c] + _jump) & (step->_asso_value_max - 1); 1382 if (iter[i] < bound) 1383 goto found_next; 1384 _asso_values[c] = 1385 (_asso_values[c] - iter[i] * _jump) 1386 & (step->_asso_value_max - 1); 1387 iter[i] = 0; 1388 i++; 1389 } 1390 /* Switch from one ii to the next. */ 1391 { 1392 unsigned int c = step->_changing[ii]; 1393 _asso_values[c] = 1394 (_asso_values[c] - bound * _jump) 1395 & (step->_asso_value_max - 1); 1396 iter[ii] = 0; 1397 } 1398 /* Here all iter[i] == 0. */ 1399 ii++; 1400 if (ii == k) 1401 { 1402 ii = 0; 1403 bound++; 1404 if (bound == step->_asso_value_max) 1405 { 1406 /* Out of search space! We can either backtrack, or 1407 increase the available search space of this step. 1408 It seems simpler to choose the latter solution. */ 1409 step->_asso_value_max = 2 * step->_asso_value_max; 1410 if (step->_asso_value_max > _asso_value_max) 1411 { 1412 _asso_value_max = step->_asso_value_max; 1413 /* Reinitialize _max_hash_value. */ 1414 _max_hash_value = 1415 (option[NOLENGTH] ? 0 : _max_key_len) 1416 + (_asso_value_max - 1) * _max_selchars_length; 1417 /* Reinitialize _collision_detector. */ 1418 delete _collision_detector; 1419 _collision_detector = 1420 new Bool_Array (_max_hash_value + 1); 1421 } 1422 } 1423 } 1424 { 1425 unsigned int c = step->_changing[ii]; 1426 iter[ii] = bound; 1427 _asso_values[c] = 1428 (_asso_values[c] + bound * _jump) 1429 & (step->_asso_value_max - 1); 1430 } 1431 found_next: ; 1432 } 1433 else 1434 { 1435 /* Random. */ 1436 unsigned int c = step->_changing[ii]; 1437 _asso_values[c] = 1438 (_asso_values[c] + rand ()) & (step->_asso_value_max - 1); 1439 /* Next time, change the next c. */ 1440 ii++; 1441 if (ii == k) 1442 ii = 0; 1443 } 1444 } 1445 FREE_DYNAMIC_ARRAY (iter); 1446 1447 if (option[DEBUG]) 1448 { 1449 fprintf (stderr, "Step %u chose _asso_values[", stepno); 1450 for (unsigned int i = 0; i < step->_changing_count; i++) 1451 { 1452 if (i > 0) 1453 fprintf (stderr, ","); 1454 fprintf (stderr, "'%c'", step->_changing[i]); 1455 } 1456 fprintf (stderr, "] in %u iterations.\n", iterations); 1457 } 1458 } 1459 1460 /* Free allocated memory. */ 1461 while (steps != NULL) 1462 { 1463 Step *step = steps; 1464 steps = step->_next; 1465 delete[] step->_changing; 1466 delete[] step->_undetermined; 1467 delete_partition (step->_partition); 1468 delete step; 1469 } 1470} 1471 1472/* Computes a keyword's hash value, relative to the current _asso_values[], 1473 and stores it in keyword->_hash_value. */ 1474 1475inline int 1476Search::compute_hash (KeywordExt *keyword) const 1477{ 1478 int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length; 1479 1480 const unsigned int *p = keyword->_selchars; 1481 int i = keyword->_selchars_length; 1482 for (; i > 0; p++, i--) 1483 sum += _asso_values[*p]; 1484 1485 return keyword->_hash_value = sum; 1486} 1487 1488/* Finds good _asso_values[]. */ 1489 1490void 1491Search::find_good_asso_values () 1492{ 1493 prepare_asso_values (); 1494 1495 /* Search for good _asso_values[]. */ 1496 int asso_iteration; 1497 if ((asso_iteration = option.get_asso_iterations ()) == 0) 1498 /* Try only the given _initial_asso_value and _jump. */ 1499 find_asso_values (); 1500 else 1501 { 1502 /* Try different pairs of _initial_asso_value and _jump, in the 1503 following order: 1504 (0, 1) 1505 (1, 1) 1506 (2, 1) (0, 3) 1507 (3, 1) (1, 3) 1508 (4, 1) (2, 3) (0, 5) 1509 (5, 1) (3, 3) (1, 5) 1510 ..... */ 1511 KeywordExt_List *saved_head = _head; 1512 int best_initial_asso_value = 0; 1513 int best_jump = 1; 1514 int *best_asso_values = new int[_alpha_size]; 1515 int best_collisions = INT_MAX; 1516 int best_max_hash_value = INT_MAX; 1517 1518 _initial_asso_value = 0; _jump = 1; 1519 for (;;) 1520 { 1521 /* Restore the keyword list in its original order. */ 1522 _head = copy_list (saved_head); 1523 /* Find good _asso_values[]. */ 1524 find_asso_values (); 1525 /* Test whether it is the best solution so far. */ 1526 int collisions = 0; 1527 int max_hash_value = INT_MIN; 1528 _collision_detector->clear (); 1529 for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest()) 1530 { 1531 KeywordExt *keyword = ptr->first(); 1532 int hashcode = compute_hash (keyword); 1533 if (max_hash_value < hashcode) 1534 max_hash_value = hashcode; 1535 if (_collision_detector->set_bit (hashcode)) 1536 collisions++; 1537 } 1538 if (collisions < best_collisions 1539 || (collisions == best_collisions 1540 && max_hash_value < best_max_hash_value)) 1541 { 1542 memcpy (best_asso_values, _asso_values, 1543 _alpha_size * sizeof (_asso_values[0])); 1544 best_collisions = collisions; 1545 best_max_hash_value = max_hash_value; 1546 } 1547 /* Delete the copied keyword list. */ 1548 delete_list (_head); 1549 1550 if (--asso_iteration == 0) 1551 break; 1552 /* Prepare for next iteration. */ 1553 if (_initial_asso_value >= 2) 1554 _initial_asso_value -= 2, _jump += 2; 1555 else 1556 _initial_asso_value += _jump, _jump = 1; 1557 } 1558 _head = saved_head; 1559 /* Install the best found asso_values. */ 1560 _initial_asso_value = best_initial_asso_value; 1561 _jump = best_jump; 1562 memcpy (_asso_values, best_asso_values, 1563 _alpha_size * sizeof (_asso_values[0])); 1564 delete[] best_asso_values; 1565 /* The keywords' _hash_value fields are recomputed below. */ 1566 } 1567} 1568 1569/* ========================================================================= */ 1570 1571/* Comparison function for sorting by increasing _hash_value. */ 1572static bool 1573less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2) 1574{ 1575 return keyword1->_hash_value < keyword2->_hash_value; 1576} 1577 1578/* Sorts the keyword list by hash value. */ 1579 1580void 1581Search::sort () 1582{ 1583 _head = mergesort_list (_head, less_by_hash_value); 1584} 1585 1586void 1587Search::optimize () 1588{ 1589 /* Preparations. */ 1590 prepare (); 1591 1592 /* Step 1: Finding good byte positions. */ 1593 find_positions (); 1594 1595 /* Step 2: Finding good alpha increments. */ 1596 find_alpha_inc (); 1597 1598 /* Step 3: Finding good asso_values. */ 1599 find_good_asso_values (); 1600 1601 /* Make one final check, just to make sure nothing weird happened.... */ 1602 _collision_detector->clear (); 1603 for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest()) 1604 { 1605 KeywordExt *curr = curr_ptr->first(); 1606 unsigned int hashcode = compute_hash (curr); 1607 if (_collision_detector->set_bit (hashcode)) 1608 { 1609 /* This shouldn't happen. proj1, proj2, proj3 must have been 1610 computed to be injective on the given keyword set. */ 1611 fprintf (stderr, 1612 "\nInternal error, unexpected duplicate hash code\n"); 1613 if (option[POSITIONS]) 1614 fprintf (stderr, "try options -m or -r, or use new key positions.\n\n"); 1615 else 1616 fprintf (stderr, "try options -m or -r.\n\n"); 1617 exit (1); 1618 } 1619 } 1620 1621 /* Sorts the keyword list by hash value. */ 1622 sort (); 1623 1624 /* Set unused asso_values[c] to max_hash_value + 1. This is not absolutely 1625 necessary, but speeds up the lookup function in many cases of lookup 1626 failure: no string comparison is needed once the hash value of a string 1627 is larger than the hash value of any keyword. */ 1628 int max_hash_value; 1629 { 1630 KeywordExt_List *temp; 1631 for (temp = _head; temp->rest(); temp = temp->rest()) 1632 ; 1633 max_hash_value = temp->first()->_hash_value; 1634 } 1635 for (unsigned int c = 0; c < _alpha_size; c++) 1636 if (_occurrences[c] == 0) 1637 _asso_values[c] = max_hash_value + 1; 1638 1639 /* Propagate unified asso_values. */ 1640 if (_alpha_unify) 1641 for (unsigned int c = 0; c < _alpha_size; c++) 1642 if (_alpha_unify[c] != c) 1643 _asso_values[c] = _asso_values[_alpha_unify[c]]; 1644} 1645 1646/* Prints out some diagnostics upon completion. */ 1647 1648Search::~Search () 1649{ 1650 delete _collision_detector; 1651 if (option[DEBUG]) 1652 { 1653 fprintf (stderr, "\ndumping occurrence and associated values tables\n"); 1654 1655 for (unsigned int i = 0; i < _alpha_size; i++) 1656 if (_occurrences[i]) 1657 fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n", 1658 i, _asso_values[i], i, _occurrences[i]); 1659 1660 fprintf (stderr, "end table dumping\n"); 1661 1662 fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d" 1663 "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n", 1664 _list_len, _total_keys, _total_duplicates, _max_key_len); 1665 1666 int field_width = _max_selchars_length; 1667 fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n", 1668 field_width, "selchars"); 1669 for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest()) 1670 { 1671 fprintf (stderr, "%11d,%11d,%6d, ", 1672 ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index); 1673 if (field_width > ptr->first()->_selchars_length) 1674 fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, ""); 1675 for (int j = 0; j < ptr->first()->_selchars_length; j++) 1676 putc (ptr->first()->_selchars[j], stderr); 1677 fprintf (stderr, ", %.*s\n", 1678 ptr->first()->_allchars_length, ptr->first()->_allchars); 1679 } 1680 1681 fprintf (stderr, "End dumping list.\n\n"); 1682 } 1683 delete[] _asso_values; 1684 delete[] _occurrences; 1685 delete[] _alpha_unify; 1686 delete[] _alpha_inc; 1687} 1688