1/* 2 * array.c - routines for associative arrays. 3 */ 4 5/* 6 * Copyright (C) 1986, 1988, 1989, 1991-2003 the Free Software Foundation, Inc. 7 * 8 * This file is part of GAWK, the GNU implementation of the 9 * AWK Programming Language. 10 * 11 * GAWK is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU General Public License as published by 13 * the Free Software Foundation; either version 2 of the License, or 14 * (at your option) any later version. 15 * 16 * GAWK is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 * GNU General Public License for more details. 20 * 21 * You should have received a copy of the GNU General Public License 22 * along with this program; if not, write to the Free Software 23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA 24 */ 25 26/* 27 * Tree walks (``for (iggy in foo)'') and array deletions use expensive 28 * linear searching. So what we do is start out with small arrays and 29 * grow them as needed, so that our arrays are hopefully small enough, 30 * most of the time, that they're pretty full and we're not looking at 31 * wasted space. 32 * 33 * The decision is made to grow the array if the average chain length is 34 * ``too big''. This is defined as the total number of entries in the table 35 * divided by the size of the array being greater than some constant. 36 * 37 * 11/2002: We make the constant a variable, so that it can be tweaked 38 * via environment variable. 39 */ 40 41static int AVG_CHAIN_MAX = 2; /* 11/2002: Modern machines are bigger, cut this down from 10. */ 42 43#include "awk.h" 44 45static NODE *assoc_find P((NODE *symbol, NODE *subs, int hash1)); 46static void grow_table P((NODE *symbol)); 47 48static unsigned long gst_hash_string P((const char *str, size_t len, unsigned long hsize)); 49static unsigned long scramble P((unsigned long x)); 50static unsigned long awk_hash P((const char *s, size_t len, unsigned long hsize)); 51 52unsigned long (*hash)P((const char *s, size_t len, unsigned long hsize)) = awk_hash; 53 54/* array_init --- possibly temporary function for experimentation purposes */ 55 56void 57array_init() 58{ 59 const char *val; 60 int newval; 61 62 if ((val = getenv("AVG_CHAIN_MAX")) != NULL && ISDIGIT(*val)) { 63 for (newval = 0; *val && ISDIGIT(*val); val++) 64 newval = (newval * 10) + *val - '0'; 65 66 AVG_CHAIN_MAX = newval; 67 } 68 69 if ((val = getenv("AWK_HASH")) != NULL && strcmp(val, "gst") == 0) 70 hash = gst_hash_string; 71} 72 73/* 74 * get_actual --- proceed to the actual Node_var_array, 75 * change Node_var_new to an array. 76 * If canfatal and type isn't good, die fatally, 77 * otherwise return the final actual value. 78 */ 79 80NODE * 81get_actual(NODE *symbol, int canfatal) 82{ 83 int isparam = (symbol->type == Node_param_list 84 && (symbol->flags & FUNC) == 0); 85 NODE *save_symbol = symbol; 86 87 if (isparam) { 88 save_symbol = symbol = stack_ptr[symbol->param_cnt]; 89 if (symbol->type == Node_array_ref) 90 symbol = symbol->orig_array; 91 } 92 93 switch (symbol->type) { 94 case Node_var_new: 95 symbol->type = Node_var_array; 96 symbol->var_array = NULL; 97 /* fall through */ 98 case Node_var_array: 99 break; 100 101 case Node_array_ref: 102 case Node_param_list: 103 if (canfatal) 104 cant_happen(); 105 /* else 106 fall through */ 107 108 default: 109 /* notably Node_var but catches also e.g. FS[1] = "x" */ 110 if (canfatal) 111 fatal(isparam ? 112 _("attempt to use scalar parameter `%s' as an array") : 113 _("attempt to use scalar `%s' as array"), 114 save_symbol->vname); 115 else 116 break; 117 } 118 119 return symbol; 120} 121 122/* 123 * array_vname --- print the name of the array 124 * 125 * Returns a pointer to a statically maintained dynamically allocated string. 126 * It's appropriate for printing the name once; if the caller wants 127 * to save it, they have to make a copy. 128 * 129 * Setting MAX_LEN to a positive value (eg. 140) would limit the length 130 * of the output to _roughly_ that length. 131 * 132 * If MAX_LEN == 0, which is the default, the whole stack is printed. 133 */ 134#define MAX_LEN 0 135 136char * 137array_vname(register const NODE *symbol) 138{ 139 if (symbol->type == Node_param_list) 140 symbol = stack_ptr[symbol->param_cnt]; 141 142 if (symbol->type != Node_array_ref || symbol->orig_array->type != Node_var_array) 143 return symbol->vname; 144 else { 145 static char *message = NULL; 146 static size_t msglen = 0; 147 char *s; 148 size_t len; 149 int n; 150 const NODE *save_symbol = symbol; 151 const char *from = _("from %s"); 152 153#if (MAX_LEN <= 0) || !defined(HAVE_SNPRINTF) 154 /* This is the default branch. */ 155 156 /* First, we have to compute the length of the string: */ 157 len = strlen(symbol->vname) + 2; /* "%s (" */ 158 n = 0; 159 do { 160 symbol = symbol->prev_array; 161 len += strlen(symbol->vname); 162 n++; 163 } while (symbol->type == Node_array_ref); 164 /* 165 * Each node contributes by strlen(from) minus the length 166 * of "%s" in the translation (which is at least 2) 167 * plus 2 for ", " or ")\0"; this adds up to strlen(from). 168 */ 169 len += n * strlen(from); 170 171 /* (Re)allocate memory: */ 172 if (message == NULL) { 173 emalloc(message, char *, len, "array_vname"); 174 msglen = len; 175 } else if (len > msglen) { 176 erealloc(message, char *, len, "array_vname"); 177 msglen = len; 178 } /* else 179 current buffer can hold new name */ 180 181 /* We're ready to print: */ 182 symbol = save_symbol; 183 s = message; 184 /* 185 * Ancient systems have sprintf() returning char *, not int. 186 * Thus, `s += sprintf(s, from, name);' is a no-no. 187 */ 188 sprintf(s, "%s (", symbol->vname); 189 s += strlen(s); 190 for (;;) { 191 symbol = symbol->prev_array; 192 sprintf(s, from, symbol->vname); 193 s += strlen(s); 194 if (symbol->type != Node_array_ref) 195 break; 196 sprintf(s, ", "); 197 s += strlen(s); 198 } 199 sprintf(s, ")"); 200 201#else /* MAX_LEN > 0 */ 202 203 /* 204 * The following check fails only on 205 * abnormally_long_variable_name. 206 */ 207#define PRINT_CHECK \ 208 if (n <= 0 || n >= len) \ 209 return save_symbol->vname; \ 210 s += n; len -= n 211#define PRINT(str) \ 212 n = snprintf(s, len, str); \ 213 PRINT_CHECK 214#define PRINT_vname(str) \ 215 n = snprintf(s, len, str, symbol->vname); \ 216 PRINT_CHECK 217 218 if (message == NULL) 219 emalloc(message, char *, MAX_LEN, "array_vname"); 220 221 s = message; 222 len = MAX_LEN; 223 224 /* First, print the vname of the node. */ 225 PRINT_vname("%s ("); 226 227 for (;;) { 228 symbol = symbol->prev_array; 229 /* 230 * When we don't have enough space and this is not 231 * the last node, shorten the list. 232 */ 233 if (len < 40 && symbol->type == Node_array_ref) { 234 PRINT("..., "); 235 symbol = symbol->orig_array; 236 } 237 PRINT_vname(from); 238 if (symbol->type != Node_array_ref) 239 break; 240 PRINT(", "); 241 } 242 PRINT(")"); 243 244#undef PRINT_CHECK 245#undef PRINT 246#undef PRINT_vname 247#endif /* MAX_LEN <= 0 */ 248 249 return message; 250 } 251} 252#undef MAX_LEN 253 254/* concat_exp --- concatenate expression list into a single string */ 255 256NODE * 257concat_exp(register NODE *tree) 258{ 259 register NODE *r; 260 char *str; 261 char *s; 262 size_t len; 263 int offset; 264 size_t subseplen; 265 const char *subsep; 266 267 if (tree->type != Node_expression_list) 268 return force_string(tree_eval(tree)); 269 r = force_string(tree_eval(tree->lnode)); 270 if (tree->rnode == NULL) 271 return r; 272 subseplen = SUBSEP_node->var_value->stlen; 273 subsep = SUBSEP_node->var_value->stptr; 274 len = r->stlen + subseplen + 2; 275 emalloc(str, char *, len, "concat_exp"); 276 memcpy(str, r->stptr, r->stlen+1); 277 s = str + r->stlen; 278 free_temp(r); 279 for (tree = tree->rnode; tree != NULL; tree = tree->rnode) { 280 if (subseplen == 1) 281 *s++ = *subsep; 282 else { 283 memcpy(s, subsep, subseplen+1); 284 s += subseplen; 285 } 286 r = force_string(tree_eval(tree->lnode)); 287 len += r->stlen + subseplen; 288 offset = s - str; 289 erealloc(str, char *, len, "concat_exp"); 290 s = str + offset; 291 memcpy(s, r->stptr, r->stlen+1); 292 s += r->stlen; 293 free_temp(r); 294 } 295 r = make_str_node(str, s - str, ALREADY_MALLOCED); 296 r->flags |= TEMP; 297 return r; 298} 299 300/* assoc_clear --- flush all the values in symbol[] before doing a split() */ 301 302void 303assoc_clear(NODE *symbol) 304{ 305 int i; 306 NODE *bucket, *next; 307 308 if (symbol->var_array == NULL) 309 return; 310 for (i = 0; i < symbol->array_size; i++) { 311 for (bucket = symbol->var_array[i]; bucket != NULL; bucket = next) { 312 next = bucket->ahnext; 313 unref(bucket->ahvalue); 314 unref(bucket); /* unref() will free the ahname_str */ 315 } 316 symbol->var_array[i] = NULL; 317 } 318 free(symbol->var_array); 319 symbol->var_array = NULL; 320 symbol->array_size = symbol->table_size = 0; 321 symbol->flags &= ~ARRAYMAXED; 322} 323 324/* hash --- calculate the hash function of the string in subs */ 325 326static unsigned long 327awk_hash(register const char *s, register size_t len, unsigned long hsize) 328{ 329 register unsigned long h = 0; 330 331 /* 332 * This is INCREDIBLY ugly, but fast. We break the string up into 333 * 8 byte units. On the first time through the loop we get the 334 * "leftover bytes" (strlen % 8). On every other iteration, we 335 * perform 8 HASHC's so we handle all 8 bytes. Essentially, this 336 * saves us 7 cmp & branch instructions. If this routine is 337 * heavily used enough, it's worth the ugly coding. 338 * 339 * OZ's original sdbm hash, copied from Margo Seltzers db package. 340 */ 341 342 /* 343 * Even more speed: 344 * #define HASHC h = *s++ + 65599 * h 345 * Because 65599 = pow(2, 6) + pow(2, 16) - 1 we multiply by shifts 346 */ 347#define HASHC htmp = (h << 6); \ 348 h = *s++ + htmp + (htmp << 10) - h 349 350 unsigned long htmp; 351 352 h = 0; 353 354#if defined(VAXC) 355 /* 356 * This was an implementation of "Duff's Device", but it has been 357 * redone, separating the switch for extra iterations from the 358 * loop. This is necessary because the DEC VAX-C compiler is 359 * STOOPID. 360 */ 361 switch (len & (8 - 1)) { 362 case 7: HASHC; 363 case 6: HASHC; 364 case 5: HASHC; 365 case 4: HASHC; 366 case 3: HASHC; 367 case 2: HASHC; 368 case 1: HASHC; 369 default: break; 370 } 371 372 if (len > (8 - 1)) { 373 register size_t loop = len >> 3; 374 do { 375 HASHC; 376 HASHC; 377 HASHC; 378 HASHC; 379 HASHC; 380 HASHC; 381 HASHC; 382 HASHC; 383 } while (--loop); 384 } 385#else /* ! VAXC */ 386 /* "Duff's Device" for those who can handle it */ 387 if (len > 0) { 388 register size_t loop = (len + 8 - 1) >> 3; 389 390 switch (len & (8 - 1)) { 391 case 0: 392 do { /* All fall throughs */ 393 HASHC; 394 case 7: HASHC; 395 case 6: HASHC; 396 case 5: HASHC; 397 case 4: HASHC; 398 case 3: HASHC; 399 case 2: HASHC; 400 case 1: HASHC; 401 } while (--loop); 402 } 403 } 404#endif /* ! VAXC */ 405 406 if (h >= hsize) 407 h %= hsize; 408 return h; 409} 410 411/* assoc_find --- locate symbol[subs] */ 412 413static NODE * /* NULL if not found */ 414assoc_find(NODE *symbol, register NODE *subs, int hash1) 415{ 416 register NODE *bucket; 417 const char *s1_str; 418 size_t s1_len; 419 NODE *s2; 420 421 for (bucket = symbol->var_array[hash1]; bucket != NULL; 422 bucket = bucket->ahnext) { 423 /* 424 * This used to use cmp_nodes() here. That's wrong. 425 * Array indexes are strings; compare as such, always! 426 */ 427 s1_str = bucket->ahname_str; 428 s1_len = bucket->ahname_len; 429 s2 = subs; 430 431 if (s1_len == s2->stlen) { 432 if (s1_len == 0 /* "" is a valid index */ 433 || STREQN(s1_str, s2->stptr, s1_len)) 434 return bucket; 435 } 436 } 437 return NULL; 438} 439 440/* in_array --- test whether the array element symbol[subs] exists or not, 441 * return pointer to value if it does. 442 */ 443 444NODE * 445in_array(NODE *symbol, NODE *subs) 446{ 447 register int hash1; 448 NODE *ret; 449 450 symbol = get_array(symbol); 451 452 /* 453 * Evaluate subscript first, it could have side effects. 454 */ 455 subs = concat_exp(subs); /* concat_exp returns a string node */ 456 if (symbol->var_array == NULL) { 457 free_temp(subs); 458 return NULL; 459 } 460 hash1 = hash(subs->stptr, subs->stlen, (unsigned long) symbol->array_size); 461 ret = assoc_find(symbol, subs, hash1); 462 free_temp(subs); 463 if (ret) 464 return ret->ahvalue; 465 else 466 return NULL; 467} 468 469/* 470 * assoc_lookup: 471 * Find SYMBOL[SUBS] in the assoc array. Install it with value "" if it 472 * isn't there. Returns a pointer ala get_lhs to where its value is stored. 473 * 474 * SYMBOL is the address of the node (or other pointer) being dereferenced. 475 * SUBS is a number or string used as the subscript. 476 */ 477 478NODE ** 479assoc_lookup(NODE *symbol, NODE *subs, int reference) 480{ 481 register int hash1; 482 register NODE *bucket; 483 484 assert(symbol->type == Node_var_array); 485 486 (void) force_string(subs); 487 488 if (symbol->var_array == NULL) { 489 symbol->array_size = symbol->table_size = 0; /* sanity */ 490 symbol->flags &= ~ARRAYMAXED; 491 grow_table(symbol); 492 hash1 = hash(subs->stptr, subs->stlen, 493 (unsigned long) symbol->array_size); 494 } else { 495 hash1 = hash(subs->stptr, subs->stlen, 496 (unsigned long) symbol->array_size); 497 bucket = assoc_find(symbol, subs, hash1); 498 if (bucket != NULL) { 499 free_temp(subs); 500 return &(bucket->ahvalue); 501 } 502 } 503 504 if (do_lint && reference) { 505 subs->stptr[subs->stlen] = '\0'; 506 lintwarn(_("reference to uninitialized element `%s[\"%s\"]'"), 507 array_vname(symbol), subs->stptr); 508 } 509 510 /* It's not there, install it. */ 511 if (do_lint && subs->stlen == 0) 512 lintwarn(_("subscript of array `%s' is null string"), 513 array_vname(symbol)); 514 515 /* first see if we would need to grow the array, before installing */ 516 symbol->table_size++; 517 if ((symbol->flags & ARRAYMAXED) == 0 518 && (symbol->table_size / symbol->array_size) > AVG_CHAIN_MAX) { 519 grow_table(symbol); 520 /* have to recompute hash value for new size */ 521 hash1 = hash(subs->stptr, subs->stlen, 522 (unsigned long) symbol->array_size); 523 } 524 525 getnode(bucket); 526 bucket->type = Node_ahash; 527 528 /* 529 * Freeze this string value --- it must never 530 * change, no matter what happens to the value 531 * that created it or to CONVFMT, etc. 532 * 533 * One day: Use an atom table to track array indices, 534 * and avoid the extra memory overhead. 535 */ 536 bucket->flags |= MALLOC; 537 bucket->ahname_ref = 1; 538 emalloc(bucket->ahname_str, char *, subs->stlen + 2, "assoc_lookup"); 539 bucket->ahname_len = subs->stlen; 540 541 memcpy(bucket->ahname_str, subs->stptr, subs->stlen); 542 bucket->ahname_str[bucket->ahname_len] = '\0'; 543 544 free_temp(subs); 545 546 bucket->ahvalue = Nnull_string; 547 bucket->ahnext = symbol->var_array[hash1]; 548 symbol->var_array[hash1] = bucket; 549 return &(bucket->ahvalue); 550} 551 552/* do_delete --- perform `delete array[s]' */ 553 554/* 555 * `symbol' is array 556 * `tree' is subscript 557 */ 558 559void 560do_delete(NODE *sym, NODE *tree) 561{ 562 register int hash1; 563 register NODE *bucket, *last; 564 NODE *subs; 565 register NODE *symbol = get_array(sym); 566 567 if (tree == NULL) { /* delete array */ 568 assoc_clear(symbol); 569 return; 570 } 571 572 last = NULL; /* shut up gcc -Wall */ 573 hash1 = 0; /* ditto */ 574 575 /* 576 * Always evaluate subscript, it could have side effects. 577 */ 578 subs = concat_exp(tree); /* concat_exp returns string node */ 579 580 if (symbol->var_array != NULL) { 581 hash1 = hash(subs->stptr, subs->stlen, 582 (unsigned long) symbol->array_size); 583 last = NULL; 584 for (bucket = symbol->var_array[hash1]; bucket != NULL; 585 last = bucket, bucket = bucket->ahnext) { 586 /* 587 * This used to use cmp_nodes() here. That's wrong. 588 * Array indexes are strings; compare as such, always! 589 */ 590 const char *s1_str; 591 size_t s1_len; 592 NODE *s2; 593 594 s1_str = bucket->ahname_str; 595 s1_len = bucket->ahname_len; 596 s2 = subs; 597 598 if (s1_len == s2->stlen) { 599 if (s1_len == 0 /* "" is a valid index */ 600 || STREQN(s1_str, s2->stptr, s1_len)) 601 break; 602 } 603 } 604 } else 605 bucket = NULL; /* The array is empty. */ 606 607 if (bucket == NULL) { 608 if (do_lint) 609 lintwarn(_("delete: index `%s' not in array `%s'"), 610 subs->stptr, array_vname(sym)); 611 free_temp(subs); 612 return; 613 } 614 615 free_temp(subs); 616 617 if (last != NULL) 618 last->ahnext = bucket->ahnext; 619 else 620 symbol->var_array[hash1] = bucket->ahnext; 621 unref(bucket->ahvalue); 622 unref(bucket); /* unref() will free the ahname_str */ 623 symbol->table_size--; 624 if (symbol->table_size <= 0) { 625 memset(symbol->var_array, '\0', 626 sizeof(NODE *) * symbol->array_size); 627 symbol->table_size = symbol->array_size = 0; 628 symbol->flags &= ~ARRAYMAXED; 629 free((char *) symbol->var_array); 630 symbol->var_array = NULL; 631 } 632} 633 634/* do_delete_loop --- simulate ``for (iggy in foo) delete foo[iggy]'' */ 635 636/* 637 * The primary hassle here is that `iggy' needs to have some arbitrary 638 * array index put in it before we can clear the array, we can't 639 * just replace the loop with `delete foo'. 640 */ 641 642void 643do_delete_loop(NODE *symbol, NODE *tree) 644{ 645 size_t i; 646 NODE **lhs; 647 Func_ptr after_assign = NULL; 648 649 symbol = get_array(symbol); 650 651 if (symbol->var_array == NULL) 652 return; 653 654 /* get first index value */ 655 for (i = 0; i < symbol->array_size; i++) { 656 if (symbol->var_array[i] != NULL) { 657 lhs = get_lhs(tree->lnode, & after_assign, FALSE); 658 unref(*lhs); 659 *lhs = make_string(symbol->var_array[i]->ahname_str, 660 symbol->var_array[i]->ahname_len); 661 if (after_assign) 662 (*after_assign)(); 663 break; 664 } 665 } 666 667 /* blast the array in one shot */ 668 assoc_clear(symbol); 669} 670 671/* grow_table --- grow a hash table */ 672 673static void 674grow_table(NODE *symbol) 675{ 676 NODE **old, **new, *chain, *next; 677 int i, j; 678 unsigned long hash1; 679 unsigned long oldsize, newsize; 680 /* 681 * This is an array of primes. We grow the table by an order of 682 * magnitude each time (not just doubling) so that growing is a 683 * rare operation. We expect, on average, that it won't happen 684 * more than twice. The final size is also chosen to be small 685 * enough so that MS-DOG mallocs can handle it. When things are 686 * very large (> 8K), we just double more or less, instead of 687 * just jumping from 8K to 64K. 688 */ 689 static const long sizes[] = { 13, 127, 1021, 8191, 16381, 32749, 65497, 690#if ! defined(MSDOS) && ! defined(OS2) && ! defined(atarist) 691 131101, 262147, 524309, 1048583, 2097169, 692 4194319, 8388617, 16777259, 33554467, 693 67108879, 134217757, 268435459, 536870923, 694 1073741827 695#endif 696 }; 697 698 /* find next biggest hash size */ 699 newsize = oldsize = symbol->array_size; 700 for (i = 0, j = sizeof(sizes)/sizeof(sizes[0]); i < j; i++) { 701 if (oldsize < sizes[i]) { 702 newsize = sizes[i]; 703 break; 704 } 705 } 706 707 if (newsize == oldsize) { /* table already at max (!) */ 708 symbol->flags |= ARRAYMAXED; 709 return; 710 } 711 712 /* allocate new table */ 713 emalloc(new, NODE **, newsize * sizeof(NODE *), "grow_table"); 714 memset(new, '\0', newsize * sizeof(NODE *)); 715 716 /* brand new hash table, set things up and return */ 717 if (symbol->var_array == NULL) { 718 symbol->table_size = 0; 719 goto done; 720 } 721 722 /* old hash table there, move stuff to new, free old */ 723 old = symbol->var_array; 724 for (i = 0; i < oldsize; i++) { 725 if (old[i] == NULL) 726 continue; 727 728 for (chain = old[i]; chain != NULL; chain = next) { 729 next = chain->ahnext; 730 hash1 = hash(chain->ahname_str, 731 chain->ahname_len, newsize); 732 733 /* remove from old list, add to new */ 734 chain->ahnext = new[hash1]; 735 new[hash1] = chain; 736 } 737 } 738 free(old); 739 740done: 741 /* 742 * note that symbol->table_size does not change if an old array, 743 * and is explicitly set to 0 if a new one. 744 */ 745 symbol->var_array = new; 746 symbol->array_size = newsize; 747} 748 749/* pr_node --- print simple node info */ 750 751static void 752pr_node(NODE *n) 753{ 754 if ((n->flags & (NUMCUR|NUMBER)) != 0) 755 printf("%g", n->numbr); 756 else 757 printf("%.*s", (int) n->stlen, n->stptr); 758} 759 760/* assoc_dump --- dump the contents of an array */ 761 762NODE * 763assoc_dump(NODE *symbol) 764{ 765 int i; 766 NODE *bucket; 767 768 if (symbol->var_array == NULL) { 769 printf(_("%s: empty (null)\n"), symbol->vname); 770 return tmp_number((AWKNUM) 0); 771 } 772 773 if (symbol->table_size == 0) { 774 printf(_("%s: empty (zero)\n"), symbol->vname); 775 return tmp_number((AWKNUM) 0); 776 } 777 778 printf(_("%s: table_size = %d, array_size = %d\n"), symbol->vname, 779 (int) symbol->table_size, (int) symbol->array_size); 780 781 for (i = 0; i < symbol->array_size; i++) { 782 for (bucket = symbol->var_array[i]; bucket != NULL; 783 bucket = bucket->ahnext) { 784 printf("%s: I: [len %d <%.*s>] V: [", 785 symbol->vname, 786 (int) bucket->ahname_len, 787 (int) bucket->ahname_len, 788 bucket->ahname_str); 789 pr_node(bucket->ahvalue); 790 printf("]\n"); 791 } 792 } 793 794 return tmp_number((AWKNUM) 0); 795} 796 797/* do_adump --- dump an array: interface to assoc_dump */ 798 799NODE * 800do_adump(NODE *tree) 801{ 802 NODE *r, *a; 803 804 a = tree->lnode; 805 806 if (a->type == Node_param_list) { 807 printf(_("%s: is parameter\n"), a->vname); 808 a = stack_ptr[a->param_cnt]; 809 } 810 811 if (a->type == Node_array_ref) { 812 printf(_("%s: array_ref to %s\n"), a->vname, 813 a->orig_array->vname); 814 a = a->orig_array; 815 } 816 817 r = assoc_dump(a); 818 819 return r; 820} 821 822/* 823 * The following functions implement the builtin 824 * asort function. Initial work by Alan J. Broder, 825 * ajb@woti.com. 826 */ 827 828/* dup_table --- duplicate input symbol table "symbol" */ 829 830static void 831dup_table(NODE *symbol, NODE *newsymb) 832{ 833 NODE **old, **new, *chain, *bucket; 834 int i; 835 unsigned long cursize; 836 837 /* find the current hash size */ 838 cursize = symbol->array_size; 839 840 new = NULL; 841 842 /* input is a brand new hash table, so there's nothing to copy */ 843 if (symbol->var_array == NULL) 844 newsymb->table_size = 0; 845 else { 846 /* old hash table there, dupnode stuff into a new table */ 847 848 /* allocate new table */ 849 emalloc(new, NODE **, cursize * sizeof(NODE *), "dup_table"); 850 memset(new, '\0', cursize * sizeof(NODE *)); 851 852 /* do the copying/dupnode'ing */ 853 old = symbol->var_array; 854 for (i = 0; i < cursize; i++) { 855 if (old[i] != NULL) { 856 for (chain = old[i]; chain != NULL; 857 chain = chain->ahnext) { 858 /* get a node for the linked list */ 859 getnode(bucket); 860 bucket->type = Node_ahash; 861 bucket->flags |= MALLOC; 862 bucket->ahname_ref = 1; 863 864 /* 865 * copy the corresponding name and 866 * value from the original input list 867 */ 868 emalloc(bucket->ahname_str, char *, chain->ahname_len + 2, "dup_table"); 869 bucket->ahname_len = chain->ahname_len; 870 871 memcpy(bucket->ahname_str, chain->ahname_str, chain->ahname_len); 872 bucket->ahname_str[bucket->ahname_len] = '\0'; 873 874 bucket->ahvalue = dupnode(chain->ahvalue); 875 876 /* 877 * put the node on the corresponding 878 * linked list in the new table 879 */ 880 bucket->ahnext = new[i]; 881 new[i] = bucket; 882 } 883 } 884 } 885 newsymb->table_size = symbol->table_size; 886 } 887 888 newsymb->var_array = new; 889 newsymb->array_size = cursize; 890} 891 892/* merge --- do a merge of two sorted lists */ 893 894static NODE * 895merge(NODE *left, NODE *right) 896{ 897 NODE *ans, *cur; 898 899 /* 900 * The use of cmp_nodes() here means that IGNORECASE influences the 901 * comparison. This is OK, but it may be surprising. This comment 902 * serves to remind us that we know about this and that it's OK. 903 */ 904 if (cmp_nodes(left->ahvalue, right->ahvalue) <= 0) { 905 ans = cur = left; 906 left = left->ahnext; 907 } else { 908 ans = cur = right; 909 right = right->ahnext; 910 } 911 912 while (left != NULL && right != NULL) { 913 if (cmp_nodes(left->ahvalue, right->ahvalue) <= 0) { 914 cur->ahnext = left; 915 cur = left; 916 left = left->ahnext; 917 } else { 918 cur->ahnext = right; 919 cur = right; 920 right = right->ahnext; 921 } 922 } 923 924 cur->ahnext = (left != NULL ? left : right); 925 926 return ans; 927} 928 929/* merge_sort --- recursively sort the left and right sides of a list */ 930 931static NODE * 932merge_sort(NODE *left, int size) 933{ 934 NODE *right, *tmp; 935 int i, half; 936 937 if (size <= 1) 938 return left; 939 940 /* walk down the list, till just one before the midpoint */ 941 tmp = left; 942 half = size / 2; 943 for (i = 0; i < half-1; i++) 944 tmp = tmp->ahnext; 945 946 /* split the list into two parts */ 947 right = tmp->ahnext; 948 tmp->ahnext = NULL; 949 950 /* sort the left and right parts of the list */ 951 left = merge_sort(left, half); 952 right = merge_sort(right, size-half); 953 954 /* merge the two sorted parts of the list */ 955 return merge(left, right); 956} 957 958 959/* 960 * assoc_from_list -- Populate an array with the contents of a list of NODEs, 961 * using increasing integers as the key. 962 */ 963 964static void 965assoc_from_list(NODE *symbol, NODE *list) 966{ 967 NODE *next; 968 unsigned long i = 0; 969 register int hash1; 970 char buf[100]; 971 972 for (; list != NULL; list = next) { 973 next = list->ahnext; 974 975 /* make an int out of i++ */ 976 i++; 977 sprintf(buf, "%lu", i); 978 assert(list->ahname_str == NULL); 979 assert(list->ahname_ref == 1); 980 emalloc(list->ahname_str, char *, strlen(buf) + 2, "assoc_from_list"); 981 list->ahname_len = strlen(buf); 982 strcpy(list->ahname_str, buf); 983 984 /* find the bucket where it belongs */ 985 hash1 = hash(list->ahname_str, list->ahname_len, 986 symbol->array_size); 987 988 /* link the node into the chain at that bucket */ 989 list->ahnext = symbol->var_array[hash1]; 990 symbol->var_array[hash1] = list; 991 } 992} 993 994/* 995 * assoc_sort_inplace --- sort all the values in symbol[], replacing 996 * the sorted values back into symbol[], indexed by integers starting with 1. 997 */ 998 999typedef enum asort_how { VALUE, INDEX } ASORT_TYPE; 1000 1001static NODE * 1002assoc_sort_inplace(NODE *symbol, ASORT_TYPE how) 1003{ 1004 int i, num; 1005 NODE *bucket, *next, *list; 1006 1007 if (symbol->var_array == NULL 1008 || symbol->array_size <= 0 1009 || symbol->table_size <= 0) 1010 return tmp_number((AWKNUM) 0); 1011 1012 /* build a linked list out of all the entries in the table */ 1013 if (how == VALUE) { 1014 list = NULL; 1015 num = 0; 1016 for (i = 0; i < symbol->array_size; i++) { 1017 for (bucket = symbol->var_array[i]; bucket != NULL; bucket = next) { 1018 next = bucket->ahnext; 1019 if (bucket->ahname_ref == 1) { 1020 free(bucket->ahname_str); 1021 bucket->ahname_str = NULL; 1022 bucket->ahname_len = 0; 1023 } else { 1024 NODE *r; 1025 1026 getnode(r); 1027 *r = *bucket; 1028 unref(bucket); 1029 bucket = r; 1030 bucket->flags |= MALLOC; 1031 bucket->ahname_ref = 1; 1032 bucket->ahname_str = NULL; 1033 bucket->ahname_len = 0; 1034 } 1035 bucket->ahnext = list; 1036 list = bucket; 1037 num++; 1038 } 1039 symbol->var_array[i] = NULL; 1040 } 1041 } else { /* how == INDEX */ 1042 list = NULL; 1043 num = 0; 1044 for (i = 0; i < symbol->array_size; i++) { 1045 for (bucket = symbol->var_array[i]; bucket != NULL; bucket = next) { 1046 next = bucket->ahnext; 1047 1048 /* toss old value */ 1049 unref(bucket->ahvalue); 1050 1051 /* move index into value */ 1052 if (bucket->ahname_ref == 1) { 1053 bucket->ahvalue = make_str_node(bucket->ahname_str, 1054 bucket->ahname_len, ALREADY_MALLOCED); 1055 bucket->ahname_str = NULL; 1056 bucket->ahname_len = 0; 1057 } else { 1058 NODE *r; 1059 1060 bucket->ahvalue = make_string(bucket->ahname_str, bucket->ahname_len); 1061 getnode(r); 1062 *r = *bucket; 1063 unref(bucket); 1064 bucket = r; 1065 bucket->flags |= MALLOC; 1066 bucket->ahname_ref = 1; 1067 bucket->ahname_str = NULL; 1068 bucket->ahname_len = 0; 1069 } 1070 1071 bucket->ahnext = list; 1072 list = bucket; 1073 num++; 1074 } 1075 symbol->var_array[i] = NULL; 1076 } 1077 } 1078 1079 /* 1080 * Sort the linked list of NODEs. 1081 * (The especially nice thing about using a merge sort here is that 1082 * we require absolutely no additional storage. This is handy if the 1083 * array has grown to be very large.) 1084 */ 1085 list = merge_sort(list, num); 1086 1087 /* 1088 * now repopulate the original array, using increasing 1089 * integers as the key 1090 */ 1091 assoc_from_list(symbol, list); 1092 1093 return tmp_number((AWKNUM) num); 1094} 1095 1096/* asort_actual --- do the actual work to sort the input array */ 1097 1098static NODE * 1099asort_actual(NODE *tree, ASORT_TYPE how) 1100{ 1101 NODE *array = get_array(tree->lnode); 1102 1103 if (tree->rnode != NULL) { /* 2nd optional arg */ 1104 NODE *dest = get_array(tree->rnode->lnode); 1105 1106 assoc_clear(dest); 1107 dup_table(array, dest); 1108 array = dest; 1109 } 1110 1111 return assoc_sort_inplace(array, how); 1112} 1113 1114/* do_asort --- sort array by value */ 1115 1116NODE * 1117do_asort(NODE *tree) 1118{ 1119 return asort_actual(tree, VALUE); 1120} 1121 1122/* do_asorti --- sort array by index */ 1123 1124NODE * 1125do_asorti(NODE *tree) 1126{ 1127 return asort_actual(tree, INDEX); 1128} 1129 1130/* 1131From bonzini@gnu.org Mon Oct 28 16:05:26 2002 1132Date: Mon, 28 Oct 2002 13:33:03 +0100 1133From: Paolo Bonzini <bonzini@gnu.org> 1134To: arnold@skeeve.com 1135Subject: Hash function 1136Message-ID: <20021028123303.GA6832@biancaneve> 1137 1138Here is the hash function I'm using in GNU Smalltalk. The scrambling is 1139needed if you use powers of two as the table sizes. If you use primes it 1140is not needed. 1141 1142To use double-hashing with power-of-two size, you should use the 1143_gst_hash_string(str, len) as the primary hash and 1144scramble(_gst_hash_string (str, len)) | 1 as the secondary hash. 1145 1146Paolo 1147 1148*/ 1149/* 1150 * ADR: Slightly modified to work w/in the context of gawk. 1151 */ 1152 1153static unsigned long 1154gst_hash_string(const char *str, size_t len, unsigned long hsize) 1155{ 1156 unsigned long hashVal = 1497032417; /* arbitrary value */ 1157 unsigned long ret; 1158 1159 while (len--) { 1160 hashVal += *str++; 1161 hashVal += (hashVal << 10); 1162 hashVal ^= (hashVal >> 6); 1163 } 1164 1165 ret = scramble(hashVal); 1166 if (ret >= hsize) 1167 ret %= hsize; 1168 1169 return ret; 1170} 1171 1172static unsigned long 1173scramble(unsigned long x) 1174{ 1175 if (sizeof(long) == 4) { 1176 int y = ~x; 1177 1178 x += (y << 10) | (y >> 22); 1179 x += (x << 6) | (x >> 26); 1180 x -= (x << 16) | (x >> 16); 1181 } else { 1182 x ^= (~x) >> 31; 1183 x += (x << 21) | (x >> 11); 1184 x += (x << 5) | (x >> 27); 1185 x += (x << 27) | (x >> 5); 1186 x += (x << 31); 1187 } 1188 1189 return x; 1190} 1191