1/*
2 * array.c - routines for associative arrays.
3 */
4
5/*
6 * Copyright (C) 1986, 1988, 1989, 1991-2003 the Free Software Foundation, Inc.
7 *
8 * This file is part of GAWK, the GNU implementation of the
9 * AWK Programming Language.
10 *
11 * GAWK is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * GAWK is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
24 */
25
26/*
27 * Tree walks (``for (iggy in foo)'') and array deletions use expensive
28 * linear searching.  So what we do is start out with small arrays and
29 * grow them as needed, so that our arrays are hopefully small enough,
30 * most of the time, that they're pretty full and we're not looking at
31 * wasted space.
32 *
33 * The decision is made to grow the array if the average chain length is
34 * ``too big''. This is defined as the total number of entries in the table
35 * divided by the size of the array being greater than some constant.
36 *
37 * 11/2002: We make the constant a variable, so that it can be tweaked
38 * via environment variable.
39 */
40
41static int AVG_CHAIN_MAX = 2;	/* 11/2002: Modern machines are bigger, cut this down from 10. */
42
43#include "awk.h"
44
45static NODE *assoc_find P((NODE *symbol, NODE *subs, int hash1));
46static void grow_table P((NODE *symbol));
47
48static unsigned long gst_hash_string P((const char *str, size_t len, unsigned long hsize));
49static unsigned long scramble P((unsigned long x));
50static unsigned long awk_hash P((const char *s, size_t len, unsigned long hsize));
51
52unsigned long (*hash)P((const char *s, size_t len, unsigned long hsize)) = awk_hash;
53
54/* array_init --- possibly temporary function for experimentation purposes */
55
56void
57array_init()
58{
59	const char *val;
60	int newval;
61
62	if ((val = getenv("AVG_CHAIN_MAX")) != NULL && ISDIGIT(*val)) {
63		for (newval = 0; *val && ISDIGIT(*val); val++)
64			newval = (newval * 10) + *val - '0';
65
66		AVG_CHAIN_MAX = newval;
67	}
68
69	if ((val = getenv("AWK_HASH")) != NULL && strcmp(val, "gst") == 0)
70		hash = gst_hash_string;
71}
72
73/*
74 * get_actual --- proceed to the actual Node_var_array,
75 *	change Node_var_new to an array.
76 *	If canfatal and type isn't good, die fatally,
77 *	otherwise return the final actual value.
78 */
79
80NODE *
81get_actual(NODE *symbol, int canfatal)
82{
83	int isparam = (symbol->type == Node_param_list
84			&& (symbol->flags & FUNC) == 0);
85	NODE *save_symbol = symbol;
86
87	if (isparam) {
88		save_symbol = symbol = stack_ptr[symbol->param_cnt];
89		if (symbol->type == Node_array_ref)
90			symbol = symbol->orig_array;
91	}
92
93	switch (symbol->type) {
94	case Node_var_new:
95		symbol->type = Node_var_array;
96		symbol->var_array = NULL;
97		/* fall through */
98	case Node_var_array:
99		break;
100
101	case Node_array_ref:
102	case Node_param_list:
103		if (canfatal)
104			cant_happen();
105		/* else
106			fall through */
107
108	default:
109		/* notably Node_var but catches also e.g. FS[1] = "x" */
110		if (canfatal)
111			fatal(isparam ?
112				_("attempt to use scalar parameter `%s' as an array") :
113				_("attempt to use scalar `%s' as array"),
114								save_symbol->vname);
115		else
116			break;
117	}
118
119	return symbol;
120}
121
122/*
123 * array_vname --- print the name of the array
124 *
125 * Returns a pointer to a statically maintained dynamically allocated string.
126 * It's appropriate for printing the name once; if the caller wants
127 * to save it, they have to make a copy.
128 *
129 * Setting MAX_LEN to a positive value (eg. 140) would limit the length
130 * of the output to _roughly_ that length.
131 *
132 * If MAX_LEN == 0, which is the default, the whole stack is printed.
133 */
134#define	MAX_LEN 0
135
136char *
137array_vname(register const NODE *symbol)
138{
139	if (symbol->type == Node_param_list)
140		symbol = stack_ptr[symbol->param_cnt];
141
142	if (symbol->type != Node_array_ref || symbol->orig_array->type != Node_var_array)
143		return symbol->vname;
144	else {
145		static char *message = NULL;
146		static size_t msglen = 0;
147		char *s;
148		size_t len;
149		int n;
150		const NODE *save_symbol = symbol;
151		const char *from = _("from %s");
152
153#if (MAX_LEN <= 0) || !defined(HAVE_SNPRINTF)
154		/* This is the default branch. */
155
156		/* First, we have to compute the length of the string: */
157		len = strlen(symbol->vname) + 2;	/* "%s (" */
158		n = 0;
159		do {
160			symbol = symbol->prev_array;
161			len += strlen(symbol->vname);
162			n++;
163		} while	(symbol->type == Node_array_ref);
164		/*
165		 * Each node contributes by strlen(from) minus the length
166		 * of "%s" in the translation (which is at least 2)
167		 * plus 2 for ", " or ")\0"; this adds up to strlen(from).
168		 */
169		len += n * strlen(from);
170
171		/* (Re)allocate memory: */
172		if (message == NULL) {
173			emalloc(message, char *, len, "array_vname");
174			msglen = len;
175		} else if (len > msglen) {
176			erealloc(message, char *, len, "array_vname");
177			msglen = len;
178		} /* else
179			current buffer can hold new name */
180
181		/* We're ready to print: */
182		symbol = save_symbol;
183		s = message;
184		/*
185		 * Ancient systems have sprintf() returning char *, not int.
186		 * Thus, `s += sprintf(s, from, name);' is a no-no.
187		 */
188		sprintf(s, "%s (", symbol->vname);
189		s += strlen(s);
190		for (;;) {
191			symbol = symbol->prev_array;
192			sprintf(s, from, symbol->vname);
193			s += strlen(s);
194			if (symbol->type != Node_array_ref)
195				break;
196			sprintf(s, ", ");
197			s += strlen(s);
198		}
199		sprintf(s, ")");
200
201#else /* MAX_LEN > 0 */
202
203		/*
204		 * The following check fails only on
205		 * abnormally_long_variable_name.
206		 */
207#define PRINT_CHECK \
208		if (n <= 0 || n >= len) \
209			return save_symbol->vname; \
210		s += n; len -= n
211#define PRINT(str) \
212		n = snprintf(s, len, str); \
213		PRINT_CHECK
214#define PRINT_vname(str) \
215		n = snprintf(s, len, str, symbol->vname); \
216		PRINT_CHECK
217
218		if (message == NULL)
219			emalloc(message, char *, MAX_LEN, "array_vname");
220
221		s = message;
222		len = MAX_LEN;
223
224		/* First, print the vname of the node. */
225		PRINT_vname("%s (");
226
227		for (;;) {
228			symbol = symbol->prev_array;
229			/*
230			 * When we don't have enough space and this is not
231			 * the last node, shorten the list.
232			 */
233			if (len < 40 && symbol->type == Node_array_ref) {
234				PRINT("..., ");
235				symbol = symbol->orig_array;
236			}
237			PRINT_vname(from);
238			if (symbol->type != Node_array_ref)
239				break;
240			PRINT(", ");
241		}
242		PRINT(")");
243
244#undef PRINT_CHECK
245#undef PRINT
246#undef PRINT_vname
247#endif /* MAX_LEN <= 0 */
248
249		return message;
250	}
251}
252#undef MAX_LEN
253
254/* concat_exp --- concatenate expression list into a single string */
255
256NODE *
257concat_exp(register NODE *tree)
258{
259	register NODE *r;
260	char *str;
261	char *s;
262	size_t len;
263	int offset;
264	size_t subseplen;
265	const char *subsep;
266
267	if (tree->type != Node_expression_list)
268		return force_string(tree_eval(tree));
269	r = force_string(tree_eval(tree->lnode));
270	if (tree->rnode == NULL)
271		return r;
272	subseplen = SUBSEP_node->var_value->stlen;
273	subsep = SUBSEP_node->var_value->stptr;
274	len = r->stlen + subseplen + 2;
275	emalloc(str, char *, len, "concat_exp");
276	memcpy(str, r->stptr, r->stlen+1);
277	s = str + r->stlen;
278	free_temp(r);
279	for (tree = tree->rnode; tree != NULL; tree = tree->rnode) {
280		if (subseplen == 1)
281			*s++ = *subsep;
282		else {
283			memcpy(s, subsep, subseplen+1);
284			s += subseplen;
285		}
286		r = force_string(tree_eval(tree->lnode));
287		len += r->stlen + subseplen;
288		offset = s - str;
289		erealloc(str, char *, len, "concat_exp");
290		s = str + offset;
291		memcpy(s, r->stptr, r->stlen+1);
292		s += r->stlen;
293		free_temp(r);
294	}
295	r = make_str_node(str, s - str, ALREADY_MALLOCED);
296	r->flags |= TEMP;
297	return r;
298}
299
300/* assoc_clear --- flush all the values in symbol[] before doing a split() */
301
302void
303assoc_clear(NODE *symbol)
304{
305	int i;
306	NODE *bucket, *next;
307
308	if (symbol->var_array == NULL)
309		return;
310	for (i = 0; i < symbol->array_size; i++) {
311		for (bucket = symbol->var_array[i]; bucket != NULL; bucket = next) {
312			next = bucket->ahnext;
313			unref(bucket->ahvalue);
314			unref(bucket);	/* unref() will free the ahname_str */
315		}
316		symbol->var_array[i] = NULL;
317	}
318	free(symbol->var_array);
319	symbol->var_array = NULL;
320	symbol->array_size = symbol->table_size = 0;
321	symbol->flags &= ~ARRAYMAXED;
322}
323
324/* hash --- calculate the hash function of the string in subs */
325
326static unsigned long
327awk_hash(register const char *s, register size_t len, unsigned long hsize)
328{
329	register unsigned long h = 0;
330
331	/*
332	 * This is INCREDIBLY ugly, but fast.  We break the string up into
333	 * 8 byte units.  On the first time through the loop we get the
334	 * "leftover bytes" (strlen % 8).  On every other iteration, we
335	 * perform 8 HASHC's so we handle all 8 bytes.  Essentially, this
336	 * saves us 7 cmp & branch instructions.  If this routine is
337	 * heavily used enough, it's worth the ugly coding.
338	 *
339	 * OZ's original sdbm hash, copied from Margo Seltzers db package.
340	 */
341
342	/*
343	 * Even more speed:
344	 * #define HASHC   h = *s++ + 65599 * h
345	 * Because 65599 = pow(2, 6) + pow(2, 16) - 1 we multiply by shifts
346	 */
347#define HASHC   htmp = (h << 6);  \
348		h = *s++ + htmp + (htmp << 10) - h
349
350	unsigned long htmp;
351
352	h = 0;
353
354#if defined(VAXC)
355	/*
356	 * This was an implementation of "Duff's Device", but it has been
357	 * redone, separating the switch for extra iterations from the
358	 * loop. This is necessary because the DEC VAX-C compiler is
359	 * STOOPID.
360	 */
361	switch (len & (8 - 1)) {
362	case 7:		HASHC;
363	case 6:		HASHC;
364	case 5:		HASHC;
365	case 4:		HASHC;
366	case 3:		HASHC;
367	case 2:		HASHC;
368	case 1:		HASHC;
369	default:	break;
370	}
371
372	if (len > (8 - 1)) {
373		register size_t loop = len >> 3;
374		do {
375			HASHC;
376			HASHC;
377			HASHC;
378			HASHC;
379			HASHC;
380			HASHC;
381			HASHC;
382			HASHC;
383		} while (--loop);
384	}
385#else /* ! VAXC */
386	/* "Duff's Device" for those who can handle it */
387	if (len > 0) {
388		register size_t loop = (len + 8 - 1) >> 3;
389
390		switch (len & (8 - 1)) {
391		case 0:
392			do {	/* All fall throughs */
393				HASHC;
394		case 7:		HASHC;
395		case 6:		HASHC;
396		case 5:		HASHC;
397		case 4:		HASHC;
398		case 3:		HASHC;
399		case 2:		HASHC;
400		case 1:		HASHC;
401			} while (--loop);
402		}
403	}
404#endif /* ! VAXC */
405
406	if (h >= hsize)
407		h %= hsize;
408	return h;
409}
410
411/* assoc_find --- locate symbol[subs] */
412
413static NODE *				/* NULL if not found */
414assoc_find(NODE *symbol, register NODE *subs, int hash1)
415{
416	register NODE *bucket;
417	const char *s1_str;
418	size_t s1_len;
419	NODE *s2;
420
421	for (bucket = symbol->var_array[hash1]; bucket != NULL;
422			bucket = bucket->ahnext) {
423		/*
424		 * This used to use cmp_nodes() here.  That's wrong.
425		 * Array indexes are strings; compare as such, always!
426		 */
427		s1_str = bucket->ahname_str;
428		s1_len = bucket->ahname_len;
429		s2 = subs;
430
431		if (s1_len == s2->stlen) {
432			if (s1_len == 0		/* "" is a valid index */
433			    || STREQN(s1_str, s2->stptr, s1_len))
434				return bucket;
435		}
436	}
437	return NULL;
438}
439
440/* in_array --- test whether the array element symbol[subs] exists or not,
441 * 		return pointer to value if it does.
442 */
443
444NODE *
445in_array(NODE *symbol, NODE *subs)
446{
447	register int hash1;
448	NODE *ret;
449
450	symbol = get_array(symbol);
451
452	/*
453	 * Evaluate subscript first, it could have side effects.
454	 */
455	subs = concat_exp(subs);	/* concat_exp returns a string node */
456	if (symbol->var_array == NULL) {
457		free_temp(subs);
458		return NULL;
459	}
460	hash1 = hash(subs->stptr, subs->stlen, (unsigned long) symbol->array_size);
461	ret = assoc_find(symbol, subs, hash1);
462	free_temp(subs);
463	if (ret)
464		return ret->ahvalue;
465	else
466		return NULL;
467}
468
469/*
470 * assoc_lookup:
471 * Find SYMBOL[SUBS] in the assoc array.  Install it with value "" if it
472 * isn't there. Returns a pointer ala get_lhs to where its value is stored.
473 *
474 * SYMBOL is the address of the node (or other pointer) being dereferenced.
475 * SUBS is a number or string used as the subscript.
476 */
477
478NODE **
479assoc_lookup(NODE *symbol, NODE *subs, int reference)
480{
481	register int hash1;
482	register NODE *bucket;
483
484	assert(symbol->type == Node_var_array);
485
486	(void) force_string(subs);
487
488	if (symbol->var_array == NULL) {
489		symbol->array_size = symbol->table_size = 0;	/* sanity */
490		symbol->flags &= ~ARRAYMAXED;
491		grow_table(symbol);
492		hash1 = hash(subs->stptr, subs->stlen,
493				(unsigned long) symbol->array_size);
494	} else {
495		hash1 = hash(subs->stptr, subs->stlen,
496				(unsigned long) symbol->array_size);
497		bucket = assoc_find(symbol, subs, hash1);
498		if (bucket != NULL) {
499			free_temp(subs);
500			return &(bucket->ahvalue);
501		}
502	}
503
504	if (do_lint && reference) {
505		subs->stptr[subs->stlen] = '\0';
506		lintwarn(_("reference to uninitialized element `%s[\"%s\"]'"),
507		      array_vname(symbol), subs->stptr);
508	}
509
510	/* It's not there, install it. */
511	if (do_lint && subs->stlen == 0)
512		lintwarn(_("subscript of array `%s' is null string"),
513			array_vname(symbol));
514
515	/* first see if we would need to grow the array, before installing */
516	symbol->table_size++;
517	if ((symbol->flags & ARRAYMAXED) == 0
518	    && (symbol->table_size / symbol->array_size) > AVG_CHAIN_MAX) {
519		grow_table(symbol);
520		/* have to recompute hash value for new size */
521		hash1 = hash(subs->stptr, subs->stlen,
522				(unsigned long) symbol->array_size);
523	}
524
525	getnode(bucket);
526	bucket->type = Node_ahash;
527
528	/*
529	 * Freeze this string value --- it must never
530	 * change, no matter what happens to the value
531	 * that created it or to CONVFMT, etc.
532	 *
533	 * One day: Use an atom table to track array indices,
534	 * and avoid the extra memory overhead.
535	 */
536	bucket->flags |= MALLOC;
537	bucket->ahname_ref = 1;
538	emalloc(bucket->ahname_str, char *, subs->stlen + 2, "assoc_lookup");
539	bucket->ahname_len = subs->stlen;
540
541	memcpy(bucket->ahname_str, subs->stptr, subs->stlen);
542	bucket->ahname_str[bucket->ahname_len] = '\0';
543
544	free_temp(subs);
545
546	bucket->ahvalue = Nnull_string;
547	bucket->ahnext = symbol->var_array[hash1];
548	symbol->var_array[hash1] = bucket;
549	return &(bucket->ahvalue);
550}
551
552/* do_delete --- perform `delete array[s]' */
553
554/*
555 * `symbol' is array
556 * `tree' is subscript
557 */
558
559void
560do_delete(NODE *sym, NODE *tree)
561{
562	register int hash1;
563	register NODE *bucket, *last;
564	NODE *subs;
565	register NODE *symbol = get_array(sym);
566
567	if (tree == NULL) {	/* delete array */
568		assoc_clear(symbol);
569		return;
570	}
571
572	last = NULL;	/* shut up gcc -Wall */
573	hash1 = 0;	/* ditto */
574
575	/*
576	 * Always evaluate subscript, it could have side effects.
577	 */
578	subs = concat_exp(tree);	/* concat_exp returns string node */
579
580	if (symbol->var_array != NULL) {
581		hash1 = hash(subs->stptr, subs->stlen,
582				(unsigned long) symbol->array_size);
583		last = NULL;
584		for (bucket = symbol->var_array[hash1]; bucket != NULL;
585				last = bucket, bucket = bucket->ahnext) {
586			/*
587			 * This used to use cmp_nodes() here.  That's wrong.
588			 * Array indexes are strings; compare as such, always!
589			 */
590			const char *s1_str;
591			size_t s1_len;
592			NODE *s2;
593
594			s1_str = bucket->ahname_str;
595			s1_len = bucket->ahname_len;
596			s2 = subs;
597
598			if (s1_len == s2->stlen) {
599				if (s1_len == 0		/* "" is a valid index */
600				    || STREQN(s1_str, s2->stptr, s1_len))
601					break;
602			}
603		}
604	} else
605		bucket = NULL;	/* The array is empty.  */
606
607	if (bucket == NULL) {
608		if (do_lint)
609			lintwarn(_("delete: index `%s' not in array `%s'"),
610				subs->stptr, array_vname(sym));
611		free_temp(subs);
612		return;
613	}
614
615	free_temp(subs);
616
617	if (last != NULL)
618		last->ahnext = bucket->ahnext;
619	else
620		symbol->var_array[hash1] = bucket->ahnext;
621	unref(bucket->ahvalue);
622	unref(bucket);	/* unref() will free the ahname_str */
623	symbol->table_size--;
624	if (symbol->table_size <= 0) {
625		memset(symbol->var_array, '\0',
626			sizeof(NODE *) * symbol->array_size);
627		symbol->table_size = symbol->array_size = 0;
628		symbol->flags &= ~ARRAYMAXED;
629		free((char *) symbol->var_array);
630		symbol->var_array = NULL;
631	}
632}
633
634/* do_delete_loop --- simulate ``for (iggy in foo) delete foo[iggy]'' */
635
636/*
637 * The primary hassle here is that `iggy' needs to have some arbitrary
638 * array index put in it before we can clear the array, we can't
639 * just replace the loop with `delete foo'.
640 */
641
642void
643do_delete_loop(NODE *symbol, NODE *tree)
644{
645	size_t i;
646	NODE **lhs;
647	Func_ptr after_assign = NULL;
648
649	symbol = get_array(symbol);
650
651	if (symbol->var_array == NULL)
652		return;
653
654	/* get first index value */
655	for (i = 0; i < symbol->array_size; i++) {
656		if (symbol->var_array[i] != NULL) {
657			lhs = get_lhs(tree->lnode, & after_assign, FALSE);
658			unref(*lhs);
659			*lhs = make_string(symbol->var_array[i]->ahname_str,
660					symbol->var_array[i]->ahname_len);
661			if (after_assign)
662				(*after_assign)();
663			break;
664		}
665	}
666
667	/* blast the array in one shot */
668	assoc_clear(symbol);
669}
670
671/* grow_table --- grow a hash table */
672
673static void
674grow_table(NODE *symbol)
675{
676	NODE **old, **new, *chain, *next;
677	int i, j;
678	unsigned long hash1;
679	unsigned long oldsize, newsize;
680	/*
681	 * This is an array of primes. We grow the table by an order of
682	 * magnitude each time (not just doubling) so that growing is a
683	 * rare operation. We expect, on average, that it won't happen
684	 * more than twice.  The final size is also chosen to be small
685	 * enough so that MS-DOG mallocs can handle it. When things are
686	 * very large (> 8K), we just double more or less, instead of
687	 * just jumping from 8K to 64K.
688	 */
689	static const long sizes[] = { 13, 127, 1021, 8191, 16381, 32749, 65497,
690#if ! defined(MSDOS) && ! defined(OS2) && ! defined(atarist)
691				131101, 262147, 524309, 1048583, 2097169,
692				4194319, 8388617, 16777259, 33554467,
693				67108879, 134217757, 268435459, 536870923,
694				1073741827
695#endif
696	};
697
698	/* find next biggest hash size */
699	newsize = oldsize = symbol->array_size;
700	for (i = 0, j = sizeof(sizes)/sizeof(sizes[0]); i < j; i++) {
701		if (oldsize < sizes[i]) {
702			newsize = sizes[i];
703			break;
704		}
705	}
706
707	if (newsize == oldsize) {	/* table already at max (!) */
708		symbol->flags |= ARRAYMAXED;
709		return;
710	}
711
712	/* allocate new table */
713	emalloc(new, NODE **, newsize * sizeof(NODE *), "grow_table");
714	memset(new, '\0', newsize * sizeof(NODE *));
715
716	/* brand new hash table, set things up and return */
717	if (symbol->var_array == NULL) {
718		symbol->table_size = 0;
719		goto done;
720	}
721
722	/* old hash table there, move stuff to new, free old */
723	old = symbol->var_array;
724	for (i = 0; i < oldsize; i++) {
725		if (old[i] == NULL)
726			continue;
727
728		for (chain = old[i]; chain != NULL; chain = next) {
729			next = chain->ahnext;
730			hash1 = hash(chain->ahname_str,
731					chain->ahname_len, newsize);
732
733			/* remove from old list, add to new */
734			chain->ahnext = new[hash1];
735			new[hash1] = chain;
736		}
737	}
738	free(old);
739
740done:
741	/*
742	 * note that symbol->table_size does not change if an old array,
743	 * and is explicitly set to 0 if a new one.
744	 */
745	symbol->var_array = new;
746	symbol->array_size = newsize;
747}
748
749/* pr_node --- print simple node info */
750
751static void
752pr_node(NODE *n)
753{
754	if ((n->flags & (NUMCUR|NUMBER)) != 0)
755		printf("%g", n->numbr);
756	else
757		printf("%.*s", (int) n->stlen, n->stptr);
758}
759
760/* assoc_dump --- dump the contents of an array */
761
762NODE *
763assoc_dump(NODE *symbol)
764{
765	int i;
766	NODE *bucket;
767
768	if (symbol->var_array == NULL) {
769		printf(_("%s: empty (null)\n"), symbol->vname);
770		return tmp_number((AWKNUM) 0);
771	}
772
773	if (symbol->table_size == 0) {
774		printf(_("%s: empty (zero)\n"), symbol->vname);
775		return tmp_number((AWKNUM) 0);
776	}
777
778	printf(_("%s: table_size = %d, array_size = %d\n"), symbol->vname,
779			(int) symbol->table_size, (int) symbol->array_size);
780
781	for (i = 0; i < symbol->array_size; i++) {
782		for (bucket = symbol->var_array[i]; bucket != NULL;
783				bucket = bucket->ahnext) {
784			printf("%s: I: [len %d <%.*s>] V: [",
785				symbol->vname,
786				(int) bucket->ahname_len,
787				(int) bucket->ahname_len,
788				bucket->ahname_str);
789			pr_node(bucket->ahvalue);
790			printf("]\n");
791		}
792	}
793
794	return tmp_number((AWKNUM) 0);
795}
796
797/* do_adump --- dump an array: interface to assoc_dump */
798
799NODE *
800do_adump(NODE *tree)
801{
802	NODE *r, *a;
803
804	a = tree->lnode;
805
806	if (a->type == Node_param_list) {
807		printf(_("%s: is parameter\n"), a->vname);
808		a = stack_ptr[a->param_cnt];
809	}
810
811	if (a->type == Node_array_ref) {
812		printf(_("%s: array_ref to %s\n"), a->vname,
813					a->orig_array->vname);
814		a = a->orig_array;
815	}
816
817	r = assoc_dump(a);
818
819	return r;
820}
821
822/*
823 * The following functions implement the builtin
824 * asort function.  Initial work by Alan J. Broder,
825 * ajb@woti.com.
826 */
827
828/* dup_table --- duplicate input symbol table "symbol" */
829
830static void
831dup_table(NODE *symbol, NODE *newsymb)
832{
833	NODE **old, **new, *chain, *bucket;
834	int i;
835	unsigned long cursize;
836
837	/* find the current hash size */
838	cursize = symbol->array_size;
839
840	new = NULL;
841
842	/* input is a brand new hash table, so there's nothing to copy */
843	if (symbol->var_array == NULL)
844		newsymb->table_size = 0;
845	else {
846		/* old hash table there, dupnode stuff into a new table */
847
848		/* allocate new table */
849		emalloc(new, NODE **, cursize * sizeof(NODE *), "dup_table");
850		memset(new, '\0', cursize * sizeof(NODE *));
851
852		/* do the copying/dupnode'ing */
853		old = symbol->var_array;
854		for (i = 0; i < cursize; i++) {
855			if (old[i] != NULL) {
856				for (chain = old[i]; chain != NULL;
857						chain = chain->ahnext) {
858					/* get a node for the linked list */
859					getnode(bucket);
860					bucket->type = Node_ahash;
861					bucket->flags |= MALLOC;
862					bucket->ahname_ref = 1;
863
864					/*
865					 * copy the corresponding name and
866					 * value from the original input list
867					 */
868					emalloc(bucket->ahname_str, char *, chain->ahname_len + 2, "dup_table");
869					bucket->ahname_len = chain->ahname_len;
870
871					memcpy(bucket->ahname_str, chain->ahname_str, chain->ahname_len);
872					bucket->ahname_str[bucket->ahname_len] = '\0';
873
874					bucket->ahvalue = dupnode(chain->ahvalue);
875
876					/*
877					 * put the node on the corresponding
878					 * linked list in the new table
879					 */
880					bucket->ahnext = new[i];
881					new[i] = bucket;
882				}
883			}
884		}
885		newsymb->table_size = symbol->table_size;
886	}
887
888	newsymb->var_array = new;
889	newsymb->array_size = cursize;
890}
891
892/* merge --- do a merge of two sorted lists */
893
894static NODE *
895merge(NODE *left, NODE *right)
896{
897	NODE *ans, *cur;
898
899	/*
900	 * The use of cmp_nodes() here means that IGNORECASE influences the
901	 * comparison.  This is OK, but it may be surprising.  This comment
902	 * serves to remind us that we know about this and that it's OK.
903	 */
904	if (cmp_nodes(left->ahvalue, right->ahvalue) <= 0) {
905		ans = cur = left;
906		left = left->ahnext;
907	} else {
908		ans = cur = right;
909		right = right->ahnext;
910	}
911
912	while (left != NULL && right != NULL) {
913		if (cmp_nodes(left->ahvalue, right->ahvalue) <= 0) {
914			cur->ahnext = left;
915			cur = left;
916			left  = left->ahnext;
917		} else {
918			cur->ahnext = right;
919			cur = right;
920			right = right->ahnext;
921		}
922	}
923
924	cur->ahnext = (left != NULL ? left : right);
925
926	return ans;
927}
928
929/* merge_sort --- recursively sort the left and right sides of a list */
930
931static NODE *
932merge_sort(NODE *left, int size)
933{
934	NODE *right, *tmp;
935	int i, half;
936
937	if (size <= 1)
938		return left;
939
940	/* walk down the list, till just one before the midpoint */
941	tmp = left;
942	half = size / 2;
943	for (i = 0; i < half-1; i++)
944		tmp = tmp->ahnext;
945
946	/* split the list into two parts */
947	right = tmp->ahnext;
948	tmp->ahnext = NULL;
949
950	/* sort the left and right parts of the list */
951	left  = merge_sort(left,       half);
952	right = merge_sort(right, size-half);
953
954	/* merge the two sorted parts of the list */
955	return merge(left, right);
956}
957
958
959/*
960 * assoc_from_list -- Populate an array with the contents of a list of NODEs,
961 * using increasing integers as the key.
962 */
963
964static void
965assoc_from_list(NODE *symbol, NODE *list)
966{
967	NODE *next;
968	unsigned long i = 0;
969	register int hash1;
970	char buf[100];
971
972	for (; list != NULL; list = next) {
973		next = list->ahnext;
974
975		/* make an int out of i++ */
976		i++;
977		sprintf(buf, "%lu", i);
978		assert(list->ahname_str == NULL);
979		assert(list->ahname_ref == 1);
980		emalloc(list->ahname_str, char *, strlen(buf) + 2, "assoc_from_list");
981		list->ahname_len = strlen(buf);
982		strcpy(list->ahname_str, buf);
983
984		/* find the bucket where it belongs */
985		hash1 = hash(list->ahname_str, list->ahname_len,
986				symbol->array_size);
987
988		/* link the node into the chain at that bucket */
989		list->ahnext = symbol->var_array[hash1];
990		symbol->var_array[hash1] = list;
991	}
992}
993
994/*
995 * assoc_sort_inplace --- sort all the values in symbol[], replacing
996 * the sorted values back into symbol[], indexed by integers starting with 1.
997 */
998
999typedef enum asort_how { VALUE, INDEX } ASORT_TYPE;
1000
1001static NODE *
1002assoc_sort_inplace(NODE *symbol, ASORT_TYPE how)
1003{
1004	int i, num;
1005	NODE *bucket, *next, *list;
1006
1007	if (symbol->var_array == NULL
1008	    || symbol->array_size <= 0
1009	    || symbol->table_size <= 0)
1010		return tmp_number((AWKNUM) 0);
1011
1012	/* build a linked list out of all the entries in the table */
1013	if (how == VALUE) {
1014		list = NULL;
1015		num = 0;
1016		for (i = 0; i < symbol->array_size; i++) {
1017			for (bucket = symbol->var_array[i]; bucket != NULL; bucket = next) {
1018				next = bucket->ahnext;
1019				if (bucket->ahname_ref == 1) {
1020					free(bucket->ahname_str);
1021					bucket->ahname_str = NULL;
1022					bucket->ahname_len = 0;
1023				} else {
1024					NODE *r;
1025
1026					getnode(r);
1027					*r = *bucket;
1028					unref(bucket);
1029					bucket = r;
1030					bucket->flags |= MALLOC;
1031					bucket->ahname_ref = 1;
1032					bucket->ahname_str = NULL;
1033					bucket->ahname_len = 0;
1034				}
1035				bucket->ahnext = list;
1036				list = bucket;
1037				num++;
1038			}
1039			symbol->var_array[i] = NULL;
1040		}
1041	} else {	/* how == INDEX */
1042		list = NULL;
1043		num = 0;
1044		for (i = 0; i < symbol->array_size; i++) {
1045			for (bucket = symbol->var_array[i]; bucket != NULL; bucket = next) {
1046				next = bucket->ahnext;
1047
1048				/* toss old value */
1049				unref(bucket->ahvalue);
1050
1051				/* move index into value */
1052				if (bucket->ahname_ref == 1) {
1053					bucket->ahvalue = make_str_node(bucket->ahname_str,
1054								bucket->ahname_len, ALREADY_MALLOCED);
1055					bucket->ahname_str = NULL;
1056					bucket->ahname_len = 0;
1057				} else {
1058					NODE *r;
1059
1060					bucket->ahvalue = make_string(bucket->ahname_str, bucket->ahname_len);
1061					getnode(r);
1062					*r = *bucket;
1063					unref(bucket);
1064					bucket = r;
1065					bucket->flags |= MALLOC;
1066					bucket->ahname_ref = 1;
1067					bucket->ahname_str = NULL;
1068					bucket->ahname_len = 0;
1069				}
1070
1071				bucket->ahnext = list;
1072				list = bucket;
1073				num++;
1074			}
1075			symbol->var_array[i] = NULL;
1076		}
1077	}
1078
1079	/*
1080	 * Sort the linked list of NODEs.
1081	 * (The especially nice thing about using a merge sort here is that
1082	 * we require absolutely no additional storage. This is handy if the
1083	 * array has grown to be very large.)
1084	 */
1085	list = merge_sort(list, num);
1086
1087	/*
1088	 * now repopulate the original array, using increasing
1089	 * integers as the key
1090	 */
1091	assoc_from_list(symbol, list);
1092
1093	return tmp_number((AWKNUM) num);
1094}
1095
1096/* asort_actual --- do the actual work to sort the input array */
1097
1098static NODE *
1099asort_actual(NODE *tree, ASORT_TYPE how)
1100{
1101	NODE *array = get_array(tree->lnode);
1102
1103	if (tree->rnode != NULL) {  /* 2nd optional arg */
1104		NODE *dest = get_array(tree->rnode->lnode);
1105
1106		assoc_clear(dest);
1107		dup_table(array, dest);
1108		array = dest;
1109	}
1110
1111	return assoc_sort_inplace(array, how);
1112}
1113
1114/* do_asort --- sort array by value */
1115
1116NODE *
1117do_asort(NODE *tree)
1118{
1119	return asort_actual(tree, VALUE);
1120}
1121
1122/* do_asorti --- sort array by index */
1123
1124NODE *
1125do_asorti(NODE *tree)
1126{
1127	return asort_actual(tree, INDEX);
1128}
1129
1130/*
1131From bonzini@gnu.org  Mon Oct 28 16:05:26 2002
1132Date: Mon, 28 Oct 2002 13:33:03 +0100
1133From: Paolo Bonzini <bonzini@gnu.org>
1134To: arnold@skeeve.com
1135Subject: Hash function
1136Message-ID: <20021028123303.GA6832@biancaneve>
1137
1138Here is the hash function I'm using in GNU Smalltalk.  The scrambling is
1139needed if you use powers of two as the table sizes.  If you use primes it
1140is not needed.
1141
1142To use double-hashing with power-of-two size, you should use the
1143_gst_hash_string(str, len) as the primary hash and
1144scramble(_gst_hash_string (str, len)) | 1 as the secondary hash.
1145
1146Paolo
1147
1148*/
1149/*
1150 * ADR: Slightly modified to work w/in the context of gawk.
1151 */
1152
1153static unsigned long
1154gst_hash_string(const char *str, size_t len, unsigned long hsize)
1155{
1156	unsigned long hashVal = 1497032417;    /* arbitrary value */
1157	unsigned long ret;
1158
1159	while (len--) {
1160		hashVal += *str++;
1161		hashVal += (hashVal << 10);
1162		hashVal ^= (hashVal >> 6);
1163	}
1164
1165	ret = scramble(hashVal);
1166	if (ret >= hsize)
1167		ret %= hsize;
1168
1169	return ret;
1170}
1171
1172static unsigned long
1173scramble(unsigned long x)
1174{
1175	if (sizeof(long) == 4) {
1176		int y = ~x;
1177
1178		x += (y << 10) | (y >> 22);
1179		x += (x << 6)  | (x >> 26);
1180		x -= (x << 16) | (x >> 16);
1181	} else {
1182		x ^= (~x) >> 31;
1183		x += (x << 21) | (x >> 11);
1184		x += (x << 5) | (x >> 27);
1185		x += (x << 27) | (x >> 5);
1186		x += (x << 31);
1187	}
1188
1189	return x;
1190}
1191