string_table.c revision 1618:8c9a4f31d225
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <string_table.h>
30#include <strings.h>
31#include <sgs.h>
32#include <stdio.h>
33
34
35
36/*
37 * This file provides the interfaces to build a Str_tbl suitable
38 * for use by either the sgsmsg system or a standard ELF
39 * SHT_STRTAB.
40 *
41 * There are two modes which can be used when constructing a
42 * string table:
43 *
44 *	st_new(0)
45 *		standard string table - no compression.  This is the
46 *		traditional method and fast
47 *
48 *	st_new(FLG_STNEW_COMPRESS)
49 *		build a compressed string table which both
50 *		eliminates duplicate strings and permits
51 *		strings with common suffixes (atexit vs. exit) to
52 *		overlap in the table.  This provides space
53 *		savings for many string tables.
54 *
55 * These string tables are now built with a common interface in a
56 * two-pass manner, the first pass it to find all of the strings
57 * required for the string-table and to calculate the size that
58 * will be required for the final string table.
59 *
60 * The second pass allocates the string table and populates the
61 * strings into the table and returns the offsets the strings
62 * have been assigned.
63 *
64 * The calling sequence to build and populate a string table is:
65 *
66 *		st_new();		// initialize strtab
67 *
68 *		st_insert(st1);		// first pass of strings ...
69 *					// calculates size required for
70 *					// string table
71 *
72 *		st_delstring(st?);	// remove string previously
73 *					// inserted
74 *
75 *		st_insert(stN);
76 *
77 *		st_getstrtab_sz();	// freezes strtab and computes
78 *					// size of table.
79 *
80 *		st_setstrbuf();		// associates a final destination
81 *					// for the string table
82 *
83 *		st_setstring(st1);	// populate the string table
84 *		...			// offsets are based off of second
85 *					// pass	through the string table
86 *		st_setstring(stN);
87 *
88 *		st_destroy();		// tear down string table
89 *					// structures.
90 *
91 * String Suffix Compression Algorithm:
92 *
93 *   Here's a quick high level overview of the Suffix String
94 *   compression algorithm used.  First - the heart of the algorithm
95 *   is a Hash table list which represents a dictionary of all unique
96 *   strings inserted into the string table.  The hash function for
97 *   this table is a standard string hash except that the hash starts
98 *   at the last character in the string (&str[n - 1]) and works towards
99 *   the first character in the function (&str[0]).  As we compute the
100 *   HASH value for a given string, we also compute the hash values
101 *   for all of the possible suffix strings for that string.
102 *
103 *   As we compute the hash - at each character see if the current
104 *   suffix string for that hash is already present in the table.  If
105 *   it is, and the string is a master string.  Then change that
106 *   string to a suffix string of the new string being inserted.
107 *
108 *   When the final hash value is found (hash for str[0...n]), check
109 *   to see if it is in the hash table - if so increment the reference
110 *   count for the string.  If it is not yet in the table, insert a
111 *   new hash table entry for a master string.
112 *
113 *   The above method will find all suffixes of a given string given
114 *   that the strings are inserted from shortest to longest.  That is
115 *   why this is a two phase method, we first collect all of the
116 *   strings and store them based off of their length in a nice AVL tree.
117 *   Once all of the strings have been submitted we then start the
118 *   hash table build by traversing the AVL tree in order and
119 *   inserting the strings from shortest to longest as described
120 *   above.
121 *
122 */
123
124/* LINTLIBRARY */
125
126
127int
128strlen_compare(const void *elem1, const void *elem2)
129{
130	uint_t	l1, l2;
131	l1 = ((Stringelem *)elem1)->se_stlen;
132	l2 = ((Stringelem *)elem2)->se_stlen;
133
134	if (l1 == l2)
135		return (0);
136	if (l2 < l1)
137		return (1);
138
139	return (-1);
140}
141
142/*
143 * Return a initialized Str_tbl - returns NULL on failure.
144 *
145 * stflags:
146 *
147 *	FLG_STNEW_COMPRESS - build a compressed string table
148 *
149 */
150Str_tbl *
151st_new(uint_t stflags)
152{
153	Str_tbl	*stp;
154
155	if ((stp = calloc(sizeof (Str_tbl), 1)) == 0)
156		return (0);
157
158	/*
159	 * Start with a leading '\0' - it's tradition.
160	 */
161	stp->st_stringsize = stp->st_fullstringsize = stp->st_nextoff = 1;
162
163	/*
164	 * Do we compress this string table
165	 */
166	if ((stflags & FLG_STNEW_COMPRESS) == 0)
167		return (stp);
168
169	stp->st_flags |= FLG_STTAB_COMPRESS;
170	if ((stp->st_strtree = calloc(sizeof (avl_tree_t), 1)) == 0) {
171		return (0);
172	}
173
174	avl_create(stp->st_strtree, &strlen_compare, sizeof (Stringelem),
175		SGSOFFSETOF(Stringelem, se_avlnode));
176
177	return (stp);
178}
179
180/*
181 * Tear down a String_Table structure.
182 */
183void
184st_destroy(Str_tbl *stp)
185{
186	Str_hash	*sthash, *psthash;
187	Str_master	*mstr, *pmstr;
188	uint_t		i;
189
190	/*
191	 * cleanup the master strings
192	 */
193	for (mstr = stp->st_mstrlist, pmstr = 0; mstr;
194	    mstr = mstr->sm_next) {
195		if (pmstr)
196			free(pmstr);
197		pmstr = mstr;
198	}
199	if (pmstr)
200		free(pmstr);
201
202	if (stp->st_hashbcks) {
203		for (i = 0; i < stp->st_hbckcnt; i++) {
204			for (sthash = stp->st_hashbcks[i], psthash = 0;
205			    sthash; sthash = sthash->hi_next) {
206				if (psthash)
207					free(psthash);
208				psthash = sthash;
209			}
210			if (psthash)
211				free(psthash);
212		}
213		free(stp->st_hashbcks);
214	}
215	free(stp);
216}
217
218
219
220
221/*
222 * Remove a previously inserted string from the Str_tbl
223 */
224int
225st_delstring(Str_tbl *stp, const char *str)
226{
227	uint_t		stlen;
228	Stringelem	qstelem;
229	Stringelem	*stelem;
230	Stringlist	*stlist, *pstlist;
231
232	/*
233	 * String table can't have been cooked
234	 */
235	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
236
237	stlen = (uint_t)strlen(str);
238	stp->st_fullstringsize -= stlen + 1;
239
240	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
241		return (0);
242
243	qstelem.se_stlen = stlen;
244	if ((stelem = avl_find(stp->st_strtree, &qstelem, 0)) == NULL) {
245		/*
246		 * no strings of this length recorded, let alone
247		 * this specific string - someone goofed.
248		 */
249		return (-1);
250	}
251
252	pstlist = 0;
253	for (stlist = stelem->se_strlist; stlist; stlist = stlist->sl_next) {
254		if (strcmp(str, stlist->sl_string) == 0)
255			break;
256		pstlist = stlist;
257	}
258
259	if (stlist == 0) {
260		/*
261		 * string was not found
262		 */
263		return (-1);
264	}
265
266	if (pstlist == 0) {
267		/*
268		 * String is first on list.
269		 */
270		stelem->se_strlist = stlist->sl_next;
271	} else {
272		/*
273		 * remove string from list.
274		 */
275		pstlist->sl_next = stlist->sl_next;
276	}
277
278	free(stlist);
279	return (0);
280}
281
282
283/*
284 * Insert a new string into the Str_tbl
285 */
286int
287st_insert(Str_tbl *stp, const char *str)
288{
289	uint_t	stlen;
290	Stringelem	qstelem;
291	Stringelem	*stelem;
292	Stringlist	*strlist;
293	avl_index_t	where;
294
295	/*
296	 * String table can't have been cooked
297	 */
298	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
299	stlen = (uint_t)strlen(str);
300	/*
301	 * Null strings always point to the head of the string
302	 * table - no reason to keep searching.
303	 */
304	if (stlen == 0)
305		return (0);
306
307	stp->st_fullstringsize += stlen + 1;
308	stp->st_stringcnt++;
309
310	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
311		return (0);
312
313	qstelem.se_stlen = strlen(str);
314	if ((stelem = avl_find(stp->st_strtree, &qstelem,
315	    &where)) == NULL) {
316		if ((stelem = calloc(sizeof (Stringelem), 1)) == 0)
317			return (-1);
318		stelem->se_stlen = qstelem.se_stlen;
319		avl_insert(stp->st_strtree, stelem, where);
320	}
321	if ((strlist = malloc(sizeof (Stringlist))) == 0)
322		return (-1);
323
324	strlist->sl_string = str;
325	strlist->sl_next = stelem->se_strlist;
326	stelem->se_strlist = strlist;
327
328	return (0);
329}
330
331
332/*
333 * For a given string - copy it into the buffer associated with
334 * the string table - and return the offset it has been assigned.
335 *
336 * If a value of '-1' is returned - the string was not found in
337 * the Str_tbl.
338 */
339int
340st_setstring(Str_tbl *stp, const char *str, uint_t *stoff)
341{
342	uint_t		stlen;
343	uint_t		hashval;
344	Str_hash	*sthash;
345	Str_master	*mstr;
346	int		i;
347
348	/*
349	 * String table *must* have been previously cooked
350	 */
351	assert(stp->st_strbuf);
352
353	assert(stp->st_flags & FLG_STTAB_COOKED);
354	stlen = (uint_t)strlen(str);
355	/*
356	 * Null string always points to head of string table
357	 */
358	if (stlen == 0) {
359		*stoff = 0;
360		return (0);
361	}
362
363	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
364		uint_t		_stoff;
365
366		stlen++;	/* count for trailing '\0' */
367		_stoff = stp->st_nextoff;
368		/*
369		 * Have we overflowed our assigned buffer?
370		 */
371		if ((_stoff + stlen) > stp->st_fullstringsize)
372			return (-1);
373		memcpy(stp->st_strbuf + _stoff, str, stlen);
374		*stoff = _stoff;
375		stp->st_nextoff += stlen;
376		return (0);
377	}
378
379	/*
380	 * Calculate reverse hash for string
381	 */
382	hashval = HASHSEED;
383	for (i = stlen; i >= 0; i--) {
384		hashval = ((hashval << 5) + hashval) +
385			str[i];			/* h = ((h * 33) + c) */
386	}
387
388	for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash;
389	    sthash = sthash->hi_next) {
390		if (sthash->hi_hashval == hashval) {
391			const char	*hstr;
392
393			hstr = &sthash->hi_mstr->sm_str[
394			    sthash->hi_mstr->sm_stlen -
395			    sthash->hi_stlen];
396			if (strcmp(str, hstr) == 0) {
397				break;
398			}
399		}
400	}
401
402	/*
403	 * Did we find the string?
404	 */
405	if (sthash == 0)
406		return (-1);
407
408	/*
409	 * Has this string been copied into the string table?
410	 */
411	mstr = sthash->hi_mstr;
412	if (mstr->sm_stoff == 0) {
413		uint_t	mstlen = mstr->sm_stlen + 1;
414		mstr->sm_stoff = stp->st_nextoff;
415		/*
416		 * Have we overflowed our assigned buffer?
417		 */
418		if ((mstr->sm_stoff + mstlen) > stp->st_fullstringsize)
419			return (-1);
420		memcpy(stp->st_strbuf + mstr->sm_stoff, mstr->sm_str,
421			mstlen);
422		stp->st_nextoff += mstlen;
423	}
424	/*
425	 * Calculate offset of (sub)string
426	 */
427	*stoff = mstr->sm_stoff + mstr->sm_stlen - sthash->hi_stlen;
428
429	return (0);
430}
431
432
433static int
434st_hash_insert(Str_tbl *stp, const char *str, uint_t stlen)
435{
436	int		i;
437	uint_t		hashval = HASHSEED;
438	uint_t		bckcnt = stp->st_hbckcnt;
439	Str_hash	**hashbcks = stp->st_hashbcks;
440	Str_hash	*sthash;
441	Str_master	*mstr = 0;
442
443	/*
444	 * We use a classic 'Bernstein k=33' hash function.  But
445	 * instead of hashing from the start of the string to the
446	 * end, we do it in reverse.
447	 *
448	 * This way - we are essentially building all of the
449	 * suffix hashvalues as we go.  We can check to see if
450	 * any suffixes already exist in the tree as we generate
451	 * the hash.
452	 */
453	for (i = stlen; i >= 0; i--) {
454
455		hashval = ((hashval << 5) + hashval) +
456			str[i];			/* h = ((h * 33) + c) */
457		for (sthash = hashbcks[hashval % bckcnt];
458		    sthash; sthash = sthash->hi_next) {
459
460			if (sthash->hi_hashval == hashval) {
461				const char	*hstr;
462				Str_master	*_mstr;
463
464				_mstr = sthash->hi_mstr;
465				hstr = &_mstr->sm_str[_mstr->sm_stlen -
466				    sthash->hi_stlen];
467				if (strcmp(&str[i], hstr) == 0) {
468					if (i == 0) {
469						/*
470						 * Entry already in table,
471						 * increment refcnt and get
472						 * out.
473						 */
474						sthash->hi_refcnt++;
475						return (0);
476					} else {
477						/*
478						 * If this 'suffix' is
479						 * presently a 'master' string,
480						 * then take over it's record.
481						 */
482						if (sthash->hi_stlen ==
483						    _mstr->sm_stlen) {
484							/*
485							 * we should only do
486							 * this once.
487							 */
488							assert(mstr == 0);
489							mstr = _mstr;
490						}
491					}
492				}
493			}
494		}
495	}
496
497
498	/*
499	 * Do we need a new master string, or can we take over
500	 * one we already found in the table?
501	 */
502	if (mstr == 0) {
503		/*
504		 * allocate a new master string
505		 */
506		if ((mstr = calloc(sizeof (Str_hash), 1)) == 0)
507			return (-1);
508		mstr->sm_next = stp->st_mstrlist;
509		stp->st_mstrlist = mstr;
510		stp->st_stringsize += stlen + 1;
511	} else {
512		/*
513		 * We are taking over a existing master string,
514		 * the stringsize only increments by the
515		 * difference between the currnet string and the
516		 * previous master.
517		 */
518		assert(stlen > mstr->sm_stlen);
519		stp->st_stringsize += stlen - mstr->sm_stlen;
520	}
521
522	if ((sthash = calloc(sizeof (Str_hash), 1)) == 0)
523		return (-1);
524
525	mstr->sm_hashval = sthash->hi_hashval = hashval;
526	mstr->sm_stlen = sthash->hi_stlen = stlen;
527	mstr->sm_str = str;
528	sthash->hi_refcnt = 1;
529	sthash->hi_mstr = mstr;
530
531	/*
532	 * Insert string element into head of hash list
533	 */
534	hashval = hashval % bckcnt;
535	sthash->hi_next = hashbcks[hashval];
536	hashbcks[hashval] = sthash;
537	return (0);
538}
539
540/*
541 * Return amount of space required for the string table.
542 */
543uint_t
544st_getstrtab_sz(Str_tbl *stp)
545{
546	assert(stp->st_fullstringsize > 0);
547
548	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
549		stp->st_flags |= FLG_STTAB_COOKED;
550		return (stp->st_fullstringsize);
551	}
552
553
554	if ((stp->st_flags & FLG_STTAB_COOKED) == 0) {
555		Stringelem	*stelem;
556		void		*cookie;
557
558		stp->st_flags |= FLG_STTAB_COOKED;
559		/*
560		 * allocate a hash table about the size of # of
561		 * strings input.
562		 */
563		stp->st_hbckcnt = findprime(stp->st_stringcnt);
564		if ((stp->st_hashbcks =
565		    calloc(sizeof (Str_hash), stp->st_hbckcnt)) == NULL)
566			return (0);
567
568		/*
569		 * We now walk all of the strings in the list,
570		 * from shortest to longest, and insert them into
571		 * the hashtable.
572		 */
573		if ((stelem = avl_first(stp->st_strtree)) == NULL) {
574			/*
575			 * Is it possible we have a empty string table,
576			 * if so - the table still conains '\0'
577			 * so still return the size.
578			 */
579			if (avl_numnodes(stp->st_strtree) == 0) {
580				assert(stp->st_stringsize == 1);
581				return (stp->st_stringsize);
582			}
583			return (0);
584		}
585		while (stelem) {
586			Stringlist	*strlist, *pstrlist;
587
588			/*
589			 * Walk the string lists and insert them
590			 * into the hash list.  Once a string is
591			 * inserted we no longer need it's entry,
592			 * so free it
593			 */
594			for (strlist = stelem->se_strlist, pstrlist = 0;
595			    strlist; strlist = strlist->sl_next) {
596				if (st_hash_insert(stp, strlist->sl_string,
597				    stelem->se_stlen) == -1)
598					return (0);
599				if (pstrlist)
600					free(pstrlist);
601			}
602			free(pstrlist);
603			stelem->se_strlist = 0;
604			stelem = AVL_NEXT(stp->st_strtree, stelem);
605		}
606
607		/*
608		 * Now that all of the strings have been freed,
609		 * go ahead and quickly re-walk the AVL tree and
610		 * free all of the AVL nodes.
611		 *
612		 * avl_destroy_nodes() beats avl_remove() because
613		 * avl_remove will 'ballance' the tree as nodes
614		 * are deleted - we just want to tear the whole
615		 * thing down now.
616		 */
617		cookie = NULL;
618		while ((stelem = avl_destroy_nodes(stp->st_strtree,
619		    &cookie)) != NULL)
620			free(stelem);
621		avl_destroy(stp->st_strtree);
622		free(stp->st_strtree);
623		stp->st_strtree = 0;
624	}
625
626	assert(stp->st_stringsize > 0);
627	assert(stp->st_fullstringsize >= stp->st_stringsize);
628
629	return (stp->st_stringsize);
630}
631
632/*
633 * Associate a buffer with the string table.
634 */
635const char *
636st_getstrbuf(Str_tbl *stp)
637{
638	return (stp->st_strbuf);
639}
640
641int
642st_setstrbuf(Str_tbl *stp, char *stbuf, uint_t bufsize)
643{
644	assert(stp->st_flags & FLG_STTAB_COOKED);
645
646	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
647		if (bufsize < stp->st_fullstringsize)
648			return (-1);
649	} else {
650		if (bufsize < stp->st_stringsize)
651			return (-1);
652	}
653
654	stp->st_strbuf = stbuf;
655#ifdef	DEBUG
656	/*
657	 * for debug builds - start with a stringtable filled in
658	 * with '0xff'.  This makes it very easy to find wholes
659	 * which we failed to fill in - in the strtab.
660	 */
661	memset(stbuf, 0xff, bufsize);
662	stbuf[0] = '\0';
663#else
664	memset(stbuf, 0x0, bufsize);
665#endif
666	return (0);
667}
668