1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <_string_table.h>
30#include <strings.h>
31#include <sgs.h>
32#include <stdio.h>
33
34/*
35 * This file provides the interfaces to build a Str_tbl suitable for use by
36 * either the sgsmsg message system, or a standard ELF string table (SHT_STRTAB)
37 * as created by ld(1).
38 *
39 * There are two modes which can be used when constructing a string table:
40 *
41 *	st_new(0)
42 *		standard string table - no compression.  This is the
43 *		traditional, fast method.
44 *
45 *	st_new(FLG_STTAB_COMPRESS)
46 *		builds a compressed string table which both eliminates
47 *		duplicate strings, and permits strings with common suffixes
48 *		(atexit vs. exit) to overlap in the table.  This provides space
49 *		savings for many string tables.  Although more work than the
50 *		traditional method, the algorithms used are designed to scale
51 *		and keep any overhead at a minimum.
52 *
53 * These string tables are built with a common interface in a two-pass manner.
54 * The first pass finds all of the strings required for the string-table and
55 * calculates the size required for the final string table.
56 *
57 * The second pass allocates the string table, populates the strings into the
58 * table and returns the offsets the strings have been assigned.
59 *
60 * The calling sequence to build and populate a string table is:
61 *
62 *		st_new();		// initialize strtab
63 *
64 *		st_insert(st1);		// first pass of strings ...
65 *					// calculates size required for
66 *					// string table
67 *
68 *		st_delstring(st?);	// remove string previously
69 *					// inserted
70 *		st_insert(stN);
71 *
72 *		st_getstrtab_sz();	// freezes strtab and computes
73 *					// size of table.
74 *
75 *		st_setstrbuf();		// associates a final destination
76 *					// for the string table
77 *
78 *		st_setstring(st1);	// populate the string table
79 *		...			// offsets are based off of second
80 *					// pass	through the string table
81 *		st_setstring(stN);
82 *
83 *		st_destroy();		// tear down string table
84 *					// structures.
85 *
86 * String Suffix Compression Algorithm:
87 *
88 *   Here's a quick high level overview of the Suffix String
89 *   compression algorithm used.  First - the heart of the algorithm
90 *   is a Hash table list which represents a dictionary of all unique
91 *   strings inserted into the string table.  The hash function for
92 *   this table is a standard string hash except that the hash starts
93 *   at the last character in the string (&str[n - 1]) and works towards
94 *   the first character in the function (&str[0]).  As we compute the
95 *   HASH value for a given string, we also compute the hash values
96 *   for all of the possible suffix strings for that string.
97 *
98 *   As we compute the hash - at each character see if the current
99 *   suffix string for that hash is already present in the table.  If
100 *   it is, and the string is a master string.  Then change that
101 *   string to a suffix string of the new string being inserted.
102 *
103 *   When the final hash value is found (hash for str[0...n]), check
104 *   to see if it is in the hash table - if so increment the reference
105 *   count for the string.  If it is not yet in the table, insert a
106 *   new hash table entry for a master string.
107 *
108 *   The above method will find all suffixes of a given string given
109 *   that the strings are inserted from shortest to longest.  That is
110 *   why this is a two phase method, we first collect all of the
111 *   strings and store them based off of their length in an AVL tree.
112 *   Once all of the strings have been submitted we then start the
113 *   hash table build by traversing the AVL tree in order and
114 *   inserting the strings from shortest to longest as described
115 *   above.
116 */
117
118/* LINTLIBRARY */
119
120static int
121avl_len_compare(const void *n1, const void *n2)
122{
123	size_t	len1, len2;
124
125	len1 = ((LenNode *)n1)->ln_strlen;
126	len2 = ((LenNode *)n2)->ln_strlen;
127
128	if (len1 == len2)
129		return (0);
130	if (len2 < len1)
131		return (1);
132	return (-1);
133}
134
135static int
136avl_str_compare(const void *n1, const void *n2)
137{
138	const char	*str1, *str2;
139	int		rc;
140
141	str1 = ((StrNode *)n1)->sn_str;
142	str2 = ((StrNode *)n2)->sn_str;
143
144	rc = strcmp(str1, str2);
145	if (rc > 0)
146		return (1);
147	if (rc < 0)
148		return (-1);
149	return (0);
150}
151
152/*
153 * Return an initialized Str_tbl - returns NULL on failure.
154 *
155 * flags:
156 *	FLG_STTAB_COMPRESS - build a compressed string table
157 */
158Str_tbl *
159st_new(uint_t flags)
160{
161	Str_tbl	*stp;
162
163	if ((stp = calloc(sizeof (Str_tbl), 1)) == NULL)
164		return (NULL);
165
166	/*
167	 * Start with a leading '\0' - it's tradition.
168	 */
169	stp->st_strsize = stp->st_fullstrsize = stp->st_nextoff = 1;
170
171	/*
172	 * Do we compress this string table?
173	 */
174	stp->st_flags = flags;
175	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
176		return (stp);
177
178	if ((stp->st_lentree = calloc(sizeof (avl_tree_t), 1)) == NULL)
179		return (NULL);
180
181	avl_create(stp->st_lentree, &avl_len_compare, sizeof (LenNode),
182	    SGSOFFSETOF(LenNode, ln_avlnode));
183
184	return (stp);
185}
186
187/*
188 * Insert a new string into the Str_tbl.  There are two AVL trees used.
189 *
190 *  .	The first LenNode AVL tree maintains a tree of nodes based on string
191 *	sizes.
192 *  .	Each LenNode maintains a StrNode AVL tree for each string.  Large
193 *	applications have been known to contribute thousands of strings of
194 *	the same size.  Should strings need to be removed (-z ignore), then
195 *	the string AVL tree makes this removal efficient and scalable.
196 */
197int
198st_insert(Str_tbl *stp, const char *str)
199{
200	size_t		len;
201	StrNode		*snp, sn = { 0 };
202	LenNode		*lnp, ln = { 0 };
203	avl_index_t	where;
204
205	/*
206	 * String table can't have been cooked
207	 */
208	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
209
210	/*
211	 * Null strings always point to the head of the string
212	 * table - no reason to keep searching.
213	 */
214	if ((len = strlen(str)) == 0)
215		return (0);
216
217	stp->st_fullstrsize += len + 1;
218	stp->st_strcnt++;
219
220	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
221		return (0);
222
223	/*
224	 * From the controlling string table, determine which LenNode AVL node
225	 * provides for this string length.  If the node doesn't exist, insert
226	 * a new node to represent this string length.
227	 */
228	ln.ln_strlen = len;
229	if ((lnp = avl_find(stp->st_lentree, &ln, &where)) == NULL) {
230		if ((lnp = calloc(sizeof (LenNode), 1)) == NULL)
231			return (-1);
232		lnp->ln_strlen = len;
233		avl_insert(stp->st_lentree, lnp, where);
234
235		if ((lnp->ln_strtree = calloc(sizeof (avl_tree_t), 1)) == NULL)
236			return (0);
237
238		avl_create(lnp->ln_strtree, &avl_str_compare, sizeof (StrNode),
239		    SGSOFFSETOF(StrNode, sn_avlnode));
240	}
241
242	/*
243	 * From the string length AVL node determine whether a StrNode AVL node
244	 * provides this string.  If the node doesn't exist, insert a new node
245	 * to represent this string.
246	 */
247	sn.sn_str = str;
248	if ((snp = avl_find(lnp->ln_strtree, &sn, &where)) == NULL) {
249		if ((snp = calloc(sizeof (StrNode), 1)) == NULL)
250			return (-1);
251		snp->sn_str = str;
252		avl_insert(lnp->ln_strtree, snp, where);
253	}
254	snp->sn_refcnt++;
255
256	return (0);
257}
258
259/*
260 * Remove a previously inserted string from the Str_tbl.
261 */
262int
263st_delstring(Str_tbl *stp, const char *str)
264{
265	size_t		len;
266	LenNode		*lnp, ln = { 0 };
267	StrNode		*snp, sn = { 0 };
268
269	/*
270	 * String table can't have been cooked
271	 */
272	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
273
274	len = strlen(str);
275	stp->st_fullstrsize -= len + 1;
276
277	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
278		return (0);
279
280	/*
281	 * Determine which LenNode AVL node provides for this string length.
282	 */
283	ln.ln_strlen = len;
284	if ((lnp = avl_find(stp->st_lentree, &ln, 0)) != NULL) {
285		sn.sn_str = str;
286		if ((snp = avl_find(lnp->ln_strtree, &sn, 0)) != NULL) {
287			/*
288			 * Reduce the reference count, and if zero remove the
289			 * node.
290			 */
291			if (--snp->sn_refcnt == 0)
292				avl_remove(lnp->ln_strtree, snp);
293			return (0);
294		}
295	}
296
297	/*
298	 * No strings of this length, or no string itself - someone goofed.
299	 */
300	return (-1);
301}
302
303/*
304 * Tear down a String_Table structure.
305 */
306void
307st_destroy(Str_tbl *stp)
308{
309	Str_hash	*sthash, *psthash;
310	Str_master	*mstr, *pmstr;
311	uint_t		i;
312
313	/*
314	 * cleanup the master strings
315	 */
316	for (mstr = stp->st_mstrlist, pmstr = 0; mstr;
317	    mstr = mstr->sm_next) {
318		if (pmstr)
319			free(pmstr);
320		pmstr = mstr;
321	}
322	if (pmstr)
323		free(pmstr);
324
325	if (stp->st_hashbcks) {
326		for (i = 0; i < stp->st_hbckcnt; i++) {
327			for (sthash = stp->st_hashbcks[i], psthash = 0;
328			    sthash; sthash = sthash->hi_next) {
329				if (psthash)
330					free(psthash);
331				psthash = sthash;
332			}
333			if (psthash)
334				free(psthash);
335		}
336		free(stp->st_hashbcks);
337	}
338	free(stp);
339}
340
341
342/*
343 * For a given string - copy it into the buffer associated with
344 * the string table - and return the offset it has been assigned.
345 *
346 * If a value of '-1' is returned - the string was not found in
347 * the Str_tbl.
348 */
349int
350st_setstring(Str_tbl *stp, const char *str, size_t *stoff)
351{
352	size_t		stlen;
353	uint_t		hashval;
354	Str_hash	*sthash;
355	Str_master	*mstr;
356	int		i;
357
358	/*
359	 * String table *must* have been previously cooked
360	 */
361	assert(stp->st_strbuf);
362
363	assert(stp->st_flags & FLG_STTAB_COOKED);
364	stlen = strlen(str);
365	/*
366	 * Null string always points to head of string table
367	 */
368	if (stlen == 0) {
369		*stoff = 0;
370		return (0);
371	}
372
373	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
374		size_t		_stoff;
375
376		stlen++;	/* count for trailing '\0' */
377		_stoff = stp->st_nextoff;
378		/*
379		 * Have we overflowed our assigned buffer?
380		 */
381		if ((_stoff + stlen) > stp->st_fullstrsize)
382			return (-1);
383		memcpy(stp->st_strbuf + _stoff, str, stlen);
384		*stoff = _stoff;
385		stp->st_nextoff += stlen;
386		return (0);
387	}
388
389	/*
390	 * Calculate reverse hash for string.
391	 */
392	hashval = HASHSEED;
393	for (i = stlen; i >= 0; i--) {
394		hashval = ((hashval << 5) + hashval) +
395		    str[i];			/* h = ((h * 33) + c) */
396	}
397
398	for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash;
399	    sthash = sthash->hi_next) {
400		const char	*hstr;
401
402		if (sthash->hi_hashval != hashval)
403			continue;
404
405		hstr = &sthash->hi_mstr->sm_str[sthash->hi_mstr->sm_strlen -
406		    sthash->hi_strlen];
407		if (strcmp(str, hstr) == 0)
408			break;
409	}
410
411	/*
412	 * Did we find the string?
413	 */
414	if (sthash == 0)
415		return (-1);
416
417	/*
418	 * Has this string been copied into the string table?
419	 */
420	mstr = sthash->hi_mstr;
421	if (mstr->sm_stroff == 0) {
422		size_t	mstrlen = mstr->sm_strlen + 1;
423
424		mstr->sm_stroff = stp->st_nextoff;
425
426		/*
427		 * Have we overflowed our assigned buffer?
428		 */
429		if ((mstr->sm_stroff + mstrlen) > stp->st_fullstrsize)
430			return (-1);
431
432		(void) memcpy(stp->st_strbuf + mstr->sm_stroff,
433		    mstr->sm_str, mstrlen);
434		stp->st_nextoff += mstrlen;
435	}
436
437	/*
438	 * Calculate offset of (sub)string.
439	 */
440	*stoff = mstr->sm_stroff + mstr->sm_strlen - sthash->hi_strlen;
441
442	return (0);
443}
444
445
446static int
447st_hash_insert(Str_tbl *stp, const char *str, size_t len)
448{
449	int		i;
450	uint_t		hashval = HASHSEED;
451	uint_t		bckcnt = stp->st_hbckcnt;
452	Str_hash	**hashbcks = stp->st_hashbcks;
453	Str_hash	*sthash;
454	Str_master	*mstr = 0;
455
456	/*
457	 * We use a classic 'Bernstein k=33' hash function.  But
458	 * instead of hashing from the start of the string to the
459	 * end, we do it in reverse.
460	 *
461	 * This way - we are essentially building all of the
462	 * suffix hashvalues as we go.  We can check to see if
463	 * any suffixes already exist in the tree as we generate
464	 * the hash.
465	 */
466	for (i = len; i >= 0; i--) {
467		hashval = ((hashval << 5) + hashval) +
468		    str[i];			/* h = ((h * 33) + c) */
469
470		for (sthash = hashbcks[hashval % bckcnt];
471		    sthash; sthash = sthash->hi_next) {
472			const char	*hstr;
473			Str_master	*_mstr;
474
475			if (sthash->hi_hashval != hashval)
476				continue;
477
478			_mstr = sthash->hi_mstr;
479			hstr = &_mstr->sm_str[_mstr->sm_strlen -
480			    sthash->hi_strlen];
481
482			if (strcmp(&str[i], hstr))
483				continue;
484
485			if (i == 0) {
486				/*
487				 * Entry already in table, increment refcnt and
488				 * get out.
489				 */
490				sthash->hi_refcnt++;
491				return (0);
492			} else {
493				/*
494				 * If this 'suffix' is presently a 'master
495				 * string, then take over it's record.
496				 */
497				if (sthash->hi_strlen == _mstr->sm_strlen) {
498					/*
499					 * we should only do this once.
500					 */
501					assert(mstr == 0);
502					mstr = _mstr;
503				}
504			}
505		}
506	}
507
508	/*
509	 * Do we need a new master string, or can we take over
510	 * one we already found in the table?
511	 */
512	if (mstr == 0) {
513		/*
514		 * allocate a new master string
515		 */
516		if ((mstr = calloc(sizeof (Str_hash), 1)) == 0)
517			return (-1);
518		mstr->sm_next = stp->st_mstrlist;
519		stp->st_mstrlist = mstr;
520		stp->st_strsize += len + 1;
521	} else {
522		/*
523		 * We are taking over a existing master string, the string size
524		 * only increments by the difference between the current string
525		 * and the previous master.
526		 */
527		assert(len > mstr->sm_strlen);
528		stp->st_strsize += len - mstr->sm_strlen;
529	}
530
531	if ((sthash = calloc(sizeof (Str_hash), 1)) == 0)
532		return (-1);
533
534	mstr->sm_hashval = sthash->hi_hashval = hashval;
535	mstr->sm_strlen = sthash->hi_strlen = len;
536	mstr->sm_str = str;
537	sthash->hi_refcnt = 1;
538	sthash->hi_mstr = mstr;
539
540	/*
541	 * Insert string element into head of hash list
542	 */
543	hashval = hashval % bckcnt;
544	sthash->hi_next = hashbcks[hashval];
545	hashbcks[hashval] = sthash;
546	return (0);
547}
548
549/*
550 * Return amount of space required for the string table.
551 */
552size_t
553st_getstrtab_sz(Str_tbl *stp)
554{
555	assert(stp->st_fullstrsize > 0);
556
557	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
558		stp->st_flags |= FLG_STTAB_COOKED;
559		return (stp->st_fullstrsize);
560	}
561
562	if ((stp->st_flags & FLG_STTAB_COOKED) == 0) {
563		LenNode		*lnp;
564		void		*cookie;
565
566		stp->st_flags |= FLG_STTAB_COOKED;
567		/*
568		 * allocate a hash table about the size of # of
569		 * strings input.
570		 */
571		stp->st_hbckcnt = findprime(stp->st_strcnt);
572		if ((stp->st_hashbcks =
573		    calloc(sizeof (Str_hash), stp->st_hbckcnt)) == NULL)
574			return (0);
575
576		/*
577		 * We now walk all of the strings in the list, from shortest to
578		 * longest, and insert them into the hashtable.
579		 */
580		if ((lnp = avl_first(stp->st_lentree)) == NULL) {
581			/*
582			 * Is it possible we have an empty string table, if so,
583			 * the table still contains '\0', so return the size.
584			 */
585			if (avl_numnodes(stp->st_lentree) == 0) {
586				assert(stp->st_strsize == 1);
587				return (stp->st_strsize);
588			}
589			return (0);
590		}
591
592		while (lnp) {
593			StrNode	*snp;
594
595			/*
596			 * Walk the string lists and insert them into the hash
597			 * list.  Once a string is inserted we no longer need
598			 * it's entry, so the string can be freed.
599			 */
600			for (snp = avl_first(lnp->ln_strtree); snp;
601			    snp = AVL_NEXT(lnp->ln_strtree, snp)) {
602				if (st_hash_insert(stp, snp->sn_str,
603				    lnp->ln_strlen) == -1)
604					return (0);
605			}
606
607			/*
608			 * Now that the strings have been copied, walk the
609			 * StrNode tree and free all the AVL nodes.  Note,
610			 * avl_destroy_nodes() beats avl_remove() as the
611			 * latter balances the nodes as they are removed.
612			 * We just want to tear the whole thing down fast.
613			 */
614			cookie = NULL;
615			while ((snp = avl_destroy_nodes(lnp->ln_strtree,
616			    &cookie)) != NULL)
617				free(snp);
618			avl_destroy(lnp->ln_strtree);
619			free(lnp->ln_strtree);
620			lnp->ln_strtree = NULL;
621
622			/*
623			 * Move on to the next LenNode.
624			 */
625			lnp = AVL_NEXT(stp->st_lentree, lnp);
626		}
627
628		/*
629		 * Now that all of the strings have been freed, walk the
630		 * LenNode tree and free all of the AVL nodes.  Note,
631		 * avl_destroy_nodes() beats avl_remove() as the latter
632		 * balances the nodes as they are removed. We just want to
633		 * tear the whole thing down fast.
634		 */
635		cookie = NULL;
636		while ((lnp = avl_destroy_nodes(stp->st_lentree,
637		    &cookie)) != NULL)
638			free(lnp);
639		avl_destroy(stp->st_lentree);
640		free(stp->st_lentree);
641		stp->st_lentree = 0;
642	}
643
644	assert(stp->st_strsize > 0);
645	assert(stp->st_fullstrsize >= stp->st_strsize);
646
647	return (stp->st_strsize);
648}
649
650/*
651 * Associate a buffer with a string table.
652 */
653const char *
654st_getstrbuf(Str_tbl *stp)
655{
656	return (stp->st_strbuf);
657}
658
659int
660st_setstrbuf(Str_tbl *stp, char *stbuf, size_t bufsize)
661{
662	assert(stp->st_flags & FLG_STTAB_COOKED);
663
664	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
665		if (bufsize < stp->st_fullstrsize)
666			return (-1);
667	} else {
668		if (bufsize < stp->st_strsize)
669			return (-1);
670	}
671
672	stp->st_strbuf = stbuf;
673#ifdef	DEBUG
674	/*
675	 * for debug builds - start with a stringtable filled in
676	 * with '0xff'.  This makes it very easy to find wholes
677	 * which we failed to fill in - in the strtab.
678	 */
679	memset(stbuf, 0xff, bufsize);
680	stbuf[0] = '\0';
681#else
682	memset(stbuf, 0x0, bufsize);
683#endif
684	return (0);
685}
686