xref: /onnv-gate/usr/src/cmd/sgs/tools/common/string_table.c (revision 5549:beb29939b34a)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
51618Srie  * Common Development and Distribution License (the "License").
61618Srie  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
211618Srie 
220Sstevel@tonic-gate /*
23*5549Srie  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
240Sstevel@tonic-gate  * Use is subject to license terms.
250Sstevel@tonic-gate  */
260Sstevel@tonic-gate 
270Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
280Sstevel@tonic-gate 
29*5549Srie #include <_string_table.h>
300Sstevel@tonic-gate #include <strings.h>
310Sstevel@tonic-gate #include <sgs.h>
320Sstevel@tonic-gate #include <stdio.h>
330Sstevel@tonic-gate 
340Sstevel@tonic-gate /*
35*5549Srie  * This file provides the interfaces to build a Str_tbl suitable for use by
36*5549Srie  * either the sgsmsg message system, or a standard ELF string table (SHT_STRTAB)
37*5549Srie  * as created by ld(1).
380Sstevel@tonic-gate  *
39*5549Srie  * There are two modes which can be used when constructing a string table:
400Sstevel@tonic-gate  *
410Sstevel@tonic-gate  *	st_new(0)
420Sstevel@tonic-gate  *		standard string table - no compression.  This is the
43*5549Srie  *		traditional, fast method.
440Sstevel@tonic-gate  *
45*5549Srie  *	st_new(FLG_STTAB_COMPRESS)
46*5549Srie  *		builds a compressed string table which both eliminates
47*5549Srie  *		duplicate strings, and permits strings with common suffixes
48*5549Srie  *		(atexit vs. exit) to overlap in the table.  This provides space
49*5549Srie  *		savings for many string tables.  Although more work than the
50*5549Srie  *		traditional method, the algorithms used are designed to scale
51*5549Srie  *		and keep any overhead at a minimum.
520Sstevel@tonic-gate  *
53*5549Srie  * These string tables are built with a common interface in a two-pass manner.
54*5549Srie  * The first pass finds all of the strings required for the string-table and
55*5549Srie  * calculates the size required for the final string table.
56*5549Srie  *
57*5549Srie  * The second pass allocates the string table, populates the strings into the
58*5549Srie  * table and returns the offsets the strings have been assigned.
590Sstevel@tonic-gate  *
600Sstevel@tonic-gate  * The calling sequence to build and populate a string table is:
610Sstevel@tonic-gate  *
620Sstevel@tonic-gate  *		st_new();		// initialize strtab
630Sstevel@tonic-gate  *
640Sstevel@tonic-gate  *		st_insert(st1);		// first pass of strings ...
650Sstevel@tonic-gate  *					// calculates size required for
660Sstevel@tonic-gate  *					// string table
670Sstevel@tonic-gate  *
680Sstevel@tonic-gate  *		st_delstring(st?);	// remove string previously
690Sstevel@tonic-gate  *					// inserted
700Sstevel@tonic-gate  *		st_insert(stN);
710Sstevel@tonic-gate  *
720Sstevel@tonic-gate  *		st_getstrtab_sz();	// freezes strtab and computes
730Sstevel@tonic-gate  *					// size of table.
740Sstevel@tonic-gate  *
750Sstevel@tonic-gate  *		st_setstrbuf();		// associates a final destination
760Sstevel@tonic-gate  *					// for the string table
770Sstevel@tonic-gate  *
780Sstevel@tonic-gate  *		st_setstring(st1);	// populate the string table
790Sstevel@tonic-gate  *		...			// offsets are based off of second
800Sstevel@tonic-gate  *					// pass	through the string table
810Sstevel@tonic-gate  *		st_setstring(stN);
820Sstevel@tonic-gate  *
830Sstevel@tonic-gate  *		st_destroy();		// tear down string table
840Sstevel@tonic-gate  *					// structures.
850Sstevel@tonic-gate  *
860Sstevel@tonic-gate  * String Suffix Compression Algorithm:
870Sstevel@tonic-gate  *
880Sstevel@tonic-gate  *   Here's a quick high level overview of the Suffix String
890Sstevel@tonic-gate  *   compression algorithm used.  First - the heart of the algorithm
900Sstevel@tonic-gate  *   is a Hash table list which represents a dictionary of all unique
910Sstevel@tonic-gate  *   strings inserted into the string table.  The hash function for
920Sstevel@tonic-gate  *   this table is a standard string hash except that the hash starts
930Sstevel@tonic-gate  *   at the last character in the string (&str[n - 1]) and works towards
940Sstevel@tonic-gate  *   the first character in the function (&str[0]).  As we compute the
950Sstevel@tonic-gate  *   HASH value for a given string, we also compute the hash values
960Sstevel@tonic-gate  *   for all of the possible suffix strings for that string.
970Sstevel@tonic-gate  *
980Sstevel@tonic-gate  *   As we compute the hash - at each character see if the current
990Sstevel@tonic-gate  *   suffix string for that hash is already present in the table.  If
1000Sstevel@tonic-gate  *   it is, and the string is a master string.  Then change that
1010Sstevel@tonic-gate  *   string to a suffix string of the new string being inserted.
1020Sstevel@tonic-gate  *
1030Sstevel@tonic-gate  *   When the final hash value is found (hash for str[0...n]), check
1040Sstevel@tonic-gate  *   to see if it is in the hash table - if so increment the reference
1050Sstevel@tonic-gate  *   count for the string.  If it is not yet in the table, insert a
1060Sstevel@tonic-gate  *   new hash table entry for a master string.
1070Sstevel@tonic-gate  *
1080Sstevel@tonic-gate  *   The above method will find all suffixes of a given string given
1090Sstevel@tonic-gate  *   that the strings are inserted from shortest to longest.  That is
1100Sstevel@tonic-gate  *   why this is a two phase method, we first collect all of the
111*5549Srie  *   strings and store them based off of their length in an AVL tree.
1120Sstevel@tonic-gate  *   Once all of the strings have been submitted we then start the
1130Sstevel@tonic-gate  *   hash table build by traversing the AVL tree in order and
1140Sstevel@tonic-gate  *   inserting the strings from shortest to longest as described
1150Sstevel@tonic-gate  *   above.
1160Sstevel@tonic-gate  */
1170Sstevel@tonic-gate 
1180Sstevel@tonic-gate /* LINTLIBRARY */
1190Sstevel@tonic-gate 
120*5549Srie static int
121*5549Srie avl_len_compare(const void *n1, const void *n2)
1220Sstevel@tonic-gate {
123*5549Srie 	uint_t	len1, len2;
124*5549Srie 
125*5549Srie 	len1 = ((LenNode *)n1)->ln_strlen;
126*5549Srie 	len2 = ((LenNode *)n2)->ln_strlen;
1270Sstevel@tonic-gate 
128*5549Srie 	if (len1 == len2)
1290Sstevel@tonic-gate 		return (0);
130*5549Srie 	if (len2 < len1)
1310Sstevel@tonic-gate 		return (1);
1320Sstevel@tonic-gate 	return (-1);
1330Sstevel@tonic-gate }
1340Sstevel@tonic-gate 
135*5549Srie static int
136*5549Srie avl_str_compare(const void *n1, const void *n2)
137*5549Srie {
138*5549Srie 	const char	*str1, *str2;
139*5549Srie 	int		rc;
140*5549Srie 
141*5549Srie 	str1 = ((StrNode *)n1)->sn_str;
142*5549Srie 	str2 = ((StrNode *)n2)->sn_str;
143*5549Srie 
144*5549Srie 	rc = strcmp(str1, str2);
145*5549Srie 	if (rc > 0)
146*5549Srie 		return (1);
147*5549Srie 	if (rc < 0)
148*5549Srie 		return (-1);
149*5549Srie 	return (0);
150*5549Srie }
151*5549Srie 
1520Sstevel@tonic-gate /*
153*5549Srie  * Return an initialized Str_tbl - returns NULL on failure.
1540Sstevel@tonic-gate  *
155*5549Srie  * flags:
156*5549Srie  *	FLG_STTAB_COMPRESS - build a compressed string table
1570Sstevel@tonic-gate  */
1580Sstevel@tonic-gate Str_tbl *
159*5549Srie st_new(uint_t flags)
1600Sstevel@tonic-gate {
1610Sstevel@tonic-gate 	Str_tbl	*stp;
1620Sstevel@tonic-gate 
163*5549Srie 	if ((stp = calloc(sizeof (Str_tbl), 1)) == NULL)
164*5549Srie 		return (NULL);
1650Sstevel@tonic-gate 
1660Sstevel@tonic-gate 	/*
1670Sstevel@tonic-gate 	 * Start with a leading '\0' - it's tradition.
1680Sstevel@tonic-gate 	 */
169*5549Srie 	stp->st_strsize = stp->st_fullstrsize = stp->st_nextoff = 1;
1700Sstevel@tonic-gate 
1710Sstevel@tonic-gate 	/*
172*5549Srie 	 * Do we compress this string table?
1730Sstevel@tonic-gate 	 */
174*5549Srie 	stp->st_flags = flags;
175*5549Srie 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
1760Sstevel@tonic-gate 		return (stp);
1770Sstevel@tonic-gate 
178*5549Srie 	if ((stp->st_lentree = calloc(sizeof (avl_tree_t), 1)) == NULL)
179*5549Srie 		return (NULL);
180*5549Srie 
181*5549Srie 	avl_create(stp->st_lentree, &avl_len_compare, sizeof (LenNode),
182*5549Srie 	    SGSOFFSETOF(LenNode, ln_avlnode));
183*5549Srie 
184*5549Srie 	return (stp);
185*5549Srie }
186*5549Srie 
187*5549Srie /*
188*5549Srie  * Insert a new string into the Str_tbl.  There are two AVL trees used.
189*5549Srie  *
190*5549Srie  *  .	The first LenNode AVL tree maintains a tree of nodes based on string
191*5549Srie  *	sizes.
192*5549Srie  *  .	Each LenNode maintains a StrNode AVL tree for each string.  Large
193*5549Srie  *	applications have been known to contribute thousands of strings of
194*5549Srie  *	the same size.  Should strings need to be removed (-z ignore), then
195*5549Srie  *	the string AVL tree makes this removal efficient and scalable.
196*5549Srie  */
197*5549Srie int
198*5549Srie st_insert(Str_tbl *stp, const char *str)
199*5549Srie {
200*5549Srie 	uint_t		len;
201*5549Srie 	StrNode		*snp, sn = { 0 };
202*5549Srie 	LenNode		*lnp, ln = { 0 };
203*5549Srie 	avl_index_t	where;
204*5549Srie 
205*5549Srie 	/*
206*5549Srie 	 * String table can't have been cooked
207*5549Srie 	 */
208*5549Srie 	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
209*5549Srie 
210*5549Srie 	/*
211*5549Srie 	 * Null strings always point to the head of the string
212*5549Srie 	 * table - no reason to keep searching.
213*5549Srie 	 */
214*5549Srie 	if ((len = (uint_t)strlen(str)) == 0)
2150Sstevel@tonic-gate 		return (0);
216*5549Srie 
217*5549Srie 	stp->st_fullstrsize += len + 1;
218*5549Srie 	stp->st_strcnt++;
219*5549Srie 
220*5549Srie 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
221*5549Srie 		return (0);
222*5549Srie 
223*5549Srie 	/*
224*5549Srie 	 * From the controlling string table, determine which LenNode AVL node
225*5549Srie 	 * provides for this string length.  If the node doesn't exist, insert
226*5549Srie 	 * a new node to represent this string length.
227*5549Srie 	 */
228*5549Srie 	ln.ln_strlen = len;
229*5549Srie 	if ((lnp = avl_find(stp->st_lentree, &ln, &where)) == NULL) {
230*5549Srie 		if ((lnp = calloc(sizeof (LenNode), 1)) == NULL)
231*5549Srie 			return (-1);
232*5549Srie 		lnp->ln_strlen = len;
233*5549Srie 		avl_insert(stp->st_lentree, lnp, where);
234*5549Srie 
235*5549Srie 		if ((lnp->ln_strtree = calloc(sizeof (avl_tree_t), 1)) == NULL)
236*5549Srie 			return (0);
237*5549Srie 
238*5549Srie 		avl_create(lnp->ln_strtree, &avl_str_compare, sizeof (StrNode),
239*5549Srie 		    SGSOFFSETOF(StrNode, sn_avlnode));
2400Sstevel@tonic-gate 	}
2410Sstevel@tonic-gate 
242*5549Srie 	/*
243*5549Srie 	 * From the string length AVL node determine whether a StrNode AVL node
244*5549Srie 	 * provides this string.  If the node doesn't exist, insert a new node
245*5549Srie 	 * to represent this string.
246*5549Srie 	 */
247*5549Srie 	sn.sn_str = str;
248*5549Srie 	if ((snp = avl_find(lnp->ln_strtree, &sn, &where)) == NULL) {
249*5549Srie 		if ((snp = calloc(sizeof (StrNode), 1)) == NULL)
250*5549Srie 			return (-1);
251*5549Srie 		snp->sn_str = str;
252*5549Srie 		avl_insert(lnp->ln_strtree, snp, where);
253*5549Srie 	}
254*5549Srie 	snp->sn_refcnt++;
255*5549Srie 
256*5549Srie 	return (0);
257*5549Srie }
258*5549Srie 
259*5549Srie /*
260*5549Srie  * Remove a previously inserted string from the Str_tbl.
261*5549Srie  */
262*5549Srie int
263*5549Srie st_delstring(Str_tbl *stp, const char *str)
264*5549Srie {
265*5549Srie 	uint_t		len;
266*5549Srie 	LenNode		*lnp, ln = { 0 };
267*5549Srie 	StrNode		*snp, sn = { 0 };
2680Sstevel@tonic-gate 
269*5549Srie 	/*
270*5549Srie 	 * String table can't have been cooked
271*5549Srie 	 */
272*5549Srie 	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
273*5549Srie 
274*5549Srie 	len = (uint_t)strlen(str);
275*5549Srie 	stp->st_fullstrsize -= len + 1;
276*5549Srie 
277*5549Srie 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
278*5549Srie 		return (0);
279*5549Srie 
280*5549Srie 	/*
281*5549Srie 	 * Determine which LenNode AVL node provides for this string length.
282*5549Srie 	 */
283*5549Srie 	ln.ln_strlen = len;
284*5549Srie 	if ((lnp = avl_find(stp->st_lentree, &ln, 0)) != NULL) {
285*5549Srie 		sn.sn_str = str;
286*5549Srie 		if ((snp = avl_find(lnp->ln_strtree, &sn, 0)) != NULL) {
287*5549Srie 			/*
288*5549Srie 			 * Reduce the reference count, and if zero remove the
289*5549Srie 			 * node.
290*5549Srie 			 */
291*5549Srie 			if (--snp->sn_refcnt == 0)
292*5549Srie 				avl_remove(lnp->ln_strtree, snp);
293*5549Srie 			return (0);
294*5549Srie 		}
295*5549Srie 	}
296*5549Srie 
297*5549Srie 	/*
298*5549Srie 	 * No strings of this length, or no string itself - someone goofed.
299*5549Srie 	 */
300*5549Srie 	return (-1);
3010Sstevel@tonic-gate }
3020Sstevel@tonic-gate 
3030Sstevel@tonic-gate /*
3040Sstevel@tonic-gate  * Tear down a String_Table structure.
3050Sstevel@tonic-gate  */
3060Sstevel@tonic-gate void
3070Sstevel@tonic-gate st_destroy(Str_tbl *stp)
3080Sstevel@tonic-gate {
3090Sstevel@tonic-gate 	Str_hash	*sthash, *psthash;
3100Sstevel@tonic-gate 	Str_master	*mstr, *pmstr;
3110Sstevel@tonic-gate 	uint_t		i;
3120Sstevel@tonic-gate 
3130Sstevel@tonic-gate 	/*
3140Sstevel@tonic-gate 	 * cleanup the master strings
3150Sstevel@tonic-gate 	 */
3160Sstevel@tonic-gate 	for (mstr = stp->st_mstrlist, pmstr = 0; mstr;
3170Sstevel@tonic-gate 	    mstr = mstr->sm_next) {
3180Sstevel@tonic-gate 		if (pmstr)
3190Sstevel@tonic-gate 			free(pmstr);
3200Sstevel@tonic-gate 		pmstr = mstr;
3210Sstevel@tonic-gate 	}
3220Sstevel@tonic-gate 	if (pmstr)
3230Sstevel@tonic-gate 		free(pmstr);
3240Sstevel@tonic-gate 
3250Sstevel@tonic-gate 	if (stp->st_hashbcks) {
3260Sstevel@tonic-gate 		for (i = 0; i < stp->st_hbckcnt; i++) {
3270Sstevel@tonic-gate 			for (sthash = stp->st_hashbcks[i], psthash = 0;
3280Sstevel@tonic-gate 			    sthash; sthash = sthash->hi_next) {
3290Sstevel@tonic-gate 				if (psthash)
3300Sstevel@tonic-gate 					free(psthash);
3310Sstevel@tonic-gate 				psthash = sthash;
3320Sstevel@tonic-gate 			}
3330Sstevel@tonic-gate 			if (psthash)
3340Sstevel@tonic-gate 				free(psthash);
3350Sstevel@tonic-gate 		}
3360Sstevel@tonic-gate 		free(stp->st_hashbcks);
3370Sstevel@tonic-gate 	}
3380Sstevel@tonic-gate 	free(stp);
3390Sstevel@tonic-gate }
3400Sstevel@tonic-gate 
3410Sstevel@tonic-gate 
3420Sstevel@tonic-gate /*
3430Sstevel@tonic-gate  * For a given string - copy it into the buffer associated with
3440Sstevel@tonic-gate  * the string table - and return the offset it has been assigned.
3450Sstevel@tonic-gate  *
3460Sstevel@tonic-gate  * If a value of '-1' is returned - the string was not found in
3470Sstevel@tonic-gate  * the Str_tbl.
3480Sstevel@tonic-gate  */
3490Sstevel@tonic-gate int
3500Sstevel@tonic-gate st_setstring(Str_tbl *stp, const char *str, uint_t *stoff)
3510Sstevel@tonic-gate {
3520Sstevel@tonic-gate 	uint_t		stlen;
3530Sstevel@tonic-gate 	uint_t		hashval;
3540Sstevel@tonic-gate 	Str_hash	*sthash;
3550Sstevel@tonic-gate 	Str_master	*mstr;
3560Sstevel@tonic-gate 	int		i;
3570Sstevel@tonic-gate 
3580Sstevel@tonic-gate 	/*
3590Sstevel@tonic-gate 	 * String table *must* have been previously cooked
3600Sstevel@tonic-gate 	 */
3610Sstevel@tonic-gate 	assert(stp->st_strbuf);
3620Sstevel@tonic-gate 
3630Sstevel@tonic-gate 	assert(stp->st_flags & FLG_STTAB_COOKED);
3640Sstevel@tonic-gate 	stlen = (uint_t)strlen(str);
3650Sstevel@tonic-gate 	/*
3660Sstevel@tonic-gate 	 * Null string always points to head of string table
3670Sstevel@tonic-gate 	 */
3680Sstevel@tonic-gate 	if (stlen == 0) {
3690Sstevel@tonic-gate 		*stoff = 0;
3700Sstevel@tonic-gate 		return (0);
3710Sstevel@tonic-gate 	}
3720Sstevel@tonic-gate 
3730Sstevel@tonic-gate 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
3740Sstevel@tonic-gate 		uint_t		_stoff;
3750Sstevel@tonic-gate 
3760Sstevel@tonic-gate 		stlen++;	/* count for trailing '\0' */
3770Sstevel@tonic-gate 		_stoff = stp->st_nextoff;
3780Sstevel@tonic-gate 		/*
3790Sstevel@tonic-gate 		 * Have we overflowed our assigned buffer?
3800Sstevel@tonic-gate 		 */
381*5549Srie 		if ((_stoff + stlen) > stp->st_fullstrsize)
3820Sstevel@tonic-gate 			return (-1);
3830Sstevel@tonic-gate 		memcpy(stp->st_strbuf + _stoff, str, stlen);
3840Sstevel@tonic-gate 		*stoff = _stoff;
3850Sstevel@tonic-gate 		stp->st_nextoff += stlen;
3860Sstevel@tonic-gate 		return (0);
3870Sstevel@tonic-gate 	}
3880Sstevel@tonic-gate 
3890Sstevel@tonic-gate 	/*
390*5549Srie 	 * Calculate reverse hash for string.
3910Sstevel@tonic-gate 	 */
3920Sstevel@tonic-gate 	hashval = HASHSEED;
3930Sstevel@tonic-gate 	for (i = stlen; i >= 0; i--) {
3940Sstevel@tonic-gate 		hashval = ((hashval << 5) + hashval) +
395*5549Srie 		    str[i];			/* h = ((h * 33) + c) */
3960Sstevel@tonic-gate 	}
3970Sstevel@tonic-gate 
3980Sstevel@tonic-gate 	for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash;
3990Sstevel@tonic-gate 	    sthash = sthash->hi_next) {
400*5549Srie 		const char	*hstr;
401*5549Srie 
402*5549Srie 		if (sthash->hi_hashval != hashval)
403*5549Srie 			continue;
4040Sstevel@tonic-gate 
405*5549Srie 		hstr = &sthash->hi_mstr->sm_str[sthash->hi_mstr->sm_strlen -
406*5549Srie 		    sthash->hi_strlen];
407*5549Srie 		if (strcmp(str, hstr) == 0)
408*5549Srie 			break;
4090Sstevel@tonic-gate 	}
4100Sstevel@tonic-gate 
4110Sstevel@tonic-gate 	/*
4120Sstevel@tonic-gate 	 * Did we find the string?
4130Sstevel@tonic-gate 	 */
4140Sstevel@tonic-gate 	if (sthash == 0)
4150Sstevel@tonic-gate 		return (-1);
4160Sstevel@tonic-gate 
4170Sstevel@tonic-gate 	/*
4180Sstevel@tonic-gate 	 * Has this string been copied into the string table?
4190Sstevel@tonic-gate 	 */
4200Sstevel@tonic-gate 	mstr = sthash->hi_mstr;
421*5549Srie 	if (mstr->sm_stroff == 0) {
422*5549Srie 		uint_t	mstrlen = mstr->sm_strlen + 1;
423*5549Srie 
424*5549Srie 		mstr->sm_stroff = stp->st_nextoff;
425*5549Srie 
4260Sstevel@tonic-gate 		/*
4270Sstevel@tonic-gate 		 * Have we overflowed our assigned buffer?
4280Sstevel@tonic-gate 		 */
429*5549Srie 		if ((mstr->sm_stroff + mstrlen) > stp->st_fullstrsize)
4300Sstevel@tonic-gate 			return (-1);
431*5549Srie 
432*5549Srie 		(void) memcpy(stp->st_strbuf + mstr->sm_stroff,
433*5549Srie 		    mstr->sm_str, mstrlen);
434*5549Srie 		stp->st_nextoff += mstrlen;
4350Sstevel@tonic-gate 	}
436*5549Srie 
4370Sstevel@tonic-gate 	/*
438*5549Srie 	 * Calculate offset of (sub)string.
4390Sstevel@tonic-gate 	 */
440*5549Srie 	*stoff = mstr->sm_stroff + mstr->sm_strlen - sthash->hi_strlen;
4410Sstevel@tonic-gate 
4420Sstevel@tonic-gate 	return (0);
4430Sstevel@tonic-gate }
4440Sstevel@tonic-gate 
4450Sstevel@tonic-gate 
4460Sstevel@tonic-gate static int
447*5549Srie st_hash_insert(Str_tbl *stp, const char *str, uint_t len)
4480Sstevel@tonic-gate {
4490Sstevel@tonic-gate 	int		i;
4500Sstevel@tonic-gate 	uint_t		hashval = HASHSEED;
4510Sstevel@tonic-gate 	uint_t		bckcnt = stp->st_hbckcnt;
4520Sstevel@tonic-gate 	Str_hash	**hashbcks = stp->st_hashbcks;
4530Sstevel@tonic-gate 	Str_hash	*sthash;
4540Sstevel@tonic-gate 	Str_master	*mstr = 0;
4550Sstevel@tonic-gate 
4560Sstevel@tonic-gate 	/*
4570Sstevel@tonic-gate 	 * We use a classic 'Bernstein k=33' hash function.  But
4580Sstevel@tonic-gate 	 * instead of hashing from the start of the string to the
4590Sstevel@tonic-gate 	 * end, we do it in reverse.
4600Sstevel@tonic-gate 	 *
4610Sstevel@tonic-gate 	 * This way - we are essentially building all of the
4620Sstevel@tonic-gate 	 * suffix hashvalues as we go.  We can check to see if
4630Sstevel@tonic-gate 	 * any suffixes already exist in the tree as we generate
4640Sstevel@tonic-gate 	 * the hash.
4650Sstevel@tonic-gate 	 */
466*5549Srie 	for (i = len; i >= 0; i--) {
467*5549Srie 		hashval = ((hashval << 5) + hashval) +
468*5549Srie 		    str[i];			/* h = ((h * 33) + c) */
4690Sstevel@tonic-gate 
4700Sstevel@tonic-gate 		for (sthash = hashbcks[hashval % bckcnt];
4710Sstevel@tonic-gate 		    sthash; sthash = sthash->hi_next) {
472*5549Srie 			const char	*hstr;
473*5549Srie 			Str_master	*_mstr;
4740Sstevel@tonic-gate 
475*5549Srie 			if (sthash->hi_hashval != hashval)
476*5549Srie 				continue;
477*5549Srie 
478*5549Srie 			_mstr = sthash->hi_mstr;
479*5549Srie 			hstr = &_mstr->sm_str[_mstr->sm_strlen -
480*5549Srie 			    sthash->hi_strlen];
481*5549Srie 
482*5549Srie 			if (strcmp(&str[i], hstr))
483*5549Srie 				continue;
4840Sstevel@tonic-gate 
485*5549Srie 			if (i == 0) {
486*5549Srie 				/*
487*5549Srie 				 * Entry already in table, increment refcnt and
488*5549Srie 				 * get out.
489*5549Srie 				 */
490*5549Srie 				sthash->hi_refcnt++;
491*5549Srie 				return (0);
492*5549Srie 			} else {
493*5549Srie 				/*
494*5549Srie 				 * If this 'suffix' is presently a 'master
495*5549Srie 				 * string, then take over it's record.
496*5549Srie 				 */
497*5549Srie 				if (sthash->hi_strlen == _mstr->sm_strlen) {
498*5549Srie 					/*
499*5549Srie 					 * we should only do this once.
500*5549Srie 					 */
501*5549Srie 					assert(mstr == 0);
502*5549Srie 					mstr = _mstr;
5030Sstevel@tonic-gate 				}
5040Sstevel@tonic-gate 			}
5050Sstevel@tonic-gate 		}
5060Sstevel@tonic-gate 	}
5070Sstevel@tonic-gate 
5080Sstevel@tonic-gate 	/*
5090Sstevel@tonic-gate 	 * Do we need a new master string, or can we take over
5100Sstevel@tonic-gate 	 * one we already found in the table?
5110Sstevel@tonic-gate 	 */
5120Sstevel@tonic-gate 	if (mstr == 0) {
5130Sstevel@tonic-gate 		/*
5140Sstevel@tonic-gate 		 * allocate a new master string
5150Sstevel@tonic-gate 		 */
5160Sstevel@tonic-gate 		if ((mstr = calloc(sizeof (Str_hash), 1)) == 0)
5170Sstevel@tonic-gate 			return (-1);
5180Sstevel@tonic-gate 		mstr->sm_next = stp->st_mstrlist;
5190Sstevel@tonic-gate 		stp->st_mstrlist = mstr;
520*5549Srie 		stp->st_strsize += len + 1;
5210Sstevel@tonic-gate 	} else {
5220Sstevel@tonic-gate 		/*
523*5549Srie 		 * We are taking over a existing master string, the string size
524*5549Srie 		 * only increments by the difference between the current string
525*5549Srie 		 * and the previous master.
5260Sstevel@tonic-gate 		 */
527*5549Srie 		assert(len > mstr->sm_strlen);
528*5549Srie 		stp->st_strsize += len - mstr->sm_strlen;
5290Sstevel@tonic-gate 	}
5300Sstevel@tonic-gate 
5310Sstevel@tonic-gate 	if ((sthash = calloc(sizeof (Str_hash), 1)) == 0)
5320Sstevel@tonic-gate 		return (-1);
5330Sstevel@tonic-gate 
5340Sstevel@tonic-gate 	mstr->sm_hashval = sthash->hi_hashval = hashval;
535*5549Srie 	mstr->sm_strlen = sthash->hi_strlen = len;
5360Sstevel@tonic-gate 	mstr->sm_str = str;
5370Sstevel@tonic-gate 	sthash->hi_refcnt = 1;
5380Sstevel@tonic-gate 	sthash->hi_mstr = mstr;
5390Sstevel@tonic-gate 
5400Sstevel@tonic-gate 	/*
5410Sstevel@tonic-gate 	 * Insert string element into head of hash list
5420Sstevel@tonic-gate 	 */
5430Sstevel@tonic-gate 	hashval = hashval % bckcnt;
5440Sstevel@tonic-gate 	sthash->hi_next = hashbcks[hashval];
5450Sstevel@tonic-gate 	hashbcks[hashval] = sthash;
5460Sstevel@tonic-gate 	return (0);
5470Sstevel@tonic-gate }
5480Sstevel@tonic-gate 
5490Sstevel@tonic-gate /*
5500Sstevel@tonic-gate  * Return amount of space required for the string table.
5510Sstevel@tonic-gate  */
5520Sstevel@tonic-gate uint_t
5530Sstevel@tonic-gate st_getstrtab_sz(Str_tbl *stp)
5540Sstevel@tonic-gate {
555*5549Srie 	assert(stp->st_fullstrsize > 0);
5560Sstevel@tonic-gate 
5570Sstevel@tonic-gate 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
5580Sstevel@tonic-gate 		stp->st_flags |= FLG_STTAB_COOKED;
559*5549Srie 		return (stp->st_fullstrsize);
5600Sstevel@tonic-gate 	}
5610Sstevel@tonic-gate 
5620Sstevel@tonic-gate 	if ((stp->st_flags & FLG_STTAB_COOKED) == 0) {
563*5549Srie 		LenNode		*lnp;
5640Sstevel@tonic-gate 		void		*cookie;
5650Sstevel@tonic-gate 
5660Sstevel@tonic-gate 		stp->st_flags |= FLG_STTAB_COOKED;
5670Sstevel@tonic-gate 		/*
5680Sstevel@tonic-gate 		 * allocate a hash table about the size of # of
5690Sstevel@tonic-gate 		 * strings input.
5700Sstevel@tonic-gate 		 */
571*5549Srie 		stp->st_hbckcnt = findprime(stp->st_strcnt);
5720Sstevel@tonic-gate 		if ((stp->st_hashbcks =
5730Sstevel@tonic-gate 		    calloc(sizeof (Str_hash), stp->st_hbckcnt)) == NULL)
5740Sstevel@tonic-gate 			return (0);
5750Sstevel@tonic-gate 
5760Sstevel@tonic-gate 		/*
577*5549Srie 		 * We now walk all of the strings in the list, from shortest to
578*5549Srie 		 * longest, and insert them into the hashtable.
5790Sstevel@tonic-gate 		 */
580*5549Srie 		if ((lnp = avl_first(stp->st_lentree)) == NULL) {
5810Sstevel@tonic-gate 			/*
582*5549Srie 			 * Is it possible we have an empty string table, if so,
583*5549Srie 			 * the table still contains '\0', so return the size.
5840Sstevel@tonic-gate 			 */
585*5549Srie 			if (avl_numnodes(stp->st_lentree) == 0) {
586*5549Srie 				assert(stp->st_strsize == 1);
587*5549Srie 				return (stp->st_strsize);
5880Sstevel@tonic-gate 			}
5890Sstevel@tonic-gate 			return (0);
5900Sstevel@tonic-gate 		}
591*5549Srie 
592*5549Srie 		while (lnp) {
593*5549Srie 			StrNode	*snp;
594*5549Srie 
595*5549Srie 			/*
596*5549Srie 			 * Walk the string lists and insert them into the hash
597*5549Srie 			 * list.  Once a string is inserted we no longer need
598*5549Srie 			 * it's entry, so the string can be freed.
599*5549Srie 			 */
600*5549Srie 			for (snp = avl_first(lnp->ln_strtree); snp;
601*5549Srie 			    snp = AVL_NEXT(lnp->ln_strtree, snp)) {
602*5549Srie 				if (st_hash_insert(stp, snp->sn_str,
603*5549Srie 				    lnp->ln_strlen) == -1)
604*5549Srie 					return (0);
605*5549Srie 			}
6060Sstevel@tonic-gate 
6070Sstevel@tonic-gate 			/*
608*5549Srie 			 * Now that the strings have been copied, walk the
609*5549Srie 			 * StrNode tree and free all the AVL nodes.  Note,
610*5549Srie 			 * avl_destroy_nodes() beats avl_remove() as the
611*5549Srie 			 * latter balances the nodes as they are removed.
612*5549Srie 			 * We just want to tear the whole thing down fast.
6130Sstevel@tonic-gate 			 */
614*5549Srie 			cookie = NULL;
615*5549Srie 			while ((snp = avl_destroy_nodes(lnp->ln_strtree,
616*5549Srie 			    &cookie)) != NULL)
617*5549Srie 				free(snp);
618*5549Srie 			avl_destroy(lnp->ln_strtree);
619*5549Srie 			free(lnp->ln_strtree);
620*5549Srie 			lnp->ln_strtree = NULL;
621*5549Srie 
622*5549Srie 			/*
623*5549Srie 			 * Move on to the next LenNode.
624*5549Srie 			 */
625*5549Srie 			lnp = AVL_NEXT(stp->st_lentree, lnp);
6260Sstevel@tonic-gate 		}
6270Sstevel@tonic-gate 
6280Sstevel@tonic-gate 		/*
629*5549Srie 		 * Now that all of the strings have been freed, walk the
630*5549Srie 		 * LenNode tree and free all of the AVL nodes.  Note,
631*5549Srie 		 * avl_destroy_nodes() beats avl_remove() as the latter
632*5549Srie 		 * balances the nodes as they are removed. We just want to
633*5549Srie 		 * tear the whole thing down fast.
6340Sstevel@tonic-gate 		 */
6350Sstevel@tonic-gate 		cookie = NULL;
636*5549Srie 		while ((lnp = avl_destroy_nodes(stp->st_lentree,
6370Sstevel@tonic-gate 		    &cookie)) != NULL)
638*5549Srie 			free(lnp);
639*5549Srie 		avl_destroy(stp->st_lentree);
640*5549Srie 		free(stp->st_lentree);
641*5549Srie 		stp->st_lentree = 0;
6420Sstevel@tonic-gate 	}
6430Sstevel@tonic-gate 
644*5549Srie 	assert(stp->st_strsize > 0);
645*5549Srie 	assert(stp->st_fullstrsize >= stp->st_strsize);
6460Sstevel@tonic-gate 
647*5549Srie 	return (stp->st_strsize);
6480Sstevel@tonic-gate }
6490Sstevel@tonic-gate 
6500Sstevel@tonic-gate /*
651*5549Srie  * Associate a buffer with a string table.
6520Sstevel@tonic-gate  */
6530Sstevel@tonic-gate const char *
6540Sstevel@tonic-gate st_getstrbuf(Str_tbl *stp)
6550Sstevel@tonic-gate {
6560Sstevel@tonic-gate 	return (stp->st_strbuf);
6570Sstevel@tonic-gate }
6580Sstevel@tonic-gate 
6590Sstevel@tonic-gate int
6600Sstevel@tonic-gate st_setstrbuf(Str_tbl *stp, char *stbuf, uint_t bufsize)
6610Sstevel@tonic-gate {
6620Sstevel@tonic-gate 	assert(stp->st_flags & FLG_STTAB_COOKED);
6630Sstevel@tonic-gate 
6640Sstevel@tonic-gate 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
665*5549Srie 		if (bufsize < stp->st_fullstrsize)
6660Sstevel@tonic-gate 			return (-1);
6670Sstevel@tonic-gate 	} else {
668*5549Srie 		if (bufsize < stp->st_strsize)
6690Sstevel@tonic-gate 			return (-1);
6700Sstevel@tonic-gate 	}
6710Sstevel@tonic-gate 
6720Sstevel@tonic-gate 	stp->st_strbuf = stbuf;
6730Sstevel@tonic-gate #ifdef	DEBUG
6740Sstevel@tonic-gate 	/*
6750Sstevel@tonic-gate 	 * for debug builds - start with a stringtable filled in
6760Sstevel@tonic-gate 	 * with '0xff'.  This makes it very easy to find wholes
6770Sstevel@tonic-gate 	 * which we failed to fill in - in the strtab.
6780Sstevel@tonic-gate 	 */
6790Sstevel@tonic-gate 	memset(stbuf, 0xff, bufsize);
6800Sstevel@tonic-gate 	stbuf[0] = '\0';
6810Sstevel@tonic-gate #else
6820Sstevel@tonic-gate 	memset(stbuf, 0x0, bufsize);
6830Sstevel@tonic-gate #endif
6840Sstevel@tonic-gate 	return (0);
6850Sstevel@tonic-gate }
686