xref: /onnv-gate/usr/src/cmd/ssh/libssh/common/g11n.c (revision 3109:393fae270b2d)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
52628Sjp161948  * Common Development and Distribution License (the "License").
62628Sjp161948  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  *
212628Sjp161948  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
220Sstevel@tonic-gate  * Use is subject to license terms.
230Sstevel@tonic-gate  */
240Sstevel@tonic-gate 
250Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
260Sstevel@tonic-gate 
270Sstevel@tonic-gate #include <errno.h>
280Sstevel@tonic-gate #include <locale.h>
290Sstevel@tonic-gate #include <langinfo.h>
300Sstevel@tonic-gate #include <iconv.h>
310Sstevel@tonic-gate #include <ctype.h>
320Sstevel@tonic-gate #include <strings.h>
330Sstevel@tonic-gate #include <string.h>
340Sstevel@tonic-gate #include <stdio.h>
350Sstevel@tonic-gate #include <stdlib.h>
360Sstevel@tonic-gate #include "includes.h"
370Sstevel@tonic-gate #include "xmalloc.h"
380Sstevel@tonic-gate #include "xlist.h"
390Sstevel@tonic-gate 
400Sstevel@tonic-gate #ifdef MIN
410Sstevel@tonic-gate #undef MIN
420Sstevel@tonic-gate #endif /* MIN */
430Sstevel@tonic-gate 
442705Sjp161948 #define	MIN(x, y)	((x) < (y) ? (x) : (y))
450Sstevel@tonic-gate 
462705Sjp161948 #define	LOCALE_PATH	"/usr/bin/locale"
470Sstevel@tonic-gate 
/* two-char language code, '-' and two-char country code */
492705Sjp161948 #define	LANGTAG_MAX	5
500Sstevel@tonic-gate 
512705Sjp161948 static uchar_t *do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf,
522705Sjp161948     uint_t len, uint_t *outlen, int *err, uchar_t **err_str);
530Sstevel@tonic-gate 
540Sstevel@tonic-gate static int locale_cmp(const void *d1, const void *d2);
550Sstevel@tonic-gate static char *g11n_locale2langtag(char *locale);
560Sstevel@tonic-gate 
572705Sjp161948 uint_t g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str);
582705Sjp161948 uint_t g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str);
590Sstevel@tonic-gate 
602705Sjp161948 static char *
610Sstevel@tonic-gate g11n_locale2langtag(char *locale)
620Sstevel@tonic-gate {
632705Sjp161948 	char *langtag;
640Sstevel@tonic-gate 
652705Sjp161948 	/* base cases */
662705Sjp161948 	if (!locale || !*locale)
672705Sjp161948 		return (NULL);
680Sstevel@tonic-gate 
692705Sjp161948 	if (strcmp(locale, "POSIX") == 0 || strcmp(locale, "C") == 0)
702705Sjp161948 		return ("i-default");
710Sstevel@tonic-gate 
722705Sjp161948 	/* punt for language codes which are not exactly 2 letters */
732705Sjp161948 	if (strlen(locale) < 2 ||
742705Sjp161948 	    !isalpha(locale[0]) ||
752705Sjp161948 	    !isalpha(locale[1]) ||
762705Sjp161948 	    (locale[2] != '\0' &&
772705Sjp161948 	    locale[2] != '_' &&
782705Sjp161948 	    locale[2] != '.' &&
792705Sjp161948 	    locale[2] != '@'))
802705Sjp161948 		return (NULL);
810Sstevel@tonic-gate 
820Sstevel@tonic-gate 
832705Sjp161948 	/* we have a primary language sub-tag */
842705Sjp161948 	langtag = (char *)xmalloc(LANGTAG_MAX + 1);
850Sstevel@tonic-gate 
862705Sjp161948 	strncpy(langtag, locale, 2);
872705Sjp161948 	langtag[2] = '\0';
880Sstevel@tonic-gate 
892705Sjp161948 	/* do we have country sub-tag? For example: cs_CZ */
902705Sjp161948 	if (locale[2] == '_') {
912705Sjp161948 		if (strlen(locale) < 5 ||
922705Sjp161948 		    !isalpha(locale[3]) ||
932705Sjp161948 		    !isalpha(locale[4]) ||
942705Sjp161948 		    (locale[5] != '\0' && (locale[5] != '.' &&
952705Sjp161948 		    locale[5] != '@'))) {
962705Sjp161948 			return (langtag);
972705Sjp161948 		}
982705Sjp161948 
992705Sjp161948 		/* example: create cs-CZ from cs_CZ */
1002705Sjp161948 		if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale, 2,
1012705Sjp161948 		    locale + 3) == 5)
1022705Sjp161948 			return (langtag);
1030Sstevel@tonic-gate 	}
1040Sstevel@tonic-gate 
1052705Sjp161948 	/* in all other cases we just use the primary language sub-tag */
1062705Sjp161948 	return (langtag);
1070Sstevel@tonic-gate }
1080Sstevel@tonic-gate 
/*
 * Return non-zero when langtag is exactly "i-default", and zero
 * otherwise (including when langtag is NULL).
 */
uint_t
g11n_langtag_is_default(char *langtag)
{
	if (langtag == NULL)
		return (0);

	return (strcmp(langtag, "i-default") == 0);
}
1140Sstevel@tonic-gate 
/*
 * Match a language tag against a locale name.
 *
 * This lang tag / locale matching function works only for two-character
 * language primary sub-tags and two-character country sub-tags.
 *
 * Return values:
 *	0	no match (also when either argument is NULL or malformed)
 *	1	"i-default" against the C/POSIX locale, or the languages
 *		match but the country parts are missing on one side,
 *		malformed, or different
 *	2	the languages match and either neither side has a country
 *		part or both country parts match
 */
uint_t
g11n_langtag_matches_locale(char *langtag, char *locale)
{
	int locale_has_country;

	if (langtag == NULL || locale == NULL)
		return (0);

	/* match "i-default" to the process' current locale if possible */
	if (g11n_langtag_is_default(langtag)) {
		if (strcasecmp(locale, "POSIX") == 0 ||
		    strcasecmp(locale, "C") == 0)
			return (1);
		return (0);
	}

	/*
	 * locale must be at least 2 chars long and the lang part must be
	 * exactly two characters.  <ctype.h> functions require an
	 * unsigned char value, hence the casts.
	 */
	if (strlen(locale) < 2 ||
	    !isalpha((unsigned char)locale[0]) ||
	    !isalpha((unsigned char)locale[1]) ||
	    (locale[2] != '\0' && locale[2] != '_' &&
	    locale[2] != '.' && locale[2] != '@'))
		return (0);

	/* same thing with the langtag */
	if (strlen(langtag) < 2 ||
	    !isalpha((unsigned char)langtag[0]) ||
	    !isalpha((unsigned char)langtag[1]) ||
	    (langtag[2] != '\0' && langtag[2] != '-'))
		return (0);

	/* primary language sub-tag and the locale's language part must match */
	if (strncasecmp(langtag, locale, 2) != 0)
		return (0);

	/*
	 * The languages match, now fuzzy check the country part.  After
	 * the checks above locale[2] is one of '\0', '_', '.', '@', so
	 * only '_' can introduce a country code.
	 */
	locale_has_country = (locale[2] == '_');

	/* langtag has only one sub-tag... */
	if (langtag[2] == '\0')
		return (locale_has_country ? 1 : 2);

	/* locale has no country code... */
	if (!locale_has_country)
		return (1);

	/* langtag has more than one subtag and the locale has a country code */

	/* ignore second subtag if not two chars */
	if (strlen(langtag) < 5)
		return (1);

	if (!isalpha((unsigned char)langtag[3]) ||
	    !isalpha((unsigned char)langtag[4]) ||
	    (langtag[5] != '\0' && langtag[5] != '-'))
		return (1);

	/* ignore rest of locale if there is no two-character country part */
	if (strlen(locale) < 5)
		return (1);

	if (!isalpha((unsigned char)locale[3]) ||
	    !isalpha((unsigned char)locale[4]) ||
	    (locale[5] != '\0' && locale[5] != '.' && locale[5] != '@'))
		return (1);

	/* if the country part matches, return 2 */
	if (strncasecmp(&langtag[3], &locale[3], 2) == 0)
		return (2);

	return (1);
}
1930Sstevel@tonic-gate 
1940Sstevel@tonic-gate char *
1950Sstevel@tonic-gate g11n_getlocale()
1960Sstevel@tonic-gate {
1972705Sjp161948 	/* we have one text domain - always set it */
1982705Sjp161948 	(void) textdomain(TEXT_DOMAIN);
1990Sstevel@tonic-gate 
2002705Sjp161948 	/* if the locale is not set, set it from the env vars */
2012705Sjp161948 	if (!setlocale(LC_MESSAGES, NULL))
2022705Sjp161948 		(void) setlocale(LC_MESSAGES, "");
2030Sstevel@tonic-gate 
2042705Sjp161948 	return (setlocale(LC_MESSAGES, NULL));
2050Sstevel@tonic-gate }
2060Sstevel@tonic-gate 
2070Sstevel@tonic-gate void
2080Sstevel@tonic-gate g11n_setlocale(int category, const char *locale)
2090Sstevel@tonic-gate {
2102705Sjp161948 	char *curr;
2110Sstevel@tonic-gate 
2122705Sjp161948 	/* we have one text domain - always set it */
2132705Sjp161948 	(void) textdomain(TEXT_DOMAIN);
2140Sstevel@tonic-gate 
2152705Sjp161948 	if (!locale)
2162705Sjp161948 		return;
2170Sstevel@tonic-gate 
2182705Sjp161948 	if (*locale && ((curr = setlocale(category, NULL))) &&
2192705Sjp161948 	    strcmp(curr, locale) == 0)
2202705Sjp161948 		return;
2212628Sjp161948 
2222705Sjp161948 	/* if <category> is bogus, setlocale() will do nothing */
2232705Sjp161948 	(void) setlocale(category, locale);
2240Sstevel@tonic-gate }
2250Sstevel@tonic-gate 
2260Sstevel@tonic-gate char **
2270Sstevel@tonic-gate g11n_getlocales()
2280Sstevel@tonic-gate {
2292705Sjp161948 	FILE *locale_out;
2302705Sjp161948 	uint_t n_elems, list_size, long_line = 0;
2312705Sjp161948 	char **list;
2322705Sjp161948 	char locale[64];	/* 64 bytes is plenty for locale names */
2332705Sjp161948 
2342705Sjp161948 	if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL)
2352705Sjp161948 		return (NULL);
2360Sstevel@tonic-gate 
2372705Sjp161948 	/*
2382705Sjp161948 	 * start with enough room for 65 locales - that's a lot fewer than
2392705Sjp161948 	 * all the locales available for installation, but a lot more than
2402705Sjp161948 	 * what most users will need and install
2412705Sjp161948 	 */
2422705Sjp161948 	n_elems = 0;
2432705Sjp161948 	list_size = 192;
2442705Sjp161948 	list = (char **) xmalloc(sizeof (char *) * (list_size + 1));
2452705Sjp161948 	memset(list, 0, sizeof (char *) * (list_size + 1));
2460Sstevel@tonic-gate 
2472705Sjp161948 	while (fgets(locale, sizeof (locale), locale_out)) {
2482705Sjp161948 		/* skip long locale names (if any) */
2492705Sjp161948 		if (!strchr(locale, '\n')) {
2502705Sjp161948 			long_line = 1;
2512705Sjp161948 			continue;
2522705Sjp161948 		} else if (long_line) {
2532705Sjp161948 			long_line = 0;
2542705Sjp161948 			continue;
2552705Sjp161948 		}
2560Sstevel@tonic-gate 
2572705Sjp161948 		if (strncmp(locale, "iso_8859", 8) == 0)
2582705Sjp161948 			/* ignore locale names like "iso_8859-1" */
2592705Sjp161948 			continue;
2600Sstevel@tonic-gate 
2612705Sjp161948 		if (n_elems == list_size) {
2622705Sjp161948 			list_size *= 2;
2632705Sjp161948 			list = (char **)xrealloc((void *) list,
2642705Sjp161948 			    (list_size + 1) * sizeof (char *));
2652705Sjp161948 			memset(&list[n_elems + 1], 0,
2662705Sjp161948 			    sizeof (char *) * (list_size - n_elems + 1));
2672705Sjp161948 		}
2682705Sjp161948 
2692705Sjp161948 		*(strchr(locale, '\n')) = '\0';	/* remove the trailing \n */
2702705Sjp161948 		list[n_elems++] = xstrdup(locale);
2710Sstevel@tonic-gate 	}
2720Sstevel@tonic-gate 
273*3109Sjp161948 	if (n_elems == 0)
274*3109Sjp161948 		return (NULL);
275*3109Sjp161948 
2762705Sjp161948 	list[n_elems] = NULL;
2772705Sjp161948 	(void) pclose(locale_out);
2780Sstevel@tonic-gate 
2792705Sjp161948 	qsort(list, n_elems - 1, sizeof (char *), locale_cmp);
2802705Sjp161948 	return (list);
2810Sstevel@tonic-gate }
2820Sstevel@tonic-gate 
/*
 * Determine the language tag(s) to use for this process.
 *
 * A set SSH_LANGS environment variable takes precedence and is returned
 * as a copy.  Otherwise the tag is derived from the current locale;
 * when there is no usable locale name the result is a copy of
 * "i-default".
 */
char *
g11n_getlangs()
{
	char *env_langs;
	char *locale;

	env_langs = getenv("SSH_LANGS");
	if (env_langs != NULL)
		return (xstrdup(env_langs));

	locale = g11n_getlocale();
	if (locale == NULL || *locale == '\0')
		return (xstrdup("i-default"));

	return (g11n_locale2langtag(locale));
}
2980Sstevel@tonic-gate 
/*
 * Convert a NULL-terminated array of locale names into a
 * comma-separated list of language tags.  Locales that have no language
 * tag are skipped and each tag is listed once, in order of first
 * appearance.
 *
 * Returns the string built by xjoin().  The temporary tag array is
 * released before returning.
 */
char *
g11n_locales2langs(char **locale_set)
{
	char **p, **r, **q;
	char *langtag, *langs;
	int locales, n_tags, dup;

	for (locales = 0, p = locale_set; p && *p; p++)
		locales++;

	/* zero-filled, so r is NULL-terminated at every step */
	r = (char **)xmalloc((locales + 1) * sizeof (char *));
	memset(r, 0, (locales + 1) * sizeof (char *));
	n_tags = 0;

	for (p = locale_set; p && *p; p++) {
		if ((langtag = g11n_locale2langtag(*p)) == NULL)
			continue;

		dup = 0;
		for (q = r; *q != NULL; q++) {
			if (strcmp(*q, langtag) == 0) {
				dup = 1;
				break;
			}
		}

		if (!dup) {
			/* at most one tag per locale, so n_tags <= locales */
			r[n_tags++] = langtag;
		} else if (!g11n_langtag_is_default(langtag)) {
			/*
			 * The default tag may be handed back as a string
			 * literal, so it is the one tag never freed here.
			 */
			xfree(langtag);
		}
	}

	langs = xjoin(r, ',');

	/* the joined string is independent of r; drop the temporaries */
	for (q = r; *q != NULL; q++) {
		if (!g11n_langtag_is_default(*q))
			xfree(*q);
	}
	xfree(r);

	return (langs);
}
3290Sstevel@tonic-gate 
/*
 * qsort() comparison callback for arrays of C strings: both arguments
 * point at a string pointer and the strings are ordered with strcmp().
 */
static int
sortcmp(const void *d1, const void *d2)
{
	const char *const *lhs = d1;
	const char *const *rhs = d2;

	return (strcmp(*lhs, *rhs));
}
3380Sstevel@tonic-gate 
/*
 * Compare two language tags sub-tag by sub-tag (case-sensitively).
 *
 * Return values:
 *	0	the primary sub-tags differ
 *	1	the primary sub-tags are equal, but only one tag has a
 *		second sub-tag or the second sub-tags differ
 *	2	the primary sub-tags are equal and the second sub-tags are
 *		either both absent or equal
 *
 * Anything after the second sub-tag is ignored.
 */
int
g11n_langtag_match(char *langtag1, char *langtag2)
{
	const char *rest1, *rest2;
	size_t n1, n2;

	/* length of the primary sub-tag: up to the first '-' or the end */
	n1 = strcspn(langtag1, "-");
	n2 = strcspn(langtag2, "-");

	/* no match */
	if (n1 != n2 || strncmp(langtag1, langtag2, n1) != 0)
		return (0);

	/* each of these now points at either '-' or the terminator */
	rest1 = langtag1 + n1;
	rest2 = langtag2 + n2;

	/* no country sub-tags - exact match */
	if (*rest1 == '\0' && *rest2 == '\0')
		return (2);

	/* one langtag has a country sub-tag, the other doesn't */
	if (*rest1 == '\0' || *rest2 == '\0')
		return (1);

	/* both have a country sub-tag: step over the '-' and compare */
	rest1++;
	rest2++;

	n1 = strcspn(rest1, "-");
	n2 = strcspn(rest2, "-");

	if (n1 == n2 && strncmp(rest1, rest2, n1) == 0)
		return (2);	/* country tags matched - exact match */

	return (1);
}
3880Sstevel@tonic-gate 
3890Sstevel@tonic-gate char *
3900Sstevel@tonic-gate g11n_langtag_set_intersect(char *set1, char *set2)
3910Sstevel@tonic-gate {
3922705Sjp161948 	char **list1, **list2, **list3, **p, **q, **r;
3932705Sjp161948 	char *set3, *lang_subtag;
3942705Sjp161948 	uint_t n1, n2, n3;
3952705Sjp161948 	uint_t do_append;
3962705Sjp161948 
3972705Sjp161948 	list1 = xsplit(set1, ',');
3982705Sjp161948 	list2 = xsplit(set2, ',');
3990Sstevel@tonic-gate 
4002705Sjp161948 	for (n1 = 0, p = list1; p && *p; p++, n1++)
4012705Sjp161948 		;
4022705Sjp161948 	for (n2 = 0, p = list2; p && *p; p++, n2++)
4032705Sjp161948 		;
4040Sstevel@tonic-gate 
4052705Sjp161948 	list3 = (char **) xmalloc(sizeof (char *) * (n1 + n2 + 1));
4062705Sjp161948 	*list3 = NULL;
4070Sstevel@tonic-gate 
4082705Sjp161948 	/*
4092705Sjp161948 	 * we must not sort the user langtags - sorting or not the server's
4102705Sjp161948 	 * should not affect the outcome
4112705Sjp161948 	 */
4122705Sjp161948 	qsort(list2, n2, sizeof (char *), sortcmp);
4130Sstevel@tonic-gate 
4142705Sjp161948 	for (n3 = 0, p = list1; p && *p; p++) {
4152705Sjp161948 		do_append = 0;
4162705Sjp161948 		for (q = list2; q && *q; q++) {
4172705Sjp161948 			if (g11n_langtag_match(*p, *q) != 2) continue;
4182705Sjp161948 			/* append element */
4192705Sjp161948 			for (r = list3; (r - list3) <= (n1 + n2); r++) {
4202705Sjp161948 				do_append = 1;
4212705Sjp161948 				if (!*r)
4222705Sjp161948 					break;
4232705Sjp161948 				if (strcmp(*p, *r) == 0) {
4242705Sjp161948 					do_append = 0;
4252705Sjp161948 					break;
4262705Sjp161948 				}
4272705Sjp161948 			}
4282705Sjp161948 			if (do_append && n3 <= (n1 + n2)) {
4292705Sjp161948 				list3[n3++] = xstrdup(*p);
4302705Sjp161948 				list3[n3] = NULL;
4312705Sjp161948 			}
4320Sstevel@tonic-gate 		}
4330Sstevel@tonic-gate 	}
4342705Sjp161948 
4352705Sjp161948 	for (p = list1; p && *p; p++) {
4362705Sjp161948 		do_append = 0;
4372705Sjp161948 		for (q = list2; q && *q; q++) {
4382705Sjp161948 			if (g11n_langtag_match(*p, *q) != 1)
4392705Sjp161948 				continue;
4400Sstevel@tonic-gate 
4412705Sjp161948 			/* append element */
4422705Sjp161948 			lang_subtag = xstrdup(*p);
4432705Sjp161948 			if (strchr(lang_subtag, '-'))
4442705Sjp161948 				*(strchr(lang_subtag, '-')) = '\0';
4452705Sjp161948 			for (r = list3; (r - list3) <= (n1 + n2); r++) {
4462705Sjp161948 				do_append = 1;
4472705Sjp161948 				if (!*r)
4482705Sjp161948 					break;
4492705Sjp161948 				if (strcmp(lang_subtag, *r) == 0) {
4502705Sjp161948 					do_append = 0;
4512705Sjp161948 					break;
4522705Sjp161948 				}
4532705Sjp161948 			}
4542705Sjp161948 			if (do_append && n3 <= (n1 + n2)) {
4552705Sjp161948 				list3[n3++] = lang_subtag;
4562705Sjp161948 				list3[n3] = NULL;
4572705Sjp161948 			} else
4582705Sjp161948 				xfree(lang_subtag);
4590Sstevel@tonic-gate 		}
4600Sstevel@tonic-gate 	}
4610Sstevel@tonic-gate 
4622705Sjp161948 	set3 = xjoin(list3, ',');
4632705Sjp161948 	xfree_split_list(list1);
4642705Sjp161948 	xfree_split_list(list2);
4652705Sjp161948 	xfree_split_list(list3);
4660Sstevel@tonic-gate 
4672705Sjp161948 	return (set3);
4680Sstevel@tonic-gate }
4690Sstevel@tonic-gate 
/*
 * Pick the language tag a client should use: the first entry of the
 * intersection of the client's and the server's language tag sets.
 *
 * Returns a copy made with xstrdup(), or NULL when the intersection is
 * empty.
 */
char *
g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags)
{
	char *list, *result = NULL;
	char **xlist;

	/* g11n_langtag_set_intersect uses xmalloc - should not return NULL */
	list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags);

	if (!list)
		return (NULL);

	xlist = xsplit(list, ',');

	xfree(list);

	if (xlist == NULL)
		return (NULL);

	if (*xlist != NULL)
		result = xstrdup(*xlist);

	/* release the split list on the empty-intersection path too */
	xfree_split_list(xlist);

	return (result);
}
4940Sstevel@tonic-gate 
/*
 * Report whether a locale name carries the UTF-8 codeset, i.e. whether
 * its first '.' is followed by "UTF-8" and then the end of the name or
 * an '@' modifier.
 */
static int
locale_is_utf8(const char *name)
{
	const char *dot = strchr(name, '.');

	if (dot == NULL || strncmp(dot + 1, "UTF-8", 5) != 0)
		return (0);

	/* the five characters matched, so dot[6] is inside the string */
	return (dot[6] == '\0' || dot[6] == '@');
}

/* Report whether a locale name is one of "C", "POSIX" or "common". */
static int
locale_is_default(const char *name)
{
	return (strcmp(name, "C") == 0 || strcmp(name, "POSIX") == 0 ||
	    strcmp(name, "common") == 0);
}

/*
 * qsort() callback that compares locales, preferring UTF-8 codesets to
 * others and any locale to the default ones, otherwise doing a straight
 * strcmp().
 */
static int
locale_cmp(const void *d1, const void *d2)
{
	const char *s1 = *(char *const *)d1;
	const char *s2 = *(char *const *)d2;
	int utf8_1 = locale_is_utf8(s1);
	int utf8_2 = locale_is_utf8(s2);
	int dflt_1, dflt_2;

	/* prefer UTF-8 locales */
	if (utf8_1 != utf8_2)
		return (utf8_1 ? -1 : 1);

	/* prefer any locale over the default locales */
	dflt_1 = locale_is_default(s1);
	dflt_2 = locale_is_default(s2);
	if (dflt_1 != dflt_2)
		return (dflt_1 ? 1 : -1);

	return (strcmp(s1, s2));
}
5470Sstevel@tonic-gate 
5480Sstevel@tonic-gate 
5490Sstevel@tonic-gate char **
5502705Sjp161948 g11n_langtag_set_locale_set_intersect(char *langtag_set, char **locale_set)
5510Sstevel@tonic-gate {
5522705Sjp161948 	char **langtag_list, **result, **p, **q, **r;
5532705Sjp161948 	char *s;
5542705Sjp161948 	uint_t do_append, n_langtags, n_locales, n_results, max_results;
5552705Sjp161948 
5562705Sjp161948 	/* count lang tags and locales */
5572705Sjp161948 	for (n_locales = 0, p = locale_set; p && *p; p++)
5582705Sjp161948 		n_locales++;
5590Sstevel@tonic-gate 
5602705Sjp161948 	n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0;
5612705Sjp161948 	/* count the number of langtags */
5622705Sjp161948 	for (; s = strchr(s, ','); s++, n_langtags++)
5632705Sjp161948 		;
5642705Sjp161948 
5652705Sjp161948 	qsort(locale_set, n_locales, sizeof (char *), locale_cmp);
5660Sstevel@tonic-gate 
5672705Sjp161948 	langtag_list = xsplit(langtag_set, ',');
5682705Sjp161948 	for (n_langtags = 0, p = langtag_list; p && *p; p++, n_langtags++)
5692705Sjp161948 		;
5700Sstevel@tonic-gate 
5712705Sjp161948 	max_results = MIN(n_locales, n_langtags) * 2;
5722705Sjp161948 	result = (char **) xmalloc(sizeof (char *) * (max_results + 1));
5732705Sjp161948 	*result = NULL;
5742705Sjp161948 	n_results = 0;
5750Sstevel@tonic-gate 
5762705Sjp161948 	/* more specific matches first */
5772705Sjp161948 	for (p = langtag_list; p && *p; p++) {
5782705Sjp161948 		do_append = 0;
5792705Sjp161948 		for (q = locale_set; q && *q; q++) {
5802705Sjp161948 			if (g11n_langtag_matches_locale(*p, *q) == 2) {
5812705Sjp161948 				do_append = 1;
5822705Sjp161948 				for (r = result; (r - result) <=
5832705Sjp161948 				    MIN(n_locales, n_langtags); r++) {
5842705Sjp161948 					if (!*r)
5852705Sjp161948 						break;
5862705Sjp161948 					if (strcmp(*q, *r) == 0) {
5872705Sjp161948 						do_append = 0;
5882705Sjp161948 						break;
5892705Sjp161948 					}
5902705Sjp161948 				}
5912705Sjp161948 				if (do_append && n_results < max_results) {
5922705Sjp161948 					result[n_results++] = xstrdup(*q);
5932705Sjp161948 					result[n_results] = NULL;
5942705Sjp161948 				}
5952705Sjp161948 				break;
5962705Sjp161948 			}
5970Sstevel@tonic-gate 		}
5980Sstevel@tonic-gate 	}
5990Sstevel@tonic-gate 
6002705Sjp161948 	for (p = langtag_list; p && *p; p++) {
6012705Sjp161948 		do_append = 0;
6022705Sjp161948 		for (q = locale_set; q && *q; q++) {
6032705Sjp161948 			if (g11n_langtag_matches_locale(*p, *q) == 1) {
6042705Sjp161948 				do_append = 1;
6052705Sjp161948 				for (r = result; (r - result) <=
6062705Sjp161948 				    MIN(n_locales, n_langtags); r++) {
6072705Sjp161948 					if (!*r)
6082705Sjp161948 						break;
6092705Sjp161948 					if (strcmp(*q, *r) == 0) {
6102705Sjp161948 						do_append = 0;
6112705Sjp161948 						break;
6122705Sjp161948 					}
6132705Sjp161948 				}
6142705Sjp161948 				if (do_append && n_results < max_results) {
6152705Sjp161948 					result[n_results++] = xstrdup(*q);
6162705Sjp161948 					result[n_results] = NULL;
6172705Sjp161948 				}
6182705Sjp161948 				break;
6192705Sjp161948 			}
6200Sstevel@tonic-gate 		}
6210Sstevel@tonic-gate 	}
6220Sstevel@tonic-gate 
6232705Sjp161948 	xfree_split_list(langtag_list);
6242705Sjp161948 
6252705Sjp161948 	return (result);
6260Sstevel@tonic-gate }
6270Sstevel@tonic-gate 
/*
 * Pick the server locale that best matches the client's language tags.
 *
 * srvr_locales is the NULL-terminated list of candidate locales; when
 * it is NULL the list is obtained from g11n_getlocales() and released
 * again before returning.
 *
 * Returns an xstrdup() copy of the best match, or NULL when no locale
 * matches.
 */
char *
g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales)
{
	char **results, **own_locales = NULL, *result = NULL;

	if (srvr_locales == NULL)
		srvr_locales = own_locales = g11n_getlocales();

	results = g11n_langtag_set_locale_set_intersect(clnt_langtags,
	    srvr_locales);

	/* results holds copies, so our own locale list can go right away */
	if (own_locales != NULL)
		xfree_split_list(own_locales);

	if (results == NULL)
		return (NULL);

	if (*results != NULL)
		result = xstrdup(*results);

	xfree_split_list(results);

	return (result);
}
6440Sstevel@tonic-gate 
6450Sstevel@tonic-gate 
6460Sstevel@tonic-gate /*
6470Sstevel@tonic-gate  * Functions for validating ASCII and UTF-8 strings
6480Sstevel@tonic-gate  *
6490Sstevel@tonic-gate  * The error_str parameter is an optional pointer to a char variable
6500Sstevel@tonic-gate  * where to store a string suitable for use with error() or fatal() or
6510Sstevel@tonic-gate  * friends.
6520Sstevel@tonic-gate  *
6530Sstevel@tonic-gate  * The return value is 0 if success, EILSEQ or EINVAL.
6540Sstevel@tonic-gate  *
6550Sstevel@tonic-gate  */
/*
 * Check that str consists of 7-bit characters only.
 *
 * len is the number of bytes to check; 0 means "up to the terminating
 * NUL".  Exactly len bytes are examined.  error_str is currently not
 * written to.
 *
 * Returns 0 when all bytes are 7-bit, and EILSEQ when an 8-bit byte is
 * found or when a NUL byte shows up before len bytes were seen.  A NULL
 * str is only accepted together with len == 0.
 */
uint_t
g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str)
{
	const uchar_t *p = (const uchar_t *)str;
	uint_t i;

	if (p == NULL)
		return (len != 0 ? EILSEQ : 0);

	if (len == 0)
		len = strlen(str);

	for (i = 0; i < len; i++) {
		/* an early NUL means the string is shorter than claimed */
		if (p[i] == '\0' || (p[i] & 0x80))
			return (EILSEQ);
	}

	return (0);
}
6690Sstevel@tonic-gate 
/*
 * Check that str is well-formed UTF-8.
 *
 * len is the number of bytes to check; 0 means "up to the terminating
 * NUL".  Checking stops early, successfully, at a NUL byte.  error_str
 * is currently not written to.
 *
 * Returns EILSEQ for a stray continuation byte, an invalid lead byte
 * (0xfe, 0xff), a sequence that is truncated or has a bad continuation
 * byte, an overlong encoding, a UTF-16 surrogate (U+D800..U+DFFF), or
 * U+FFFE / U+FFFF.  Returns 0 otherwise, and also for a NULL str.
 *
 * NOTE(review): 5- and 6-byte sequences are still accepted here;
 * RFC 3629 limits UTF-8 to 4 bytes and U+10FFFF - confirm whether
 * callers want the stricter rule.
 */
uint_t
g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str)
{
	/* smallest code point that needs a sequence of the given length */
	static const uint_t min_cp[] = {
		0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
	};
	const uchar_t *p, *end;
	uint_t c, l, i;

	if (str == NULL)
		return (0);

	if (len == 0)
		len = strlen((const char *)str);
	end = str + len;

	for (p = str; p < end && *p != '\0'; p += l) {
		/* 7-bit chars are fine */
		if (!(*p & 0x80)) {
			l = 1;
			continue;
		}

		/*
		 * 8-bit chars begin a UTF-8 sequence: get the sequence
		 * length and the payload bits of the lead byte.
		 */
		if (*p < 0xc0) {
			return (EILSEQ);
		} else if (*p < 0xe0) {
			l = 2;
			c = *p & 0x1f;
		} else if (*p < 0xf0) {
			l = 3;
			c = *p & 0x0f;
		} else if (*p < 0xf8) {
			l = 4;
			c = *p & 0x07;
		} else if (*p < 0xfc) {
			l = 5;
			c = *p & 0x03;
		} else if (*p < 0xfe) {
			l = 6;
			c = *p & 0x01;
		} else {
			return (EILSEQ);
		}

		/* the whole sequence must fit; ending at the end is fine */
		if (l > (uint_t)(end - p))
			return (EILSEQ);

		/* build the code point from the continuation bytes */
		for (i = 1; i < l; i++) {
			if ((p[i] & 0xc0) != 0x80)
				return (EILSEQ);
			c = (c << 6) | (p[i] & 0x3f);
		}

		/* overlong detection */
		if (c < min_cp[l])
			return (EILSEQ);

		/* UTF-16 surrogates and other illegal code points */
		if ((c >= 0xd800 && c <= 0xdfff) ||
		    c == 0xfffe || c == 0xffff)
			return (EILSEQ);
	}

	return (0);
}
7750Sstevel@tonic-gate 
/*
 * Functions for converting to ASCII or UTF-8 from the local codeset
 * Functions for converting from ASCII or UTF-8 to the local codeset
 *
 * The error_str parameter is an optional pointer to a uchar_t *
 * variable in which to store a string suitable for use with error()
 * or fatal() or friends.
 *
 * The err_ptr parameter is an optional pointer to an integer in which
 * an errno value such as EILSEQ or EINVAL is stored on failure; it is
 * not written on success.
 *
 * These functions return NULL if the conversion fails.
 *
 */
7902705Sjp161948 uchar_t *
7912705Sjp161948 g11n_convert_from_ascii(const char *str, int *err_ptr, uchar_t **error_str)
7920Sstevel@tonic-gate {
7932705Sjp161948 	static uint_t initialized = 0;
7942705Sjp161948 	static uint_t do_convert = 0;
7952705Sjp161948 	iconv_t cd;
7962705Sjp161948 	int err;
7970Sstevel@tonic-gate 
7982705Sjp161948 	if (!initialized) {
7992705Sjp161948 		/*
8002705Sjp161948 		 * iconv_open() fails if the to/from codesets are the
8012705Sjp161948 		 * same, and there are aliases of codesets to boot...
8022705Sjp161948 		 */
8032705Sjp161948 		if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
8042705Sjp161948 			strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
8052705Sjp161948 			strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
8062705Sjp161948 			initialized = 1;
8072705Sjp161948 			do_convert = 0;
8082705Sjp161948 		} else {
8092705Sjp161948 			cd = iconv_open(nl_langinfo(CODESET), "646");
8102705Sjp161948 			if (cd == (iconv_t)-1) {
8112705Sjp161948 				if (err_ptr)
8122705Sjp161948 					*err_ptr = errno;
8132705Sjp161948 				if (error_str)
8142705Sjp161948 					*error_str = (uchar_t *)"Cannot "
8152705Sjp161948 					    "convert ASCII strings to the local"
8162705Sjp161948 					    " codeset";
8172705Sjp161948 			}
8182705Sjp161948 			initialized = 1;
8192705Sjp161948 			do_convert = 1;
8202705Sjp161948 		}
8210Sstevel@tonic-gate 	}
8222705Sjp161948 
8232705Sjp161948 	if (!do_convert) {
8242705Sjp161948 		if ((err = g11n_validate_ascii(str, 0, error_str))) {
8252705Sjp161948 			if (err_ptr)
8262705Sjp161948 				*err_ptr = err;
8272705Sjp161948 			return (NULL);
8282705Sjp161948 		} else
8292705Sjp161948 			return ((uchar_t *)xstrdup(str));
8300Sstevel@tonic-gate 	}
8310Sstevel@tonic-gate 
8322705Sjp161948 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
8330Sstevel@tonic-gate }
8340Sstevel@tonic-gate 
8352705Sjp161948 uchar_t *
8362705Sjp161948 g11n_convert_from_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str)
8370Sstevel@tonic-gate {
8382705Sjp161948 	static uint_t initialized = 0;
8392705Sjp161948 	static uint_t do_convert = 0;
8402705Sjp161948 	iconv_t cd;
8412705Sjp161948 	int err;
8420Sstevel@tonic-gate 
8432705Sjp161948 	if (!initialized) {
8442705Sjp161948 		/*
8452705Sjp161948 		 * iconv_open() fails if the to/from codesets are the
8462705Sjp161948 		 * same, and there are aliases of codesets to boot...
8472705Sjp161948 		 */
8482705Sjp161948 		if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
8492705Sjp161948 		    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
8502705Sjp161948 			initialized = 1;
8512705Sjp161948 			do_convert = 0;
8522705Sjp161948 		} else {
8532705Sjp161948 			cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
8542705Sjp161948 			if (cd == (iconv_t)-1) {
8552705Sjp161948 				if (err_ptr)
8562705Sjp161948 					*err_ptr = errno;
8572705Sjp161948 				if (error_str)
8582705Sjp161948 					*error_str = (uchar_t *)"Cannot "
8592705Sjp161948 					    "convert UTF-8 strings to the "
8602705Sjp161948 					    "local codeset";
8612705Sjp161948 			}
8622705Sjp161948 			initialized = 1;
8632705Sjp161948 			do_convert = 1;
8642705Sjp161948 		}
8650Sstevel@tonic-gate 	}
8662705Sjp161948 
8672705Sjp161948 	if (!do_convert) {
8682705Sjp161948 		if ((err = g11n_validate_utf8(str, 0, error_str))) {
8692705Sjp161948 			if (err_ptr)
8702705Sjp161948 				*err_ptr = err;
8712705Sjp161948 			return (NULL);
8722705Sjp161948 		} else
8732705Sjp161948 			return ((uchar_t *)xstrdup((char *)str));
8740Sstevel@tonic-gate 	}
8750Sstevel@tonic-gate 
8762705Sjp161948 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
8770Sstevel@tonic-gate }
8780Sstevel@tonic-gate 
8790Sstevel@tonic-gate char *
8802705Sjp161948 g11n_convert_to_ascii(const uchar_t *str, int *err_ptr, uchar_t **error_str)
8810Sstevel@tonic-gate {
8822705Sjp161948 	static uint_t initialized = 0;
8832705Sjp161948 	static uint_t do_convert = 0;
8842705Sjp161948 	iconv_t cd;
8850Sstevel@tonic-gate 
8862705Sjp161948 	if (!initialized) {
8872705Sjp161948 		/*
8882705Sjp161948 		 * iconv_open() fails if the to/from codesets are the
8892705Sjp161948 		 * same, and there are aliases of codesets to boot...
8902705Sjp161948 		 */
8912705Sjp161948 		if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
8922705Sjp161948 		    strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
8932705Sjp161948 		    strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
8942705Sjp161948 			initialized = 1;
8952705Sjp161948 			do_convert = 0;
8962705Sjp161948 		} else {
8972705Sjp161948 			cd = iconv_open("646", nl_langinfo(CODESET));
8982705Sjp161948 			if (cd == (iconv_t)-1) {
8992705Sjp161948 				if (err_ptr)
9002705Sjp161948 					*err_ptr = errno;
9012705Sjp161948 				if (error_str)
9022705Sjp161948 					*error_str = (uchar_t *)"Cannot "
9032705Sjp161948 					    "convert UTF-8 strings to the "
9042705Sjp161948 					    "local codeset";
9052705Sjp161948 			}
9062705Sjp161948 			initialized = 1;
9072705Sjp161948 			do_convert = 1;
9082705Sjp161948 		}
9090Sstevel@tonic-gate 	}
9100Sstevel@tonic-gate 
9112705Sjp161948 	if (!do_convert)
9122705Sjp161948 		return (xstrdup((char *)str));
9132705Sjp161948 
9142705Sjp161948 	return ((char *)do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
9150Sstevel@tonic-gate }
9160Sstevel@tonic-gate 
9172705Sjp161948 uchar_t *
9182705Sjp161948 g11n_convert_to_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str)
9190Sstevel@tonic-gate {
9202705Sjp161948 	static uint_t initialized = 0;
9212705Sjp161948 	static uint_t do_convert = 0;
9222705Sjp161948 	iconv_t cd;
9230Sstevel@tonic-gate 
9242705Sjp161948 	if (!initialized) {
9252705Sjp161948 		/*
9262705Sjp161948 		 * iconv_open() fails if the to/from codesets are the
9272705Sjp161948 		 * same, and there are aliases of codesets to boot...
9282705Sjp161948 		 */
9292705Sjp161948 		if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
9302705Sjp161948 		    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
9312705Sjp161948 			initialized = 1;
9322705Sjp161948 			do_convert = 0;
9332705Sjp161948 		} else {
9342705Sjp161948 			cd = iconv_open("UTF-8", nl_langinfo(CODESET));
9352705Sjp161948 			if (cd == (iconv_t)-1) {
9362705Sjp161948 				if (err_ptr)
9372705Sjp161948 					*err_ptr = errno;
9382705Sjp161948 				if (error_str)
9392705Sjp161948 					*error_str = (uchar_t *)"Cannot "
9402705Sjp161948 					    "convert UTF-8 strings to the "
9412705Sjp161948 					    "local codeset";
9422705Sjp161948 			}
9432705Sjp161948 			initialized = 1;
9442705Sjp161948 			do_convert = 1;
9452705Sjp161948 		}
9460Sstevel@tonic-gate 	}
9470Sstevel@tonic-gate 
9482705Sjp161948 	if (!do_convert)
9492705Sjp161948 		return ((uchar_t *)xstrdup((char *)str));
9502705Sjp161948 
9512705Sjp161948 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
9520Sstevel@tonic-gate }
9530Sstevel@tonic-gate 
9540Sstevel@tonic-gate 
9550Sstevel@tonic-gate /*
9560Sstevel@tonic-gate  * Wrapper around iconv()
9570Sstevel@tonic-gate  *
9580Sstevel@tonic-gate  * The caller is responsible for freeing the result and for handling
9590Sstevel@tonic-gate  * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF).
9600Sstevel@tonic-gate  */
9612705Sjp161948 static uchar_t *
9622705Sjp161948 do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, uint_t len,
9632705Sjp161948     uint_t *outlen, int *err, uchar_t **err_str)
9642705Sjp161948 {
9652705Sjp161948 	size_t inbytesleft, outbytesleft, converted_size;
9662705Sjp161948 	char *outbuf;
9672705Sjp161948 	uchar_t *converted;
9682705Sjp161948 	const char *inbuf;
9692705Sjp161948 	uint_t mul = 0;
9700Sstevel@tonic-gate 
9712705Sjp161948 	if (!buf || !(*(char *)buf))
9722705Sjp161948 		return (NULL);
9732705Sjp161948 
9742705Sjp161948 	if (len == 0)
9752705Sjp161948 		len = strlen(buf);
9762705Sjp161948 
9772705Sjp161948 	/* reset conversion descriptor */
9782705Sjp161948 	/* XXX Do we need initial shift sequences for UTF-8??? */
9792705Sjp161948 	(void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft);
9802705Sjp161948 	inbuf = (const char *) buf;
9812705Sjp161948 
9822705Sjp161948 	if (mul_ptr)
9832705Sjp161948 		mul = *mul_ptr;
9842705Sjp161948 
9852705Sjp161948 	converted_size = (len << mul);
9862705Sjp161948 	outbuf = (char *)xmalloc(converted_size + 1); /* for null */
9872705Sjp161948 	converted = (uchar_t *)outbuf;
9882705Sjp161948 	outbytesleft = len;
9890Sstevel@tonic-gate 
9902705Sjp161948 	do {
9912705Sjp161948 		if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) ==
9922705Sjp161948 		    (size_t)-1) {
9932705Sjp161948 			if (errno == E2BIG) {
9942705Sjp161948 				/* UTF-8 codepoints are at most 8 bytes long */
9952705Sjp161948 				if (mul > 2) {
9962705Sjp161948 					if (err_str)
9972705Sjp161948 						*err_str = (uchar_t *)
9982705Sjp161948 						    "Conversion to UTF-8 failed"
9992705Sjp161948 						    " due to preposterous space"
10002705Sjp161948 						    " requirements";
10012705Sjp161948 					if (err)
10022705Sjp161948 						*err = EILSEQ;
10032705Sjp161948 					return (NULL);
10042705Sjp161948 				}
10050Sstevel@tonic-gate 
10062705Sjp161948 				/*
10072705Sjp161948 				 * re-alloc output and ensure that the outbuf
10082705Sjp161948 				 * and outbytesleft values are adjusted
10092705Sjp161948 				 */
10102705Sjp161948 				converted = xrealloc(converted,
10112705Sjp161948 				    converted_size << 1 + 1);
10122705Sjp161948 				outbuf = (char *)converted + converted_size -
10132705Sjp161948 				    outbytesleft;
10142705Sjp161948 				converted_size = (len << ++(mul));
10152705Sjp161948 				outbytesleft = converted_size - outbytesleft;
10162705Sjp161948 			} else {
10172705Sjp161948 				/*
10182705Sjp161948 				 * let the caller deal with iconv() errors,
10192705Sjp161948 				 * probably by calling fatal(); xfree() does
10202705Sjp161948 				 * not set errno
10212705Sjp161948 				 */
10222705Sjp161948 				if (err)
10232705Sjp161948 					*err = errno;
10242705Sjp161948 				xfree(converted);
10252705Sjp161948 				return (NULL);
10262705Sjp161948 			}
10272705Sjp161948 		}
10282705Sjp161948 	} while (inbytesleft);
10292705Sjp161948 
10302705Sjp161948 	*outbuf = '\0'; /* ensure null-termination */
10312705Sjp161948 	if (outlen)
10322705Sjp161948 		*outlen = converted_size - outbytesleft;
10332705Sjp161948 	if (mul_ptr)
10342705Sjp161948 		*mul_ptr = mul;
10352705Sjp161948 
10362705Sjp161948 	return (converted);
10370Sstevel@tonic-gate }
1038