xref: /onnv-gate/usr/src/cmd/ssh/libssh/common/g11n.c (revision 2705:a88063d9a249)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
52628Sjp161948  * Common Development and Distribution License (the "License").
62628Sjp161948  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  *
212628Sjp161948  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
220Sstevel@tonic-gate  * Use is subject to license terms.
230Sstevel@tonic-gate  */
240Sstevel@tonic-gate 
250Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
260Sstevel@tonic-gate 
270Sstevel@tonic-gate #include <errno.h>
280Sstevel@tonic-gate #include <locale.h>
290Sstevel@tonic-gate #include <langinfo.h>
300Sstevel@tonic-gate #include <iconv.h>
310Sstevel@tonic-gate #include <ctype.h>
320Sstevel@tonic-gate #include <strings.h>
330Sstevel@tonic-gate #include <string.h>
340Sstevel@tonic-gate #include <stdio.h>
350Sstevel@tonic-gate #include <stdlib.h>
360Sstevel@tonic-gate #include "includes.h"
370Sstevel@tonic-gate #include "xmalloc.h"
380Sstevel@tonic-gate #include "xlist.h"
390Sstevel@tonic-gate 
400Sstevel@tonic-gate #ifdef MIN
410Sstevel@tonic-gate #undef MIN
420Sstevel@tonic-gate #endif /* MIN */
430Sstevel@tonic-gate 
44*2705Sjp161948 #define	MIN(x, y)	((x) < (y) ? (x) : (y))
450Sstevel@tonic-gate 
46*2705Sjp161948 #define	LOCALE_PATH	"/usr/bin/locale"
470Sstevel@tonic-gate 
/* two-char language code, '-' and two-char country code */
49*2705Sjp161948 #define	LANGTAG_MAX	5
500Sstevel@tonic-gate 
51*2705Sjp161948 static uchar_t *do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf,
52*2705Sjp161948     uint_t len, uint_t *outlen, int *err, uchar_t **err_str);
530Sstevel@tonic-gate 
540Sstevel@tonic-gate static int locale_cmp(const void *d1, const void *d2);
550Sstevel@tonic-gate static char *g11n_locale2langtag(char *locale);
560Sstevel@tonic-gate 
57*2705Sjp161948 uint_t g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str);
58*2705Sjp161948 uint_t g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str);
590Sstevel@tonic-gate 
60*2705Sjp161948 static char *
610Sstevel@tonic-gate g11n_locale2langtag(char *locale)
620Sstevel@tonic-gate {
63*2705Sjp161948 	char *langtag;
640Sstevel@tonic-gate 
65*2705Sjp161948 	/* base cases */
66*2705Sjp161948 	if (!locale || !*locale)
67*2705Sjp161948 		return (NULL);
680Sstevel@tonic-gate 
69*2705Sjp161948 	if (strcmp(locale, "POSIX") == 0 || strcmp(locale, "C") == 0)
70*2705Sjp161948 		return ("i-default");
710Sstevel@tonic-gate 
72*2705Sjp161948 	/* punt for language codes which are not exactly 2 letters */
73*2705Sjp161948 	if (strlen(locale) < 2 ||
74*2705Sjp161948 	    !isalpha(locale[0]) ||
75*2705Sjp161948 	    !isalpha(locale[1]) ||
76*2705Sjp161948 	    (locale[2] != '\0' &&
77*2705Sjp161948 	    locale[2] != '_' &&
78*2705Sjp161948 	    locale[2] != '.' &&
79*2705Sjp161948 	    locale[2] != '@'))
80*2705Sjp161948 		return (NULL);
810Sstevel@tonic-gate 
820Sstevel@tonic-gate 
83*2705Sjp161948 	/* we have a primary language sub-tag */
84*2705Sjp161948 	langtag = (char *)xmalloc(LANGTAG_MAX + 1);
850Sstevel@tonic-gate 
86*2705Sjp161948 	strncpy(langtag, locale, 2);
87*2705Sjp161948 	langtag[2] = '\0';
880Sstevel@tonic-gate 
89*2705Sjp161948 	/* do we have country sub-tag? For example: cs_CZ */
90*2705Sjp161948 	if (locale[2] == '_') {
91*2705Sjp161948 		if (strlen(locale) < 5 ||
92*2705Sjp161948 		    !isalpha(locale[3]) ||
93*2705Sjp161948 		    !isalpha(locale[4]) ||
94*2705Sjp161948 		    (locale[5] != '\0' && (locale[5] != '.' &&
95*2705Sjp161948 		    locale[5] != '@'))) {
96*2705Sjp161948 			return (langtag);
97*2705Sjp161948 		}
98*2705Sjp161948 
99*2705Sjp161948 		/* example: create cs-CZ from cs_CZ */
100*2705Sjp161948 		if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale, 2,
101*2705Sjp161948 		    locale + 3) == 5)
102*2705Sjp161948 			return (langtag);
1030Sstevel@tonic-gate 	}
1040Sstevel@tonic-gate 
105*2705Sjp161948 	/* in all other cases we just use the primary language sub-tag */
106*2705Sjp161948 	return (langtag);
1070Sstevel@tonic-gate }
1080Sstevel@tonic-gate 
/*
 * Report whether langtag is the default language tag: returns 1 if it is
 * exactly "i-default" (case-sensitive), 0 otherwise.
 */
uint_t
g11n_langtag_is_default(char *langtag)
{
	if (strcmp(langtag, "i-default") != 0)
		return (0);

	return (1);
}
1140Sstevel@tonic-gate 
1150Sstevel@tonic-gate /*
1160Sstevel@tonic-gate  * This lang tag / locale matching function works only for two-character
1170Sstevel@tonic-gate  * language primary sub-tags and two-character country sub-tags.
1180Sstevel@tonic-gate  */
119*2705Sjp161948 uint_t
1200Sstevel@tonic-gate g11n_langtag_matches_locale(char *langtag, char *locale)
1210Sstevel@tonic-gate {
122*2705Sjp161948 	/* match "i-default" to the process' current locale if possible */
123*2705Sjp161948 	if (g11n_langtag_is_default(langtag)) {
124*2705Sjp161948 		if (strcasecmp(locale, "POSIX") == 0 ||
125*2705Sjp161948 		    strcasecmp(locale, "C") == 0)
126*2705Sjp161948 			return (1);
127*2705Sjp161948 		else
128*2705Sjp161948 			return (0);
129*2705Sjp161948 	}
1300Sstevel@tonic-gate 
131*2705Sjp161948 	/*
132*2705Sjp161948 	 * locale must be at least 2 chars long and the lang part must be
133*2705Sjp161948 	 * exactly two characters
134*2705Sjp161948 	 */
135*2705Sjp161948 	if (strlen(locale) < 2 ||
136*2705Sjp161948 	    (!isalpha(locale[0]) || !isalpha(locale[1]) ||
137*2705Sjp161948 	    (locale[2] != '\0' && locale[2] != '_' &&
138*2705Sjp161948 	    locale[2] != '.' && locale[2] != '@')))
139*2705Sjp161948 		return (0);
1400Sstevel@tonic-gate 
141*2705Sjp161948 	/* same thing with the langtag */
142*2705Sjp161948 	if (strlen(langtag) < 2 ||
143*2705Sjp161948 	    (!isalpha(langtag[0]) || !isalpha(langtag[1]) ||
144*2705Sjp161948 	    (langtag[2] != '\0' && langtag[2] != '-')))
145*2705Sjp161948 		return (0);
1460Sstevel@tonic-gate 
147*2705Sjp161948 	/* primary language sub-tag and the locale's language part must match */
148*2705Sjp161948 	if (strncasecmp(langtag, locale, 2) != 0)
149*2705Sjp161948 		return (0);
1500Sstevel@tonic-gate 
151*2705Sjp161948 	/*
152*2705Sjp161948 	 * primary language sub-tag and the locale's language match, now
153*2705Sjp161948 	 * fuzzy check country part
154*2705Sjp161948 	 */
1550Sstevel@tonic-gate 
156*2705Sjp161948 	/* neither langtag nor locale have more than one component */
157*2705Sjp161948 	if (langtag[2] == '\0' &&
158*2705Sjp161948 	    (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@'))
159*2705Sjp161948 		return (2);
1600Sstevel@tonic-gate 
161*2705Sjp161948 	/* langtag has only one sub-tag... */
162*2705Sjp161948 	if (langtag[2] == '\0')
163*2705Sjp161948 		return (1);
1640Sstevel@tonic-gate 
165*2705Sjp161948 	/* locale has no country code... */
166*2705Sjp161948 	if (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@')
167*2705Sjp161948 		return (1);
168*2705Sjp161948 
169*2705Sjp161948 	/* langtag has more than one subtag and the locale has a country code */
1700Sstevel@tonic-gate 
171*2705Sjp161948 	/* ignore second subtag if not two chars */
172*2705Sjp161948 	if (strlen(langtag) < 5)
173*2705Sjp161948 		return (1);
1740Sstevel@tonic-gate 
175*2705Sjp161948 	if (!isalpha(langtag[3]) || !isalpha(langtag[4]) ||
176*2705Sjp161948 	    (langtag[5] != '\0' && langtag[5] != '-'))
177*2705Sjp161948 		return (1);
1780Sstevel@tonic-gate 
179*2705Sjp161948 	/* ignore rest of locale if there is no two-character country part */
180*2705Sjp161948 	if (strlen(locale) < 5)
181*2705Sjp161948 		return (1);
1820Sstevel@tonic-gate 
183*2705Sjp161948 	if (locale[2] != '_' || !isalpha(locale[3]) || !isalpha(locale[4]) ||
184*2705Sjp161948 	    (locale[5] != '\0' && locale[5] != '.' && locale[5] != '@'))
185*2705Sjp161948 		return (1);
1860Sstevel@tonic-gate 
187*2705Sjp161948 	/* if the country part matches, return 2 */
188*2705Sjp161948 	if (strncasecmp(&langtag[3], &locale[3], 2) == 0)
189*2705Sjp161948 		return (2);
1900Sstevel@tonic-gate 
191*2705Sjp161948 	return (1);
1920Sstevel@tonic-gate }
1930Sstevel@tonic-gate 
1940Sstevel@tonic-gate char *
1950Sstevel@tonic-gate g11n_getlocale()
1960Sstevel@tonic-gate {
197*2705Sjp161948 	/* we have one text domain - always set it */
198*2705Sjp161948 	(void) textdomain(TEXT_DOMAIN);
1990Sstevel@tonic-gate 
200*2705Sjp161948 	/* if the locale is not set, set it from the env vars */
201*2705Sjp161948 	if (!setlocale(LC_MESSAGES, NULL))
202*2705Sjp161948 		(void) setlocale(LC_MESSAGES, "");
2030Sstevel@tonic-gate 
204*2705Sjp161948 	return (setlocale(LC_MESSAGES, NULL));
2050Sstevel@tonic-gate }
2060Sstevel@tonic-gate 
2070Sstevel@tonic-gate void
2080Sstevel@tonic-gate g11n_setlocale(int category, const char *locale)
2090Sstevel@tonic-gate {
210*2705Sjp161948 	char *curr;
2110Sstevel@tonic-gate 
212*2705Sjp161948 	/* we have one text domain - always set it */
213*2705Sjp161948 	(void) textdomain(TEXT_DOMAIN);
2140Sstevel@tonic-gate 
215*2705Sjp161948 	if (!locale)
216*2705Sjp161948 		return;
2170Sstevel@tonic-gate 
218*2705Sjp161948 	if (*locale && ((curr = setlocale(category, NULL))) &&
219*2705Sjp161948 	    strcmp(curr, locale) == 0)
220*2705Sjp161948 		return;
2212628Sjp161948 
222*2705Sjp161948 	/* if <category> is bogus, setlocale() will do nothing */
223*2705Sjp161948 	(void) setlocale(category, locale);
2240Sstevel@tonic-gate }
2250Sstevel@tonic-gate 
2260Sstevel@tonic-gate char **
2270Sstevel@tonic-gate g11n_getlocales()
2280Sstevel@tonic-gate {
229*2705Sjp161948 	FILE *locale_out;
230*2705Sjp161948 	uint_t n_elems, list_size, long_line = 0;
231*2705Sjp161948 	char **list;
232*2705Sjp161948 	char locale[64];	/* 64 bytes is plenty for locale names */
233*2705Sjp161948 
234*2705Sjp161948 	if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL)
235*2705Sjp161948 		return (NULL);
2360Sstevel@tonic-gate 
237*2705Sjp161948 	/*
238*2705Sjp161948 	 * start with enough room for 65 locales - that's a lot fewer than
239*2705Sjp161948 	 * all the locales available for installation, but a lot more than
240*2705Sjp161948 	 * what most users will need and install
241*2705Sjp161948 	 */
242*2705Sjp161948 	n_elems = 0;
243*2705Sjp161948 	list_size = 192;
244*2705Sjp161948 	list = (char **) xmalloc(sizeof (char *) * (list_size + 1));
245*2705Sjp161948 	memset(list, 0, sizeof (char *) * (list_size + 1));
2460Sstevel@tonic-gate 
247*2705Sjp161948 	while (fgets(locale, sizeof (locale), locale_out)) {
248*2705Sjp161948 		/* skip long locale names (if any) */
249*2705Sjp161948 		if (!strchr(locale, '\n')) {
250*2705Sjp161948 			long_line = 1;
251*2705Sjp161948 			continue;
252*2705Sjp161948 		} else if (long_line) {
253*2705Sjp161948 			long_line = 0;
254*2705Sjp161948 			continue;
255*2705Sjp161948 		}
2560Sstevel@tonic-gate 
257*2705Sjp161948 		if (strncmp(locale, "iso_8859", 8) == 0)
258*2705Sjp161948 			/* ignore locale names like "iso_8859-1" */
259*2705Sjp161948 			continue;
2600Sstevel@tonic-gate 
261*2705Sjp161948 		if (n_elems == list_size) {
262*2705Sjp161948 			list_size *= 2;
263*2705Sjp161948 			list = (char **)xrealloc((void *) list,
264*2705Sjp161948 			    (list_size + 1) * sizeof (char *));
265*2705Sjp161948 			memset(&list[n_elems + 1], 0,
266*2705Sjp161948 			    sizeof (char *) * (list_size - n_elems + 1));
267*2705Sjp161948 		}
268*2705Sjp161948 
269*2705Sjp161948 		*(strchr(locale, '\n')) = '\0';	/* remove the trailing \n */
270*2705Sjp161948 		list[n_elems++] = xstrdup(locale);
2710Sstevel@tonic-gate 	}
2720Sstevel@tonic-gate 
273*2705Sjp161948 	list[n_elems] = NULL;
274*2705Sjp161948 	(void) pclose(locale_out);
2750Sstevel@tonic-gate 
276*2705Sjp161948 	qsort(list, n_elems - 1, sizeof (char *), locale_cmp);
277*2705Sjp161948 	return (list);
2780Sstevel@tonic-gate }
2790Sstevel@tonic-gate 
/*
 * Return the language tag(s) to offer for this process.
 *
 * The SSH_LANGS environment variable, when set, is returned verbatim.
 * Otherwise the current LC_MESSAGES locale is converted to a language tag;
 * an unset locale, "C" and "POSIX" all yield "i-default".
 *
 * The result is always heap-allocated (caller frees), or NULL when the
 * locale name cannot be converted to a language tag.
 */
char *
g11n_getlangs()
{
	char *locale, *env_langs;

	if ((env_langs = getenv("SSH_LANGS")) != NULL)
		return (xstrdup(env_langs));

	locale = g11n_getlocale();

	/*
	 * Handle the default locales here so the caller always receives
	 * memory it may free, never a static string.
	 */
	if (locale == NULL || *locale == '\0' ||
	    strcmp(locale, "C") == 0 || strcmp(locale, "POSIX") == 0)
		return (xstrdup("i-default"));

	return (g11n_locale2langtag(locale));
}
2950Sstevel@tonic-gate 
/*
 * Convert a NULL-terminated array of locale names into a comma-separated
 * string of distinct language tags, in order of first appearance.
 * Locales that cannot be converted are skipped.  Returns whatever xjoin()
 * produces for the collected tags.
 */
char *
g11n_locales2langs(char **locale_set)
{
	char **p, **q, **tags;
	char *langtag, *joined;
	int locales, n_tags, duplicate;

	for (locales = 0, p = locale_set; p && *p; p++)
		locales++;

	/* at most one tag per locale, plus the terminating NULL */
	tags = xmalloc((locales + 1) * sizeof (char *));
	memset(tags, 0, (locales + 1) * sizeof (char *));
	n_tags = 0;

	for (p = locale_set; p && *p; p++) {
		if ((langtag = g11n_locale2langtag(*p)) == NULL)
			continue;

		duplicate = 0;
		for (q = tags; *q != NULL; q++) {
			if (strcmp(*q, langtag) == 0) {
				duplicate = 1;
				break;
			}
		}

		if (!duplicate) {
			tags[n_tags++] = langtag;
			continue;
		}

		/*
		 * Drop the duplicate.  The default tag is not guaranteed
		 * to be heap memory, so it is never freed.
		 */
		if (!g11n_langtag_is_default(langtag))
			xfree(langtag);
	}

	joined = xjoin(tags, ',');

	/* the joined string is a separate allocation; release the parts */
	for (q = tags; *q != NULL; q++) {
		if (!g11n_langtag_is_default(*q))
			xfree(*q);
	}
	xfree(tags);

	return (joined);
}
3260Sstevel@tonic-gate 
/*
 * qsort() comparator for arrays of C strings: each argument points at a
 * (char *) element, and the strings are ordered with strcmp().
 */
static int
sortcmp(const void *d1, const void *d2)
{
	const char *const *lhs = d1;
	const char *const *rhs = d2;

	return (strcmp(*lhs, *rhs));
}
3350Sstevel@tonic-gate 
/*
 * Compare two language tags by their first two sub-tags (language and
 * country).  Sub-tags are compared case-insensitively, as language tags
 * carry no meaning in their capitalization.
 *
 * Returns:
 *	0 - the primary (language) sub-tags differ
 *	1 - the languages match, but only one tag has a second sub-tag or
 *	    the second sub-tags differ
 *	2 - languages match and the second sub-tags are equal or both absent
 *
 * Any sub-tags after the second are ignored.
 */
int
g11n_langtag_match(char *langtag1, char *langtag2)
{
	size_t len1, len2;

	/* length of the primary sub-tag: up to the first '-' or the end */
	len1 = strcspn(langtag1, "-");
	len2 = strcspn(langtag2, "-");

	/* no match */
	if (len1 != len2 || strncasecmp(langtag1, langtag2, len1) != 0)
		return (0);

	/* no country sub-tags - exact match */
	if (langtag1[len1] == '\0' && langtag2[len2] == '\0')
		return (2);

	/* one langtag has a country sub-tag, the other doesn't */
	if (langtag1[len1] == '\0' || langtag2[len2] == '\0')
		return (1);

	/* both stopped on a '-': compare country subtags */
	langtag1 += len1 + 1;
	langtag2 += len2 + 1;

	len1 = strcspn(langtag1, "-");
	len2 = strcspn(langtag2, "-");

	if (len1 != len2 || strncasecmp(langtag1, langtag2, len1) != 0)
		return (1);

	/* country tags matched - exact match */
	return (2);
}
3850Sstevel@tonic-gate 
3860Sstevel@tonic-gate char *
3870Sstevel@tonic-gate g11n_langtag_set_intersect(char *set1, char *set2)
3880Sstevel@tonic-gate {
389*2705Sjp161948 	char **list1, **list2, **list3, **p, **q, **r;
390*2705Sjp161948 	char *set3, *lang_subtag;
391*2705Sjp161948 	uint_t n1, n2, n3;
392*2705Sjp161948 	uint_t do_append;
393*2705Sjp161948 
394*2705Sjp161948 	list1 = xsplit(set1, ',');
395*2705Sjp161948 	list2 = xsplit(set2, ',');
3960Sstevel@tonic-gate 
397*2705Sjp161948 	for (n1 = 0, p = list1; p && *p; p++, n1++)
398*2705Sjp161948 		;
399*2705Sjp161948 	for (n2 = 0, p = list2; p && *p; p++, n2++)
400*2705Sjp161948 		;
4010Sstevel@tonic-gate 
402*2705Sjp161948 	list3 = (char **) xmalloc(sizeof (char *) * (n1 + n2 + 1));
403*2705Sjp161948 	*list3 = NULL;
4040Sstevel@tonic-gate 
405*2705Sjp161948 	/*
406*2705Sjp161948 	 * we must not sort the user langtags - sorting or not the server's
407*2705Sjp161948 	 * should not affect the outcome
408*2705Sjp161948 	 */
409*2705Sjp161948 	qsort(list2, n2, sizeof (char *), sortcmp);
4100Sstevel@tonic-gate 
411*2705Sjp161948 	for (n3 = 0, p = list1; p && *p; p++) {
412*2705Sjp161948 		do_append = 0;
413*2705Sjp161948 		for (q = list2; q && *q; q++) {
414*2705Sjp161948 			if (g11n_langtag_match(*p, *q) != 2) continue;
415*2705Sjp161948 			/* append element */
416*2705Sjp161948 			for (r = list3; (r - list3) <= (n1 + n2); r++) {
417*2705Sjp161948 				do_append = 1;
418*2705Sjp161948 				if (!*r)
419*2705Sjp161948 					break;
420*2705Sjp161948 				if (strcmp(*p, *r) == 0) {
421*2705Sjp161948 					do_append = 0;
422*2705Sjp161948 					break;
423*2705Sjp161948 				}
424*2705Sjp161948 			}
425*2705Sjp161948 			if (do_append && n3 <= (n1 + n2)) {
426*2705Sjp161948 				list3[n3++] = xstrdup(*p);
427*2705Sjp161948 				list3[n3] = NULL;
428*2705Sjp161948 			}
4290Sstevel@tonic-gate 		}
4300Sstevel@tonic-gate 	}
431*2705Sjp161948 
432*2705Sjp161948 	for (p = list1; p && *p; p++) {
433*2705Sjp161948 		do_append = 0;
434*2705Sjp161948 		for (q = list2; q && *q; q++) {
435*2705Sjp161948 			if (g11n_langtag_match(*p, *q) != 1)
436*2705Sjp161948 				continue;
4370Sstevel@tonic-gate 
438*2705Sjp161948 			/* append element */
439*2705Sjp161948 			lang_subtag = xstrdup(*p);
440*2705Sjp161948 			if (strchr(lang_subtag, '-'))
441*2705Sjp161948 				*(strchr(lang_subtag, '-')) = '\0';
442*2705Sjp161948 			for (r = list3; (r - list3) <= (n1 + n2); r++) {
443*2705Sjp161948 				do_append = 1;
444*2705Sjp161948 				if (!*r)
445*2705Sjp161948 					break;
446*2705Sjp161948 				if (strcmp(lang_subtag, *r) == 0) {
447*2705Sjp161948 					do_append = 0;
448*2705Sjp161948 					break;
449*2705Sjp161948 				}
450*2705Sjp161948 			}
451*2705Sjp161948 			if (do_append && n3 <= (n1 + n2)) {
452*2705Sjp161948 				list3[n3++] = lang_subtag;
453*2705Sjp161948 				list3[n3] = NULL;
454*2705Sjp161948 			} else
455*2705Sjp161948 				xfree(lang_subtag);
4560Sstevel@tonic-gate 		}
4570Sstevel@tonic-gate 	}
4580Sstevel@tonic-gate 
459*2705Sjp161948 	set3 = xjoin(list3, ',');
460*2705Sjp161948 	xfree_split_list(list1);
461*2705Sjp161948 	xfree_split_list(list2);
462*2705Sjp161948 	xfree_split_list(list3);
4630Sstevel@tonic-gate 
464*2705Sjp161948 	return (set3);
4650Sstevel@tonic-gate }
4660Sstevel@tonic-gate 
/*
 * Pick the language tag the client should use: the first entry of the
 * intersection of the client's and the server's language tag sets.
 *
 * Returns a heap-allocated copy of that tag (caller frees), or NULL if
 * the intersection is empty.
 */
char *
g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags)
{
	char *list, *result = NULL;
	char **xlist;

	/* g11n_langtag_set_intersect uses xmalloc - should not return NULL */
	list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags);

	if (list == NULL)
		return (NULL);

	xlist = xsplit(list, ',');
	xfree(list);

	if (xlist == NULL)
		return (NULL);

	if (*xlist != NULL)
		result = xstrdup(*xlist);

	/* release the split list on the empty-intersection path as well */
	xfree_split_list(xlist);

	return (result);
}
4910Sstevel@tonic-gate 
/*
 * Return non-zero if the locale name's codeset (the part after the first
 * '.') is exactly "UTF-8", optionally followed by an '@' modifier.
 */
static int
locale_has_utf8_codeset(const char *name)
{
	const char *dot = strchr(name, '.');

	if (dot == NULL || strncmp(dot + 1, "UTF-8", 5) != 0)
		return (0);

	return (dot[6] == '\0' || dot[6] == '@');
}

/* Return non-zero if name is one of "C", "POSIX" or "common". */
static int
locale_is_builtin_default(const char *name)
{
	return (strcmp(name, "C") == 0 || strcmp(name, "POSIX") == 0 ||
	    strcmp(name, "common") == 0);
}

/*
 * qsort() comparator for arrays of locale names.  UTF-8 locales sort
 * before all others, the default locales ("C", "POSIX", "common") sort
 * after everything else, and ties are broken with a straight strcmp().
 */
static int
locale_cmp(const void *d1, const void *d2)
{
	char *s1 = *(char **)d1;
	char *s2 = *(char **)d2;
	int utf8_1 = locale_has_utf8_codeset(s1);
	int utf8_2 = locale_has_utf8_codeset(s2);
	int default_1, default_2;

	/* prefer UTF-8 locales */
	if (utf8_1 != utf8_2)
		return (utf8_1 ? -1 : 1);

	/* prefer any locale over the default locales */
	default_1 = locale_is_builtin_default(s1);
	default_2 = locale_is_builtin_default(s2);
	if (default_1 != default_2)
		return (default_1 ? 1 : -1);

	return (strcmp(s1, s2));
}
5440Sstevel@tonic-gate 
5450Sstevel@tonic-gate 
5460Sstevel@tonic-gate char **
547*2705Sjp161948 g11n_langtag_set_locale_set_intersect(char *langtag_set, char **locale_set)
5480Sstevel@tonic-gate {
549*2705Sjp161948 	char **langtag_list, **result, **p, **q, **r;
550*2705Sjp161948 	char *s;
551*2705Sjp161948 	uint_t do_append, n_langtags, n_locales, n_results, max_results;
552*2705Sjp161948 
553*2705Sjp161948 	/* count lang tags and locales */
554*2705Sjp161948 	for (n_locales = 0, p = locale_set; p && *p; p++)
555*2705Sjp161948 		n_locales++;
5560Sstevel@tonic-gate 
557*2705Sjp161948 	n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0;
558*2705Sjp161948 	/* count the number of langtags */
559*2705Sjp161948 	for (; s = strchr(s, ','); s++, n_langtags++)
560*2705Sjp161948 		;
561*2705Sjp161948 
562*2705Sjp161948 	qsort(locale_set, n_locales, sizeof (char *), locale_cmp);
5630Sstevel@tonic-gate 
564*2705Sjp161948 	langtag_list = xsplit(langtag_set, ',');
565*2705Sjp161948 	for (n_langtags = 0, p = langtag_list; p && *p; p++, n_langtags++)
566*2705Sjp161948 		;
5670Sstevel@tonic-gate 
568*2705Sjp161948 	max_results = MIN(n_locales, n_langtags) * 2;
569*2705Sjp161948 	result = (char **) xmalloc(sizeof (char *) * (max_results + 1));
570*2705Sjp161948 	*result = NULL;
571*2705Sjp161948 	n_results = 0;
5720Sstevel@tonic-gate 
573*2705Sjp161948 	/* more specific matches first */
574*2705Sjp161948 	for (p = langtag_list; p && *p; p++) {
575*2705Sjp161948 		do_append = 0;
576*2705Sjp161948 		for (q = locale_set; q && *q; q++) {
577*2705Sjp161948 			if (g11n_langtag_matches_locale(*p, *q) == 2) {
578*2705Sjp161948 				do_append = 1;
579*2705Sjp161948 				for (r = result; (r - result) <=
580*2705Sjp161948 				    MIN(n_locales, n_langtags); r++) {
581*2705Sjp161948 					if (!*r)
582*2705Sjp161948 						break;
583*2705Sjp161948 					if (strcmp(*q, *r) == 0) {
584*2705Sjp161948 						do_append = 0;
585*2705Sjp161948 						break;
586*2705Sjp161948 					}
587*2705Sjp161948 				}
588*2705Sjp161948 				if (do_append && n_results < max_results) {
589*2705Sjp161948 					result[n_results++] = xstrdup(*q);
590*2705Sjp161948 					result[n_results] = NULL;
591*2705Sjp161948 				}
592*2705Sjp161948 				break;
593*2705Sjp161948 			}
5940Sstevel@tonic-gate 		}
5950Sstevel@tonic-gate 	}
5960Sstevel@tonic-gate 
597*2705Sjp161948 	for (p = langtag_list; p && *p; p++) {
598*2705Sjp161948 		do_append = 0;
599*2705Sjp161948 		for (q = locale_set; q && *q; q++) {
600*2705Sjp161948 			if (g11n_langtag_matches_locale(*p, *q) == 1) {
601*2705Sjp161948 				do_append = 1;
602*2705Sjp161948 				for (r = result; (r - result) <=
603*2705Sjp161948 				    MIN(n_locales, n_langtags); r++) {
604*2705Sjp161948 					if (!*r)
605*2705Sjp161948 						break;
606*2705Sjp161948 					if (strcmp(*q, *r) == 0) {
607*2705Sjp161948 						do_append = 0;
608*2705Sjp161948 						break;
609*2705Sjp161948 					}
610*2705Sjp161948 				}
611*2705Sjp161948 				if (do_append && n_results < max_results) {
612*2705Sjp161948 					result[n_results++] = xstrdup(*q);
613*2705Sjp161948 					result[n_results] = NULL;
614*2705Sjp161948 				}
615*2705Sjp161948 				break;
616*2705Sjp161948 			}
6170Sstevel@tonic-gate 		}
6180Sstevel@tonic-gate 	}
6190Sstevel@tonic-gate 
620*2705Sjp161948 	xfree_split_list(langtag_list);
621*2705Sjp161948 
622*2705Sjp161948 	return (result);
6230Sstevel@tonic-gate }
6240Sstevel@tonic-gate 
/*
 * Pick the server locale that best matches the client's language tags.
 *
 * If srvr_locales is NULL the installed locales are obtained with
 * g11n_getlocales().  Returns a heap-allocated copy of the best matching
 * locale name (caller frees), or NULL if no locale matches.
 */
char *
g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales)
{
	char **results, **own_locales = NULL, *result = NULL;

	/* remember a list we fetched ourselves so it can be released */
	if (srvr_locales == NULL)
		srvr_locales = own_locales = g11n_getlocales();

	results = g11n_langtag_set_locale_set_intersect(clnt_langtags,
	    srvr_locales);

	if (results != NULL) {
		if (*results != NULL)
			result = xstrdup(*results);
		xfree_split_list(results);
	}

	if (own_locales != NULL)
		xfree_split_list(own_locales);

	return (result);
}
6410Sstevel@tonic-gate 
6420Sstevel@tonic-gate 
6430Sstevel@tonic-gate /*
6440Sstevel@tonic-gate  * Functions for validating ASCII and UTF-8 strings
6450Sstevel@tonic-gate  *
6460Sstevel@tonic-gate  * The error_str parameter is an optional pointer to a char variable
6470Sstevel@tonic-gate  * where to store a string suitable for use with error() or fatal() or
6480Sstevel@tonic-gate  * friends.
6490Sstevel@tonic-gate  *
6500Sstevel@tonic-gate  * The return value is 0 if success, EILSEQ or EINVAL.
6510Sstevel@tonic-gate  *
6520Sstevel@tonic-gate  */
/*
 * Check that the first len bytes of str are 7-bit ASCII.  A len of 0
 * means "up to the terminating NUL".
 *
 * Returns 0 if every byte is a non-NUL 7-bit character, EILSEQ if an
 * 8-bit byte or a NUL is found within len bytes.  A NULL str is valid
 * only when len is 0.
 *
 * error_str is accepted for interface compatibility and is never written.
 */
uint_t
g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str)
{
	const uchar_t *bytes = (const uchar_t *)str;
	uint_t i;

	if (str == NULL)
		return (len != 0 ? EILSEQ : 0);

	if (len == 0)
		len = (uint_t)strlen(str);

	/* never look beyond len bytes: str need not be NUL-terminated */
	for (i = 0; i < len; i++) {
		if (bytes[i] == '\0' || (bytes[i] & 0x80))
			return (EILSEQ);
	}

	return (0);
}
6660Sstevel@tonic-gate 
667*2705Sjp161948 uint_t
668*2705Sjp161948 g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str)
6690Sstevel@tonic-gate {
670*2705Sjp161948 	uchar_t *p;
671*2705Sjp161948 	uint_t c, l;
672*2705Sjp161948 
673*2705Sjp161948 	if (len == 0)
674*2705Sjp161948 		len = strlen((const char *)str);
6750Sstevel@tonic-gate 
676*2705Sjp161948 	for (p = (uchar_t *)str; p && (p - str < len) && *p; ) {
677*2705Sjp161948 		/* 8-bit chars begin a UTF-8 sequence */
678*2705Sjp161948 		if (*p & 0x80) {
679*2705Sjp161948 			/* get sequence length and sanity check first byte */
680*2705Sjp161948 			if (*p < 0xc0)
681*2705Sjp161948 				return (EILSEQ);
682*2705Sjp161948 			else if (*p < 0xe0)
683*2705Sjp161948 				l = 2;
684*2705Sjp161948 			else if (*p < 0xf0)
685*2705Sjp161948 				l = 3;
686*2705Sjp161948 			else if (*p < 0xf8)
687*2705Sjp161948 				l = 4;
688*2705Sjp161948 			else if (*p < 0xfc)
689*2705Sjp161948 				l = 5;
690*2705Sjp161948 			else if (*p < 0xfe)
691*2705Sjp161948 				l = 6;
692*2705Sjp161948 			else
693*2705Sjp161948 				return (EILSEQ);
694*2705Sjp161948 
695*2705Sjp161948 			if ((p + l - str) >= len)
696*2705Sjp161948 				return (EILSEQ);
697*2705Sjp161948 
698*2705Sjp161948 			/* overlong detection - build codepoint */
699*2705Sjp161948 			c = *p & 0x3f;
700*2705Sjp161948 			/* shift c bits from first byte */
701*2705Sjp161948 			c = c << (6 * (l - 1));
702*2705Sjp161948 
703*2705Sjp161948 			if (l > 1) {
704*2705Sjp161948 				if (*(p + 1) && ((*(p + 1) & 0xc0) == 0x80))
705*2705Sjp161948 					c = c | ((*(p + 1) & 0x3f) <<
706*2705Sjp161948 					    (6 * (l - 2)));
707*2705Sjp161948 				else
708*2705Sjp161948 					return (EILSEQ);
709*2705Sjp161948 
710*2705Sjp161948 				if (c < 0x80)
711*2705Sjp161948 					return (EILSEQ);
712*2705Sjp161948 			}
7130Sstevel@tonic-gate 
714*2705Sjp161948 			if (l > 2) {
715*2705Sjp161948 				if (*(p + 2) && ((*(p + 2) & 0xc0) == 0x80))
716*2705Sjp161948 					c = c | ((*(p + 2) & 0x3f) <<
717*2705Sjp161948 					    (6 * (l - 3)));
718*2705Sjp161948 				else
719*2705Sjp161948 					return (EILSEQ);
720*2705Sjp161948 
721*2705Sjp161948 				if (c < 0x800)
722*2705Sjp161948 					return (EILSEQ);
723*2705Sjp161948 			}
724*2705Sjp161948 
725*2705Sjp161948 			if (l > 3) {
726*2705Sjp161948 				if (*(p + 3) && ((*(p + 3) & 0xc0) == 0x80))
727*2705Sjp161948 					c = c | ((*(p + 3) & 0x3f) <<
728*2705Sjp161948 					    (6 * (l - 4)));
729*2705Sjp161948 				else
730*2705Sjp161948 					return (EILSEQ);
731*2705Sjp161948 
732*2705Sjp161948 				if (c < 0x10000)
733*2705Sjp161948 					return (EILSEQ);
734*2705Sjp161948 			}
7350Sstevel@tonic-gate 
736*2705Sjp161948 			if (l > 4) {
737*2705Sjp161948 				if (*(p + 4) && ((*(p + 4) & 0xc0) == 0x80))
738*2705Sjp161948 					c = c | ((*(p + 4) & 0x3f) <<
739*2705Sjp161948 					    (6 * (l - 5)));
740*2705Sjp161948 				else
741*2705Sjp161948 					return (EILSEQ);
742*2705Sjp161948 
743*2705Sjp161948 				if (c < 0x200000)
744*2705Sjp161948 					return (EILSEQ);
745*2705Sjp161948 			}
7460Sstevel@tonic-gate 
747*2705Sjp161948 			if (l > 5) {
748*2705Sjp161948 				if (*(p + 5) && ((*(p + 5) & 0xc0) == 0x80))
749*2705Sjp161948 					c = c | (*(p + 5) & 0x3f);
750*2705Sjp161948 				else
751*2705Sjp161948 					return (EILSEQ);
752*2705Sjp161948 
753*2705Sjp161948 				if (c < 0x4000000)
754*2705Sjp161948 					return (EILSEQ);
755*2705Sjp161948 			}
7560Sstevel@tonic-gate 
			/*
			 * check for UTF-16 surrogates and other illegal
			 * UTF-8 code points
			 */
761*2705Sjp161948 			if (((c <= 0xdfff) && (c >= 0xd800)) ||
762*2705Sjp161948 			    (c == 0xfffe) || (c == 0xffff))
763*2705Sjp161948 				return (EILSEQ);
764*2705Sjp161948 			p += l;
765*2705Sjp161948 		}
766*2705Sjp161948 		/* 7-bit chars are fine */
7670Sstevel@tonic-gate 		else
768*2705Sjp161948 			p++;
7690Sstevel@tonic-gate 	}
770*2705Sjp161948 	return (0);
7710Sstevel@tonic-gate }
7720Sstevel@tonic-gate 
/*
 * Functions for converting to ASCII or UTF-8 from the local codeset, and
 * functions for converting from ASCII or UTF-8 to the local codeset.
 *
 * The error_str parameter is an optional pointer to a uchar_t * variable
 * in which to store a string suitable for use with error() or fatal() or
 * friends.
 *
 * The err_ptr parameter is an optional pointer to an integer where 0
 * (success) or EILSEQ or EINVAL will be stored (failure).
 *
 * These functions return NULL if the conversion fails.
 *
 */
787*2705Sjp161948 uchar_t *
788*2705Sjp161948 g11n_convert_from_ascii(const char *str, int *err_ptr, uchar_t **error_str)
7890Sstevel@tonic-gate {
790*2705Sjp161948 	static uint_t initialized = 0;
791*2705Sjp161948 	static uint_t do_convert = 0;
792*2705Sjp161948 	iconv_t cd;
793*2705Sjp161948 	int err;
7940Sstevel@tonic-gate 
795*2705Sjp161948 	if (!initialized) {
796*2705Sjp161948 		/*
797*2705Sjp161948 		 * iconv_open() fails if the to/from codesets are the
798*2705Sjp161948 		 * same, and there are aliases of codesets to boot...
799*2705Sjp161948 		 */
800*2705Sjp161948 		if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
801*2705Sjp161948 			strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
802*2705Sjp161948 			strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
803*2705Sjp161948 			initialized = 1;
804*2705Sjp161948 			do_convert = 0;
805*2705Sjp161948 		} else {
806*2705Sjp161948 			cd = iconv_open(nl_langinfo(CODESET), "646");
807*2705Sjp161948 			if (cd == (iconv_t)-1) {
808*2705Sjp161948 				if (err_ptr)
809*2705Sjp161948 					*err_ptr = errno;
810*2705Sjp161948 				if (error_str)
811*2705Sjp161948 					*error_str = (uchar_t *)"Cannot "
812*2705Sjp161948 					    "convert ASCII strings to the local"
813*2705Sjp161948 					    " codeset";
814*2705Sjp161948 			}
815*2705Sjp161948 			initialized = 1;
816*2705Sjp161948 			do_convert = 1;
817*2705Sjp161948 		}
8180Sstevel@tonic-gate 	}
819*2705Sjp161948 
820*2705Sjp161948 	if (!do_convert) {
821*2705Sjp161948 		if ((err = g11n_validate_ascii(str, 0, error_str))) {
822*2705Sjp161948 			if (err_ptr)
823*2705Sjp161948 				*err_ptr = err;
824*2705Sjp161948 			return (NULL);
825*2705Sjp161948 		} else
826*2705Sjp161948 			return ((uchar_t *)xstrdup(str));
8270Sstevel@tonic-gate 	}
8280Sstevel@tonic-gate 
829*2705Sjp161948 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
8300Sstevel@tonic-gate }
8310Sstevel@tonic-gate 
832*2705Sjp161948 uchar_t *
833*2705Sjp161948 g11n_convert_from_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str)
8340Sstevel@tonic-gate {
835*2705Sjp161948 	static uint_t initialized = 0;
836*2705Sjp161948 	static uint_t do_convert = 0;
837*2705Sjp161948 	iconv_t cd;
838*2705Sjp161948 	int err;
8390Sstevel@tonic-gate 
840*2705Sjp161948 	if (!initialized) {
841*2705Sjp161948 		/*
842*2705Sjp161948 		 * iconv_open() fails if the to/from codesets are the
843*2705Sjp161948 		 * same, and there are aliases of codesets to boot...
844*2705Sjp161948 		 */
845*2705Sjp161948 		if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
846*2705Sjp161948 		    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
847*2705Sjp161948 			initialized = 1;
848*2705Sjp161948 			do_convert = 0;
849*2705Sjp161948 		} else {
850*2705Sjp161948 			cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
851*2705Sjp161948 			if (cd == (iconv_t)-1) {
852*2705Sjp161948 				if (err_ptr)
853*2705Sjp161948 					*err_ptr = errno;
854*2705Sjp161948 				if (error_str)
855*2705Sjp161948 					*error_str = (uchar_t *)"Cannot "
856*2705Sjp161948 					    "convert UTF-8 strings to the "
857*2705Sjp161948 					    "local codeset";
858*2705Sjp161948 			}
859*2705Sjp161948 			initialized = 1;
860*2705Sjp161948 			do_convert = 1;
861*2705Sjp161948 		}
8620Sstevel@tonic-gate 	}
863*2705Sjp161948 
864*2705Sjp161948 	if (!do_convert) {
865*2705Sjp161948 		if ((err = g11n_validate_utf8(str, 0, error_str))) {
866*2705Sjp161948 			if (err_ptr)
867*2705Sjp161948 				*err_ptr = err;
868*2705Sjp161948 			return (NULL);
869*2705Sjp161948 		} else
870*2705Sjp161948 			return ((uchar_t *)xstrdup((char *)str));
8710Sstevel@tonic-gate 	}
8720Sstevel@tonic-gate 
873*2705Sjp161948 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
8740Sstevel@tonic-gate }
8750Sstevel@tonic-gate 
8760Sstevel@tonic-gate char *
877*2705Sjp161948 g11n_convert_to_ascii(const uchar_t *str, int *err_ptr, uchar_t **error_str)
8780Sstevel@tonic-gate {
879*2705Sjp161948 	static uint_t initialized = 0;
880*2705Sjp161948 	static uint_t do_convert = 0;
881*2705Sjp161948 	iconv_t cd;
8820Sstevel@tonic-gate 
883*2705Sjp161948 	if (!initialized) {
884*2705Sjp161948 		/*
885*2705Sjp161948 		 * iconv_open() fails if the to/from codesets are the
886*2705Sjp161948 		 * same, and there are aliases of codesets to boot...
887*2705Sjp161948 		 */
888*2705Sjp161948 		if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
889*2705Sjp161948 		    strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
890*2705Sjp161948 		    strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
891*2705Sjp161948 			initialized = 1;
892*2705Sjp161948 			do_convert = 0;
893*2705Sjp161948 		} else {
894*2705Sjp161948 			cd = iconv_open("646", nl_langinfo(CODESET));
895*2705Sjp161948 			if (cd == (iconv_t)-1) {
896*2705Sjp161948 				if (err_ptr)
897*2705Sjp161948 					*err_ptr = errno;
898*2705Sjp161948 				if (error_str)
899*2705Sjp161948 					*error_str = (uchar_t *)"Cannot "
900*2705Sjp161948 					    "convert UTF-8 strings to the "
901*2705Sjp161948 					    "local codeset";
902*2705Sjp161948 			}
903*2705Sjp161948 			initialized = 1;
904*2705Sjp161948 			do_convert = 1;
905*2705Sjp161948 		}
9060Sstevel@tonic-gate 	}
9070Sstevel@tonic-gate 
908*2705Sjp161948 	if (!do_convert)
909*2705Sjp161948 		return (xstrdup((char *)str));
910*2705Sjp161948 
911*2705Sjp161948 	return ((char *)do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
9120Sstevel@tonic-gate }
9130Sstevel@tonic-gate 
914*2705Sjp161948 uchar_t *
915*2705Sjp161948 g11n_convert_to_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str)
9160Sstevel@tonic-gate {
917*2705Sjp161948 	static uint_t initialized = 0;
918*2705Sjp161948 	static uint_t do_convert = 0;
919*2705Sjp161948 	iconv_t cd;
9200Sstevel@tonic-gate 
921*2705Sjp161948 	if (!initialized) {
922*2705Sjp161948 		/*
923*2705Sjp161948 		 * iconv_open() fails if the to/from codesets are the
924*2705Sjp161948 		 * same, and there are aliases of codesets to boot...
925*2705Sjp161948 		 */
926*2705Sjp161948 		if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
927*2705Sjp161948 		    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
928*2705Sjp161948 			initialized = 1;
929*2705Sjp161948 			do_convert = 0;
930*2705Sjp161948 		} else {
931*2705Sjp161948 			cd = iconv_open("UTF-8", nl_langinfo(CODESET));
932*2705Sjp161948 			if (cd == (iconv_t)-1) {
933*2705Sjp161948 				if (err_ptr)
934*2705Sjp161948 					*err_ptr = errno;
935*2705Sjp161948 				if (error_str)
936*2705Sjp161948 					*error_str = (uchar_t *)"Cannot "
937*2705Sjp161948 					    "convert UTF-8 strings to the "
938*2705Sjp161948 					    "local codeset";
939*2705Sjp161948 			}
940*2705Sjp161948 			initialized = 1;
941*2705Sjp161948 			do_convert = 1;
942*2705Sjp161948 		}
9430Sstevel@tonic-gate 	}
9440Sstevel@tonic-gate 
945*2705Sjp161948 	if (!do_convert)
946*2705Sjp161948 		return ((uchar_t *)xstrdup((char *)str));
947*2705Sjp161948 
948*2705Sjp161948 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
9490Sstevel@tonic-gate }
9500Sstevel@tonic-gate 
9510Sstevel@tonic-gate 
9520Sstevel@tonic-gate /*
9530Sstevel@tonic-gate  * Wrapper around iconv()
9540Sstevel@tonic-gate  *
9550Sstevel@tonic-gate  * The caller is responsible for freeing the result and for handling
9560Sstevel@tonic-gate  * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF).
9570Sstevel@tonic-gate  */
958*2705Sjp161948 static uchar_t *
959*2705Sjp161948 do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, uint_t len,
960*2705Sjp161948     uint_t *outlen, int *err, uchar_t **err_str)
961*2705Sjp161948 {
962*2705Sjp161948 	size_t inbytesleft, outbytesleft, converted_size;
963*2705Sjp161948 	char *outbuf;
964*2705Sjp161948 	uchar_t *converted;
965*2705Sjp161948 	const char *inbuf;
966*2705Sjp161948 	uint_t mul = 0;
9670Sstevel@tonic-gate 
968*2705Sjp161948 	if (!buf || !(*(char *)buf))
969*2705Sjp161948 		return (NULL);
970*2705Sjp161948 
971*2705Sjp161948 	if (len == 0)
972*2705Sjp161948 		len = strlen(buf);
973*2705Sjp161948 
974*2705Sjp161948 	/* reset conversion descriptor */
975*2705Sjp161948 	/* XXX Do we need initial shift sequences for UTF-8??? */
976*2705Sjp161948 	(void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft);
977*2705Sjp161948 	inbuf = (const char *) buf;
978*2705Sjp161948 
979*2705Sjp161948 	if (mul_ptr)
980*2705Sjp161948 		mul = *mul_ptr;
981*2705Sjp161948 
982*2705Sjp161948 	converted_size = (len << mul);
983*2705Sjp161948 	outbuf = (char *)xmalloc(converted_size + 1); /* for null */
984*2705Sjp161948 	converted = (uchar_t *)outbuf;
985*2705Sjp161948 	outbytesleft = len;
9860Sstevel@tonic-gate 
987*2705Sjp161948 	do {
988*2705Sjp161948 		if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) ==
989*2705Sjp161948 		    (size_t)-1) {
990*2705Sjp161948 			if (errno == E2BIG) {
991*2705Sjp161948 				/* UTF-8 codepoints are at most 8 bytes long */
992*2705Sjp161948 				if (mul > 2) {
993*2705Sjp161948 					if (err_str)
994*2705Sjp161948 						*err_str = (uchar_t *)
995*2705Sjp161948 						    "Conversion to UTF-8 failed"
996*2705Sjp161948 						    " due to preposterous space"
997*2705Sjp161948 						    " requirements";
998*2705Sjp161948 					if (err)
999*2705Sjp161948 						*err = EILSEQ;
1000*2705Sjp161948 					return (NULL);
1001*2705Sjp161948 				}
10020Sstevel@tonic-gate 
1003*2705Sjp161948 				/*
1004*2705Sjp161948 				 * re-alloc output and ensure that the outbuf
1005*2705Sjp161948 				 * and outbytesleft values are adjusted
1006*2705Sjp161948 				 */
1007*2705Sjp161948 				converted = xrealloc(converted,
1008*2705Sjp161948 				    converted_size << 1 + 1);
1009*2705Sjp161948 				outbuf = (char *)converted + converted_size -
1010*2705Sjp161948 				    outbytesleft;
1011*2705Sjp161948 				converted_size = (len << ++(mul));
1012*2705Sjp161948 				outbytesleft = converted_size - outbytesleft;
1013*2705Sjp161948 			} else {
1014*2705Sjp161948 				/*
1015*2705Sjp161948 				 * let the caller deal with iconv() errors,
1016*2705Sjp161948 				 * probably by calling fatal(); xfree() does
1017*2705Sjp161948 				 * not set errno
1018*2705Sjp161948 				 */
1019*2705Sjp161948 				if (err)
1020*2705Sjp161948 					*err = errno;
1021*2705Sjp161948 				xfree(converted);
1022*2705Sjp161948 				return (NULL);
1023*2705Sjp161948 			}
1024*2705Sjp161948 		}
1025*2705Sjp161948 	} while (inbytesleft);
1026*2705Sjp161948 
1027*2705Sjp161948 	*outbuf = '\0'; /* ensure null-termination */
1028*2705Sjp161948 	if (outlen)
1029*2705Sjp161948 		*outlen = converted_size - outbytesleft;
1030*2705Sjp161948 	if (mul_ptr)
1031*2705Sjp161948 		*mul_ptr = mul;
1032*2705Sjp161948 
1033*2705Sjp161948 	return (converted);
10340Sstevel@tonic-gate }
1035