10Sstevel@tonic-gate /* 20Sstevel@tonic-gate * CDDL HEADER START 30Sstevel@tonic-gate * 40Sstevel@tonic-gate * The contents of this file are subject to the terms of the 52628Sjp161948 * Common Development and Distribution License (the "License"). 62628Sjp161948 * You may not use this file except in compliance with the License. 70Sstevel@tonic-gate * 80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 100Sstevel@tonic-gate * See the License for the specific language governing permissions 110Sstevel@tonic-gate * and limitations under the License. 120Sstevel@tonic-gate * 130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 180Sstevel@tonic-gate * 190Sstevel@tonic-gate * CDDL HEADER END 200Sstevel@tonic-gate * 212628Sjp161948 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 220Sstevel@tonic-gate * Use is subject to license terms. 230Sstevel@tonic-gate */ 240Sstevel@tonic-gate 250Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 260Sstevel@tonic-gate 270Sstevel@tonic-gate #include <errno.h> 280Sstevel@tonic-gate #include <locale.h> 290Sstevel@tonic-gate #include <langinfo.h> 300Sstevel@tonic-gate #include <iconv.h> 310Sstevel@tonic-gate #include <ctype.h> 320Sstevel@tonic-gate #include <strings.h> 330Sstevel@tonic-gate #include <string.h> 340Sstevel@tonic-gate #include <stdio.h> 350Sstevel@tonic-gate #include <stdlib.h> 360Sstevel@tonic-gate #include "includes.h" 370Sstevel@tonic-gate #include "xmalloc.h" 380Sstevel@tonic-gate #include "xlist.h" 390Sstevel@tonic-gate 400Sstevel@tonic-gate #ifdef MIN 410Sstevel@tonic-gate #undef MIN 420Sstevel@tonic-gate #endif /* MIN */ 430Sstevel@tonic-gate 44*2705Sjp161948 #define MIN(x, y) ((x) < (y) ? (x) : (y)) 450Sstevel@tonic-gate 46*2705Sjp161948 #define LOCALE_PATH "/usr/bin/locale" 470Sstevel@tonic-gate 48*2705Sjp161948 /* two-char country code, '-' and two-char region code */ 49*2705Sjp161948 #define LANGTAG_MAX 5 500Sstevel@tonic-gate 51*2705Sjp161948 static uchar_t *do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, 52*2705Sjp161948 uint_t len, uint_t *outlen, int *err, uchar_t **err_str); 530Sstevel@tonic-gate 540Sstevel@tonic-gate static int locale_cmp(const void *d1, const void *d2); 550Sstevel@tonic-gate static char *g11n_locale2langtag(char *locale); 560Sstevel@tonic-gate 57*2705Sjp161948 uint_t g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str); 58*2705Sjp161948 uint_t g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str); 590Sstevel@tonic-gate 60*2705Sjp161948 static char * 610Sstevel@tonic-gate g11n_locale2langtag(char *locale) 620Sstevel@tonic-gate { 63*2705Sjp161948 char *langtag; 640Sstevel@tonic-gate 65*2705Sjp161948 /* base cases */ 66*2705Sjp161948 if (!locale || !*locale) 67*2705Sjp161948 return (NULL); 680Sstevel@tonic-gate 69*2705Sjp161948 if (strcmp(locale, "POSIX") == 0 || strcmp(locale, "C") == 0) 70*2705Sjp161948 return ("i-default"); 710Sstevel@tonic-gate 72*2705Sjp161948 /* punt for language codes which are not exactly 2 letters */ 73*2705Sjp161948 if (strlen(locale) < 2 || 74*2705Sjp161948 !isalpha(locale[0]) || 75*2705Sjp161948 !isalpha(locale[1]) || 76*2705Sjp161948 (locale[2] != '\0' && 77*2705Sjp161948 locale[2] != '_' && 78*2705Sjp161948 locale[2] != '.' && 79*2705Sjp161948 locale[2] != '@')) 80*2705Sjp161948 return (NULL); 810Sstevel@tonic-gate 820Sstevel@tonic-gate 83*2705Sjp161948 /* we have a primary language sub-tag */ 84*2705Sjp161948 langtag = (char *)xmalloc(LANGTAG_MAX + 1); 850Sstevel@tonic-gate 86*2705Sjp161948 strncpy(langtag, locale, 2); 87*2705Sjp161948 langtag[2] = '\0'; 880Sstevel@tonic-gate 89*2705Sjp161948 /* do we have country sub-tag? For example: cs_CZ */ 90*2705Sjp161948 if (locale[2] == '_') { 91*2705Sjp161948 if (strlen(locale) < 5 || 92*2705Sjp161948 !isalpha(locale[3]) || 93*2705Sjp161948 !isalpha(locale[4]) || 94*2705Sjp161948 (locale[5] != '\0' && (locale[5] != '.' && 95*2705Sjp161948 locale[5] != '@'))) { 96*2705Sjp161948 return (langtag); 97*2705Sjp161948 } 98*2705Sjp161948 99*2705Sjp161948 /* example: create cs-CZ from cs_CZ */ 100*2705Sjp161948 if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale, 2, 101*2705Sjp161948 locale + 3) == 5) 102*2705Sjp161948 return (langtag); 1030Sstevel@tonic-gate } 1040Sstevel@tonic-gate 105*2705Sjp161948 /* in all other cases we just use the primary language sub-tag */ 106*2705Sjp161948 return (langtag); 1070Sstevel@tonic-gate } 1080Sstevel@tonic-gate 109*2705Sjp161948 uint_t 1100Sstevel@tonic-gate g11n_langtag_is_default(char *langtag) 1110Sstevel@tonic-gate { 112*2705Sjp161948 return (strcmp(langtag, "i-default") == 0); 1130Sstevel@tonic-gate } 1140Sstevel@tonic-gate 1150Sstevel@tonic-gate /* 1160Sstevel@tonic-gate * This lang tag / locale matching function works only for two-character 1170Sstevel@tonic-gate * language primary sub-tags and two-character country sub-tags. 1180Sstevel@tonic-gate */ 119*2705Sjp161948 uint_t 1200Sstevel@tonic-gate g11n_langtag_matches_locale(char *langtag, char *locale) 1210Sstevel@tonic-gate { 122*2705Sjp161948 /* match "i-default" to the process' current locale if possible */ 123*2705Sjp161948 if (g11n_langtag_is_default(langtag)) { 124*2705Sjp161948 if (strcasecmp(locale, "POSIX") == 0 || 125*2705Sjp161948 strcasecmp(locale, "C") == 0) 126*2705Sjp161948 return (1); 127*2705Sjp161948 else 128*2705Sjp161948 return (0); 129*2705Sjp161948 } 1300Sstevel@tonic-gate 131*2705Sjp161948 /* 132*2705Sjp161948 * locale must be at least 2 chars long and the lang part must be 133*2705Sjp161948 * exactly two characters 134*2705Sjp161948 */ 135*2705Sjp161948 if (strlen(locale) < 2 || 136*2705Sjp161948 (!isalpha(locale[0]) || !isalpha(locale[1]) || 137*2705Sjp161948 (locale[2] != '\0' && locale[2] != '_' && 138*2705Sjp161948 locale[2] != '.' && locale[2] != '@'))) 139*2705Sjp161948 return (0); 1400Sstevel@tonic-gate 141*2705Sjp161948 /* same thing with the langtag */ 142*2705Sjp161948 if (strlen(langtag) < 2 || 143*2705Sjp161948 (!isalpha(langtag[0]) || !isalpha(langtag[1]) || 144*2705Sjp161948 (langtag[2] != '\0' && langtag[2] != '-'))) 145*2705Sjp161948 return (0); 1460Sstevel@tonic-gate 147*2705Sjp161948 /* primary language sub-tag and the locale's language part must match */ 148*2705Sjp161948 if (strncasecmp(langtag, locale, 2) != 0) 149*2705Sjp161948 return (0); 1500Sstevel@tonic-gate 151*2705Sjp161948 /* 152*2705Sjp161948 * primary language sub-tag and the locale's language match, now 153*2705Sjp161948 * fuzzy check country part 154*2705Sjp161948 */ 1550Sstevel@tonic-gate 156*2705Sjp161948 /* neither langtag nor locale have more than one component */ 157*2705Sjp161948 if (langtag[2] == '\0' && 158*2705Sjp161948 (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@')) 159*2705Sjp161948 return (2); 1600Sstevel@tonic-gate 161*2705Sjp161948 /* langtag has only one sub-tag... */ 162*2705Sjp161948 if (langtag[2] == '\0') 163*2705Sjp161948 return (1); 1640Sstevel@tonic-gate 165*2705Sjp161948 /* locale has no country code... */ 166*2705Sjp161948 if (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@') 167*2705Sjp161948 return (1); 168*2705Sjp161948 169*2705Sjp161948 /* langtag has more than one subtag and the locale has a country code */ 1700Sstevel@tonic-gate 171*2705Sjp161948 /* ignore second subtag if not two chars */ 172*2705Sjp161948 if (strlen(langtag) < 5) 173*2705Sjp161948 return (1); 1740Sstevel@tonic-gate 175*2705Sjp161948 if (!isalpha(langtag[3]) || !isalpha(langtag[4]) || 176*2705Sjp161948 (langtag[5] != '\0' && langtag[5] != '-')) 177*2705Sjp161948 return (1); 1780Sstevel@tonic-gate 179*2705Sjp161948 /* ignore rest of locale if there is no two-character country part */ 180*2705Sjp161948 if (strlen(locale) < 5) 181*2705Sjp161948 return (1); 1820Sstevel@tonic-gate 183*2705Sjp161948 if (locale[2] != '_' || !isalpha(locale[3]) || !isalpha(locale[4]) || 184*2705Sjp161948 (locale[5] != '\0' && locale[5] != '.' && locale[5] != '@')) 185*2705Sjp161948 return (1); 1860Sstevel@tonic-gate 187*2705Sjp161948 /* if the country part matches, return 2 */ 188*2705Sjp161948 if (strncasecmp(&langtag[3], &locale[3], 2) == 0) 189*2705Sjp161948 return (2); 1900Sstevel@tonic-gate 191*2705Sjp161948 return (1); 1920Sstevel@tonic-gate } 1930Sstevel@tonic-gate 1940Sstevel@tonic-gate char * 1950Sstevel@tonic-gate g11n_getlocale() 1960Sstevel@tonic-gate { 197*2705Sjp161948 /* we have one text domain - always set it */ 198*2705Sjp161948 (void) textdomain(TEXT_DOMAIN); 1990Sstevel@tonic-gate 200*2705Sjp161948 /* if the locale is not set, set it from the env vars */ 201*2705Sjp161948 if (!setlocale(LC_MESSAGES, NULL)) 202*2705Sjp161948 (void) setlocale(LC_MESSAGES, ""); 2030Sstevel@tonic-gate 204*2705Sjp161948 return (setlocale(LC_MESSAGES, NULL)); 2050Sstevel@tonic-gate } 2060Sstevel@tonic-gate 2070Sstevel@tonic-gate void 2080Sstevel@tonic-gate g11n_setlocale(int category, const char *locale) 2090Sstevel@tonic-gate { 210*2705Sjp161948 char *curr; 2110Sstevel@tonic-gate 212*2705Sjp161948 /* we have one text domain - always set it */ 213*2705Sjp161948 (void) textdomain(TEXT_DOMAIN); 2140Sstevel@tonic-gate 215*2705Sjp161948 if (!locale) 216*2705Sjp161948 return; 2170Sstevel@tonic-gate 218*2705Sjp161948 if (*locale && ((curr = setlocale(category, NULL))) && 219*2705Sjp161948 strcmp(curr, locale) == 0) 220*2705Sjp161948 return; 2212628Sjp161948 222*2705Sjp161948 /* if <category> is bogus, setlocale() will do nothing */ 223*2705Sjp161948 (void) setlocale(category, locale); 2240Sstevel@tonic-gate } 2250Sstevel@tonic-gate 2260Sstevel@tonic-gate char ** 2270Sstevel@tonic-gate g11n_getlocales() 2280Sstevel@tonic-gate { 229*2705Sjp161948 FILE *locale_out; 230*2705Sjp161948 uint_t n_elems, list_size, long_line = 0; 231*2705Sjp161948 char **list; 232*2705Sjp161948 char locale[64]; /* 64 bytes is plenty for locale names */ 233*2705Sjp161948 234*2705Sjp161948 if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL) 235*2705Sjp161948 return (NULL); 2360Sstevel@tonic-gate 237*2705Sjp161948 /* 238*2705Sjp161948 * start with enough room for 65 locales - that's a lot fewer than 239*2705Sjp161948 * all the locales available for installation, but a lot more than 240*2705Sjp161948 * what most users will need and install 241*2705Sjp161948 */ 242*2705Sjp161948 n_elems = 0; 243*2705Sjp161948 list_size = 192; 244*2705Sjp161948 list = (char **) xmalloc(sizeof (char *) * (list_size + 1)); 245*2705Sjp161948 memset(list, 0, sizeof (char *) * (list_size + 1)); 2460Sstevel@tonic-gate 247*2705Sjp161948 while (fgets(locale, sizeof (locale), locale_out)) { 248*2705Sjp161948 /* skip long locale names (if any) */ 249*2705Sjp161948 if (!strchr(locale, '\n')) { 250*2705Sjp161948 long_line = 1; 251*2705Sjp161948 continue; 252*2705Sjp161948 } else if (long_line) { 253*2705Sjp161948 long_line = 0; 254*2705Sjp161948 continue; 255*2705Sjp161948 } 2560Sstevel@tonic-gate 257*2705Sjp161948 if (strncmp(locale, "iso_8859", 8) == 0) 258*2705Sjp161948 /* ignore locale names like "iso_8859-1" */ 259*2705Sjp161948 continue; 2600Sstevel@tonic-gate 261*2705Sjp161948 if (n_elems == list_size) { 262*2705Sjp161948 list_size *= 2; 263*2705Sjp161948 list = (char **)xrealloc((void *) list, 264*2705Sjp161948 (list_size + 1) * sizeof (char *)); 265*2705Sjp161948 memset(&list[n_elems + 1], 0, 266*2705Sjp161948 sizeof (char *) * (list_size - n_elems + 1)); 267*2705Sjp161948 } 268*2705Sjp161948 269*2705Sjp161948 *(strchr(locale, '\n')) = '\0'; /* remove the trailing \n */ 270*2705Sjp161948 list[n_elems++] = xstrdup(locale); 2710Sstevel@tonic-gate } 2720Sstevel@tonic-gate 273*2705Sjp161948 list[n_elems] = NULL; 274*2705Sjp161948 (void) pclose(locale_out); 2750Sstevel@tonic-gate 276*2705Sjp161948 qsort(list, n_elems - 1, sizeof (char *), locale_cmp); 277*2705Sjp161948 return (list); 2780Sstevel@tonic-gate } 2790Sstevel@tonic-gate 2800Sstevel@tonic-gate char * 2810Sstevel@tonic-gate g11n_getlangs() 2820Sstevel@tonic-gate { 283*2705Sjp161948 char *locale; 2840Sstevel@tonic-gate 285*2705Sjp161948 if (getenv("SSH_LANGS")) 286*2705Sjp161948 return (xstrdup(getenv("SSH_LANGS"))); 2870Sstevel@tonic-gate 288*2705Sjp161948 locale = g11n_getlocale(); 2890Sstevel@tonic-gate 290*2705Sjp161948 if (!locale || !*locale) 291*2705Sjp161948 return (xstrdup("i-default")); 2920Sstevel@tonic-gate 293*2705Sjp161948 return (g11n_locale2langtag(locale)); 2940Sstevel@tonic-gate } 2950Sstevel@tonic-gate 2960Sstevel@tonic-gate char * 2970Sstevel@tonic-gate g11n_locales2langs(char **locale_set) 2980Sstevel@tonic-gate { 299*2705Sjp161948 char **p, **r, **q; 300*2705Sjp161948 char *langtag; 301*2705Sjp161948 int locales, skip; 3020Sstevel@tonic-gate 303*2705Sjp161948 for (locales = 0, p = locale_set; p && *p; p++) 304*2705Sjp161948 locales++; 3050Sstevel@tonic-gate 306*2705Sjp161948 r = (char **)xmalloc((locales + 1) * sizeof (char *)); 307*2705Sjp161948 memset(r, 0, (locales + 1) * sizeof (char *)); 3080Sstevel@tonic-gate 309*2705Sjp161948 for (p = locale_set; p && *p && ((p - locale_set) <= locales); p++) { 310*2705Sjp161948 skip = 0; 311*2705Sjp161948 if ((langtag = g11n_locale2langtag(*p)) == NULL) 312*2705Sjp161948 continue; 313*2705Sjp161948 for (q = r; (q - r) < locales; q++) { 314*2705Sjp161948 if (!*q) 315*2705Sjp161948 break; 316*2705Sjp161948 if (*q && strcmp(*q, langtag) == 0) 317*2705Sjp161948 skip = 1; 318*2705Sjp161948 } 319*2705Sjp161948 if (!skip) 320*2705Sjp161948 *(q++) = langtag; 321*2705Sjp161948 *q = NULL; 3220Sstevel@tonic-gate } 323*2705Sjp161948 324*2705Sjp161948 return (xjoin(r, ',')); 3250Sstevel@tonic-gate } 3260Sstevel@tonic-gate 327*2705Sjp161948 static int 3280Sstevel@tonic-gate sortcmp(const void *d1, const void *d2) 3290Sstevel@tonic-gate { 330*2705Sjp161948 char *s1 = *(char **)d1; 331*2705Sjp161948 char *s2 = *(char **)d2; 3320Sstevel@tonic-gate 333*2705Sjp161948 return (strcmp(s1, s2)); 3340Sstevel@tonic-gate } 3350Sstevel@tonic-gate 3360Sstevel@tonic-gate int 3370Sstevel@tonic-gate g11n_langtag_match(char *langtag1, char *langtag2) 3380Sstevel@tonic-gate { 339*2705Sjp161948 int len1, len2; 340*2705Sjp161948 char c1, c2; 3410Sstevel@tonic-gate 342*2705Sjp161948 len1 = (strchr(langtag1, '-')) ? 3430Sstevel@tonic-gate (strchr(langtag1, '-') - langtag1) 3440Sstevel@tonic-gate : strlen(langtag1); 3450Sstevel@tonic-gate 346*2705Sjp161948 len2 = (strchr(langtag2, '-')) ? 3470Sstevel@tonic-gate (strchr(langtag2, '-') - langtag2) 3480Sstevel@tonic-gate : strlen(langtag2); 3490Sstevel@tonic-gate 350*2705Sjp161948 /* no match */ 351*2705Sjp161948 if (len1 != len2 || strncmp(langtag1, langtag2, len1) != 0) 352*2705Sjp161948 return (0); 3530Sstevel@tonic-gate 354*2705Sjp161948 c1 = *(langtag1 + len1); 355*2705Sjp161948 c2 = *(langtag2 + len2); 3560Sstevel@tonic-gate 357*2705Sjp161948 /* no country sub-tags - exact match */ 358*2705Sjp161948 if (c1 == '\0' && c2 == '\0') 359*2705Sjp161948 return (2); 3600Sstevel@tonic-gate 361*2705Sjp161948 /* one langtag has a country sub-tag, the other doesn't */ 362*2705Sjp161948 if (c1 == '\0' || c2 == '\0') 363*2705Sjp161948 return (1); 3640Sstevel@tonic-gate 365*2705Sjp161948 /* can't happen - both langtags have a country sub-tag */ 366*2705Sjp161948 if (c1 != '-' || c2 != '-') 367*2705Sjp161948 return (1); 3680Sstevel@tonic-gate 369*2705Sjp161948 /* compare country subtags */ 370*2705Sjp161948 langtag1 = langtag1 + len1 + 1; 371*2705Sjp161948 langtag2 = langtag2 + len2 + 1; 3720Sstevel@tonic-gate 373*2705Sjp161948 len1 = (strchr(langtag1, '-')) ? 374*2705Sjp161948 (strchr(langtag1, '-') - langtag1) : strlen(langtag1); 375*2705Sjp161948 376*2705Sjp161948 len2 = (strchr(langtag2, '-')) ? 377*2705Sjp161948 (strchr(langtag2, '-') - langtag2) : strlen(langtag2); 3780Sstevel@tonic-gate 379*2705Sjp161948 if (len1 != len2 || strncmp(langtag1, langtag2, len1) != 0) 380*2705Sjp161948 return (1); 3810Sstevel@tonic-gate 382*2705Sjp161948 /* country tags matched - exact match */ 383*2705Sjp161948 return (2); 3840Sstevel@tonic-gate } 3850Sstevel@tonic-gate 3860Sstevel@tonic-gate char * 3870Sstevel@tonic-gate g11n_langtag_set_intersect(char *set1, char *set2) 3880Sstevel@tonic-gate { 389*2705Sjp161948 char **list1, **list2, **list3, **p, **q, **r; 390*2705Sjp161948 char *set3, *lang_subtag; 391*2705Sjp161948 uint_t n1, n2, n3; 392*2705Sjp161948 uint_t do_append; 393*2705Sjp161948 394*2705Sjp161948 list1 = xsplit(set1, ','); 395*2705Sjp161948 list2 = xsplit(set2, ','); 3960Sstevel@tonic-gate 397*2705Sjp161948 for (n1 = 0, p = list1; p && *p; p++, n1++) 398*2705Sjp161948 ; 399*2705Sjp161948 for (n2 = 0, p = list2; p && *p; p++, n2++) 400*2705Sjp161948 ; 4010Sstevel@tonic-gate 402*2705Sjp161948 list3 = (char **) xmalloc(sizeof (char *) * (n1 + n2 + 1)); 403*2705Sjp161948 *list3 = NULL; 4040Sstevel@tonic-gate 405*2705Sjp161948 /* 406*2705Sjp161948 * we must not sort the user langtags - sorting or not the server's 407*2705Sjp161948 * should not affect the outcome 408*2705Sjp161948 */ 409*2705Sjp161948 qsort(list2, n2, sizeof (char *), sortcmp); 4100Sstevel@tonic-gate 411*2705Sjp161948 for (n3 = 0, p = list1; p && *p; p++) { 412*2705Sjp161948 do_append = 0; 413*2705Sjp161948 for (q = list2; q && *q; q++) { 414*2705Sjp161948 if (g11n_langtag_match(*p, *q) != 2) continue; 415*2705Sjp161948 /* append element */ 416*2705Sjp161948 for (r = list3; (r - list3) <= (n1 + n2); r++) { 417*2705Sjp161948 do_append = 1; 418*2705Sjp161948 if (!*r) 419*2705Sjp161948 break; 420*2705Sjp161948 if (strcmp(*p, *r) == 0) { 421*2705Sjp161948 do_append = 0; 422*2705Sjp161948 break; 423*2705Sjp161948 } 424*2705Sjp161948 } 425*2705Sjp161948 if (do_append && n3 <= (n1 + n2)) { 426*2705Sjp161948 list3[n3++] = xstrdup(*p); 427*2705Sjp161948 list3[n3] = NULL; 428*2705Sjp161948 } 4290Sstevel@tonic-gate } 4300Sstevel@tonic-gate } 431*2705Sjp161948 432*2705Sjp161948 for (p = list1; p && *p; p++) { 433*2705Sjp161948 do_append = 0; 434*2705Sjp161948 for (q = list2; q && *q; q++) { 435*2705Sjp161948 if (g11n_langtag_match(*p, *q) != 1) 436*2705Sjp161948 continue; 4370Sstevel@tonic-gate 438*2705Sjp161948 /* append element */ 439*2705Sjp161948 lang_subtag = xstrdup(*p); 440*2705Sjp161948 if (strchr(lang_subtag, '-')) 441*2705Sjp161948 *(strchr(lang_subtag, '-')) = '\0'; 442*2705Sjp161948 for (r = list3; (r - list3) <= (n1 + n2); r++) { 443*2705Sjp161948 do_append = 1; 444*2705Sjp161948 if (!*r) 445*2705Sjp161948 break; 446*2705Sjp161948 if (strcmp(lang_subtag, *r) == 0) { 447*2705Sjp161948 do_append = 0; 448*2705Sjp161948 break; 449*2705Sjp161948 } 450*2705Sjp161948 } 451*2705Sjp161948 if (do_append && n3 <= (n1 + n2)) { 452*2705Sjp161948 list3[n3++] = lang_subtag; 453*2705Sjp161948 list3[n3] = NULL; 454*2705Sjp161948 } else 455*2705Sjp161948 xfree(lang_subtag); 4560Sstevel@tonic-gate } 4570Sstevel@tonic-gate } 4580Sstevel@tonic-gate 459*2705Sjp161948 set3 = xjoin(list3, ','); 460*2705Sjp161948 xfree_split_list(list1); 461*2705Sjp161948 xfree_split_list(list2); 462*2705Sjp161948 xfree_split_list(list3); 4630Sstevel@tonic-gate 464*2705Sjp161948 return (set3); 4650Sstevel@tonic-gate } 4660Sstevel@tonic-gate 4670Sstevel@tonic-gate char * 4680Sstevel@tonic-gate g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags) 4690Sstevel@tonic-gate { 470*2705Sjp161948 char *list, *result; 471*2705Sjp161948 char **xlist; 4720Sstevel@tonic-gate 473*2705Sjp161948 /* g11n_langtag_set_intersect uses xmalloc - should not return NULL */ 474*2705Sjp161948 list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags); 4750Sstevel@tonic-gate 476*2705Sjp161948 if (!list) 477*2705Sjp161948 return (NULL); 4780Sstevel@tonic-gate 479*2705Sjp161948 xlist = xsplit(list, ','); 4800Sstevel@tonic-gate 481*2705Sjp161948 xfree(list); 4820Sstevel@tonic-gate 483*2705Sjp161948 if (!xlist || !*xlist) 484*2705Sjp161948 return (NULL); 4850Sstevel@tonic-gate 486*2705Sjp161948 result = xstrdup(*xlist); 487*2705Sjp161948 xfree_split_list(xlist); 4880Sstevel@tonic-gate 489*2705Sjp161948 return (result); 4900Sstevel@tonic-gate } 4910Sstevel@tonic-gate 4920Sstevel@tonic-gate /* 4930Sstevel@tonic-gate * Compare locales, preferring UTF-8 codesets to others, otherwise doing 4940Sstevel@tonic-gate * a stright strcmp() 4950Sstevel@tonic-gate */ 496*2705Sjp161948 static int 4970Sstevel@tonic-gate locale_cmp(const void *d1, const void *d2) 4980Sstevel@tonic-gate { 499*2705Sjp161948 char *dot_ptr; 500*2705Sjp161948 char *s1 = *(char **)d1; 501*2705Sjp161948 char *s2 = *(char **)d2; 502*2705Sjp161948 int s1_is_utf8 = 0; 503*2705Sjp161948 int s2_is_utf8 = 0; 5040Sstevel@tonic-gate 505*2705Sjp161948 /* check if s1 is a UTF-8 locale */ 506*2705Sjp161948 if (((dot_ptr = strchr((char *)s1, '.')) != NULL) && 507*2705Sjp161948 (*dot_ptr != '\0') && (strncmp(dot_ptr + 1, "UTF-8", 5) == 0) && 508*2705Sjp161948 (*(dot_ptr + 6) == '\0' || *(dot_ptr + 6) == '@')) { 509*2705Sjp161948 s1_is_utf8++; 510*2705Sjp161948 } 511*2705Sjp161948 512*2705Sjp161948 /* check if s2 is a UTF-8 locale */ 513*2705Sjp161948 if (((dot_ptr = strchr((char *)s2, '.')) != NULL) && 514*2705Sjp161948 (*dot_ptr != '\0') && (strncmp(dot_ptr + 1, "UTF-8", 5) == 0) && 515*2705Sjp161948 (*(dot_ptr + 6) == '\0' || *(dot_ptr + 6) == '@')) { 516*2705Sjp161948 s2_is_utf8++; 517*2705Sjp161948 } 5180Sstevel@tonic-gate 519*2705Sjp161948 /* prefer UTF-8 locales */ 520*2705Sjp161948 if (s1_is_utf8 && !s2_is_utf8) 521*2705Sjp161948 return (-1); 5220Sstevel@tonic-gate 523*2705Sjp161948 if (s2_is_utf8 && !s1_is_utf8) 524*2705Sjp161948 return (1); 5250Sstevel@tonic-gate 526*2705Sjp161948 /* prefer any locale over the default locales */ 527*2705Sjp161948 if (strcmp(s1, "C") == 0 || strcmp(s1, "POSIX") == 0 || 528*2705Sjp161948 strcmp(s1, "common") == 0) { 529*2705Sjp161948 if (strcmp(s2, "C") != 0 && strcmp(s2, "POSIX") != 0 && 530*2705Sjp161948 strcmp(s2, "common") != 0) 531*2705Sjp161948 return (1); 532*2705Sjp161948 } 5330Sstevel@tonic-gate 534*2705Sjp161948 if (strcmp(s2, "C") == 0 || strcmp(s2, "POSIX") == 0 || 535*2705Sjp161948 strcmp(s2, "common") == 0) { 536*2705Sjp161948 if (strcmp(s1, "C") != 0 && 537*2705Sjp161948 strcmp(s1, "POSIX") != 0 && 538*2705Sjp161948 strcmp(s1, "common") != 0) 539*2705Sjp161948 return (-1); 540*2705Sjp161948 } 5410Sstevel@tonic-gate 542*2705Sjp161948 return (strcmp(s1, s2)); 5430Sstevel@tonic-gate } 5440Sstevel@tonic-gate 5450Sstevel@tonic-gate 5460Sstevel@tonic-gate char ** 547*2705Sjp161948 g11n_langtag_set_locale_set_intersect(char *langtag_set, char **locale_set) 5480Sstevel@tonic-gate { 549*2705Sjp161948 char **langtag_list, **result, **p, **q, **r; 550*2705Sjp161948 char *s; 551*2705Sjp161948 uint_t do_append, n_langtags, n_locales, n_results, max_results; 552*2705Sjp161948 553*2705Sjp161948 /* count lang tags and locales */ 554*2705Sjp161948 for (n_locales = 0, p = locale_set; p && *p; p++) 555*2705Sjp161948 n_locales++; 5560Sstevel@tonic-gate 557*2705Sjp161948 n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0; 558*2705Sjp161948 /* count the number of langtags */ 559*2705Sjp161948 for (; s = strchr(s, ','); s++, n_langtags++) 560*2705Sjp161948 ; 561*2705Sjp161948 562*2705Sjp161948 qsort(locale_set, n_locales, sizeof (char *), locale_cmp); 5630Sstevel@tonic-gate 564*2705Sjp161948 langtag_list = xsplit(langtag_set, ','); 565*2705Sjp161948 for (n_langtags = 0, p = langtag_list; p && *p; p++, n_langtags++) 566*2705Sjp161948 ; 5670Sstevel@tonic-gate 568*2705Sjp161948 max_results = MIN(n_locales, n_langtags) * 2; 569*2705Sjp161948 result = (char **) xmalloc(sizeof (char *) * (max_results + 1)); 570*2705Sjp161948 *result = NULL; 571*2705Sjp161948 n_results = 0; 5720Sstevel@tonic-gate 573*2705Sjp161948 /* more specific matches first */ 574*2705Sjp161948 for (p = langtag_list; p && *p; p++) { 575*2705Sjp161948 do_append = 0; 576*2705Sjp161948 for (q = locale_set; q && *q; q++) { 577*2705Sjp161948 if (g11n_langtag_matches_locale(*p, *q) == 2) { 578*2705Sjp161948 do_append = 1; 579*2705Sjp161948 for (r = result; (r - result) <= 580*2705Sjp161948 MIN(n_locales, n_langtags); r++) { 581*2705Sjp161948 if (!*r) 582*2705Sjp161948 break; 583*2705Sjp161948 if (strcmp(*q, *r) == 0) { 584*2705Sjp161948 do_append = 0; 585*2705Sjp161948 break; 586*2705Sjp161948 } 587*2705Sjp161948 } 588*2705Sjp161948 if (do_append && n_results < max_results) { 589*2705Sjp161948 result[n_results++] = xstrdup(*q); 590*2705Sjp161948 result[n_results] = NULL; 591*2705Sjp161948 } 592*2705Sjp161948 break; 593*2705Sjp161948 } 5940Sstevel@tonic-gate } 5950Sstevel@tonic-gate } 5960Sstevel@tonic-gate 597*2705Sjp161948 for (p = langtag_list; p && *p; p++) { 598*2705Sjp161948 do_append = 0; 599*2705Sjp161948 for (q = locale_set; q && *q; q++) { 600*2705Sjp161948 if (g11n_langtag_matches_locale(*p, *q) == 1) { 601*2705Sjp161948 do_append = 1; 602*2705Sjp161948 for (r = result; (r - result) <= 603*2705Sjp161948 MIN(n_locales, n_langtags); r++) { 604*2705Sjp161948 if (!*r) 605*2705Sjp161948 break; 606*2705Sjp161948 if (strcmp(*q, *r) == 0) { 607*2705Sjp161948 do_append = 0; 608*2705Sjp161948 break; 609*2705Sjp161948 } 610*2705Sjp161948 } 611*2705Sjp161948 if (do_append && n_results < max_results) { 612*2705Sjp161948 result[n_results++] = xstrdup(*q); 613*2705Sjp161948 result[n_results] = NULL; 614*2705Sjp161948 } 615*2705Sjp161948 break; 616*2705Sjp161948 } 6170Sstevel@tonic-gate } 6180Sstevel@tonic-gate } 6190Sstevel@tonic-gate 620*2705Sjp161948 xfree_split_list(langtag_list); 621*2705Sjp161948 622*2705Sjp161948 return (result); 6230Sstevel@tonic-gate } 6240Sstevel@tonic-gate 6250Sstevel@tonic-gate char * 6260Sstevel@tonic-gate g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales) 6270Sstevel@tonic-gate { 628*2705Sjp161948 char **results, *result = NULL; 6290Sstevel@tonic-gate 630*2705Sjp161948 if ((results = g11n_langtag_set_locale_set_intersect(clnt_langtags, 6310Sstevel@tonic-gate srvr_locales ? srvr_locales : g11n_getlocales())) == NULL) 632*2705Sjp161948 return (NULL); 6330Sstevel@tonic-gate 634*2705Sjp161948 if (*results != NULL) 635*2705Sjp161948 result = xstrdup(*results); 6360Sstevel@tonic-gate 637*2705Sjp161948 xfree_split_list(results); 6380Sstevel@tonic-gate 639*2705Sjp161948 return (result); 6400Sstevel@tonic-gate } 6410Sstevel@tonic-gate 6420Sstevel@tonic-gate 6430Sstevel@tonic-gate /* 6440Sstevel@tonic-gate * Functions for validating ASCII and UTF-8 strings 6450Sstevel@tonic-gate * 6460Sstevel@tonic-gate * The error_str parameter is an optional pointer to a char variable 6470Sstevel@tonic-gate * where to store a string suitable for use with error() or fatal() or 6480Sstevel@tonic-gate * friends. 6490Sstevel@tonic-gate * 6500Sstevel@tonic-gate * The return value is 0 if success, EILSEQ or EINVAL. 6510Sstevel@tonic-gate * 6520Sstevel@tonic-gate */ 653*2705Sjp161948 uint_t 654*2705Sjp161948 g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str) 6550Sstevel@tonic-gate { 656*2705Sjp161948 uchar_t *p; 6570Sstevel@tonic-gate 658*2705Sjp161948 for (p = (uchar_t *)str; p && *p && (!(*p & 0x80)); p++) 659*2705Sjp161948 ; 6600Sstevel@tonic-gate 661*2705Sjp161948 if (len && ((p - (uchar_t *)str) != len)) 662*2705Sjp161948 return (EILSEQ); 663*2705Sjp161948 664*2705Sjp161948 return (0); 6650Sstevel@tonic-gate } 6660Sstevel@tonic-gate 667*2705Sjp161948 uint_t 668*2705Sjp161948 g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str) 6690Sstevel@tonic-gate { 670*2705Sjp161948 uchar_t *p; 671*2705Sjp161948 uint_t c, l; 672*2705Sjp161948 673*2705Sjp161948 if (len == 0) 674*2705Sjp161948 len = strlen((const char *)str); 6750Sstevel@tonic-gate 676*2705Sjp161948 for (p = (uchar_t *)str; p && (p - str < len) && *p; ) { 677*2705Sjp161948 /* 8-bit chars begin a UTF-8 sequence */ 678*2705Sjp161948 if (*p & 0x80) { 679*2705Sjp161948 /* get sequence length and sanity check first byte */ 680*2705Sjp161948 if (*p < 0xc0) 681*2705Sjp161948 return (EILSEQ); 682*2705Sjp161948 else if (*p < 0xe0) 683*2705Sjp161948 l = 2; 684*2705Sjp161948 else if (*p < 0xf0) 685*2705Sjp161948 l = 3; 686*2705Sjp161948 else if (*p < 0xf8) 687*2705Sjp161948 l = 4; 688*2705Sjp161948 else if (*p < 0xfc) 689*2705Sjp161948 l = 5; 690*2705Sjp161948 else if (*p < 0xfe) 691*2705Sjp161948 l = 6; 692*2705Sjp161948 else 693*2705Sjp161948 return (EILSEQ); 694*2705Sjp161948 695*2705Sjp161948 if ((p + l - str) >= len) 696*2705Sjp161948 return (EILSEQ); 697*2705Sjp161948 698*2705Sjp161948 /* overlong detection - build codepoint */ 699*2705Sjp161948 c = *p & 0x3f; 700*2705Sjp161948 /* shift c bits from first byte */ 701*2705Sjp161948 c = c << (6 * (l - 1)); 702*2705Sjp161948 703*2705Sjp161948 if (l > 1) { 704*2705Sjp161948 if (*(p + 1) && ((*(p + 1) & 0xc0) == 0x80)) 705*2705Sjp161948 c = c | ((*(p + 1) & 0x3f) << 706*2705Sjp161948 (6 * (l - 2))); 707*2705Sjp161948 else 708*2705Sjp161948 return (EILSEQ); 709*2705Sjp161948 710*2705Sjp161948 if (c < 0x80) 711*2705Sjp161948 return (EILSEQ); 712*2705Sjp161948 } 7130Sstevel@tonic-gate 714*2705Sjp161948 if (l > 2) { 715*2705Sjp161948 if (*(p + 2) && ((*(p + 2) & 0xc0) == 0x80)) 716*2705Sjp161948 c = c | ((*(p + 2) & 0x3f) << 717*2705Sjp161948 (6 * (l - 3))); 718*2705Sjp161948 else 719*2705Sjp161948 return (EILSEQ); 720*2705Sjp161948 721*2705Sjp161948 if (c < 0x800) 722*2705Sjp161948 return (EILSEQ); 723*2705Sjp161948 } 724*2705Sjp161948 725*2705Sjp161948 if (l > 3) { 726*2705Sjp161948 if (*(p + 3) && ((*(p + 3) & 0xc0) == 0x80)) 727*2705Sjp161948 c = c | ((*(p + 3) & 0x3f) << 728*2705Sjp161948 (6 * (l - 4))); 729*2705Sjp161948 else 730*2705Sjp161948 return (EILSEQ); 731*2705Sjp161948 732*2705Sjp161948 if (c < 0x10000) 733*2705Sjp161948 return (EILSEQ); 734*2705Sjp161948 } 7350Sstevel@tonic-gate 736*2705Sjp161948 if (l > 4) { 737*2705Sjp161948 if (*(p + 4) && ((*(p + 4) & 0xc0) == 0x80)) 738*2705Sjp161948 c = c | ((*(p + 4) & 0x3f) << 739*2705Sjp161948 (6 * (l - 5))); 740*2705Sjp161948 else 741*2705Sjp161948 return (EILSEQ); 742*2705Sjp161948 743*2705Sjp161948 if (c < 0x200000) 744*2705Sjp161948 return (EILSEQ); 745*2705Sjp161948 } 7460Sstevel@tonic-gate 747*2705Sjp161948 if (l > 5) { 748*2705Sjp161948 if (*(p + 5) && ((*(p + 5) & 0xc0) == 0x80)) 749*2705Sjp161948 c = c | (*(p + 5) & 0x3f); 750*2705Sjp161948 else 751*2705Sjp161948 return (EILSEQ); 752*2705Sjp161948 753*2705Sjp161948 if (c < 0x4000000) 754*2705Sjp161948 return (EILSEQ); 755*2705Sjp161948 } 7560Sstevel@tonic-gate 757*2705Sjp161948 /* 758*2705Sjp161948 * check for UTF-16 surrogates ifs other illegal 759*2705Sjp161948 * UTF-8 * points 760*2705Sjp161948 */ 761*2705Sjp161948 if (((c <= 0xdfff) && (c >= 0xd800)) || 762*2705Sjp161948 (c == 0xfffe) || (c == 0xffff)) 763*2705Sjp161948 return (EILSEQ); 764*2705Sjp161948 p += l; 765*2705Sjp161948 } 766*2705Sjp161948 /* 7-bit chars are fine */ 7670Sstevel@tonic-gate else 768*2705Sjp161948 p++; 7690Sstevel@tonic-gate } 770*2705Sjp161948 return (0); 7710Sstevel@tonic-gate } 7720Sstevel@tonic-gate 7730Sstevel@tonic-gate /* 7740Sstevel@tonic-gate * Functions for converting to ASCII or UTF-8 from the local codeset 7750Sstevel@tonic-gate * Functions for converting from ASCII or UTF-8 to the local codeset 7760Sstevel@tonic-gate * 7770Sstevel@tonic-gate * The error_str parameter is an optional pointer to a char variable 7780Sstevel@tonic-gate * where to store a string suitable for use with error() or fatal() or 7790Sstevel@tonic-gate * friends. 7800Sstevel@tonic-gate * 7810Sstevel@tonic-gate * The err parameter is an optional pointer to an integer where 0 7820Sstevel@tonic-gate * (success) or EILSEQ or EINVAL will be stored (failure). 7830Sstevel@tonic-gate * 7840Sstevel@tonic-gate * These functions return NULL if the conversion fails. 7850Sstevel@tonic-gate * 7860Sstevel@tonic-gate */ 787*2705Sjp161948 uchar_t * 788*2705Sjp161948 g11n_convert_from_ascii(const char *str, int *err_ptr, uchar_t **error_str) 7890Sstevel@tonic-gate { 790*2705Sjp161948 static uint_t initialized = 0; 791*2705Sjp161948 static uint_t do_convert = 0; 792*2705Sjp161948 iconv_t cd; 793*2705Sjp161948 int err; 7940Sstevel@tonic-gate 795*2705Sjp161948 if (!initialized) { 796*2705Sjp161948 /* 797*2705Sjp161948 * iconv_open() fails if the to/from codesets are the 798*2705Sjp161948 * same, and there are aliases of codesets to boot... 799*2705Sjp161948 */ 800*2705Sjp161948 if (strcmp("646", nl_langinfo(CODESET)) == 0 || 801*2705Sjp161948 strcmp("ASCII", nl_langinfo(CODESET)) == 0 || 802*2705Sjp161948 strcmp("US-ASCII", nl_langinfo(CODESET)) == 0) { 803*2705Sjp161948 initialized = 1; 804*2705Sjp161948 do_convert = 0; 805*2705Sjp161948 } else { 806*2705Sjp161948 cd = iconv_open(nl_langinfo(CODESET), "646"); 807*2705Sjp161948 if (cd == (iconv_t)-1) { 808*2705Sjp161948 if (err_ptr) 809*2705Sjp161948 *err_ptr = errno; 810*2705Sjp161948 if (error_str) 811*2705Sjp161948 *error_str = (uchar_t *)"Cannot " 812*2705Sjp161948 "convert ASCII strings to the local" 813*2705Sjp161948 " codeset"; 814*2705Sjp161948 } 815*2705Sjp161948 initialized = 1; 816*2705Sjp161948 do_convert = 1; 817*2705Sjp161948 } 8180Sstevel@tonic-gate } 819*2705Sjp161948 820*2705Sjp161948 if (!do_convert) { 821*2705Sjp161948 if ((err = g11n_validate_ascii(str, 0, error_str))) { 822*2705Sjp161948 if (err_ptr) 823*2705Sjp161948 *err_ptr = err; 824*2705Sjp161948 return (NULL); 825*2705Sjp161948 } else 826*2705Sjp161948 return ((uchar_t *)xstrdup(str)); 8270Sstevel@tonic-gate } 8280Sstevel@tonic-gate 829*2705Sjp161948 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 8300Sstevel@tonic-gate } 8310Sstevel@tonic-gate 832*2705Sjp161948 uchar_t * 833*2705Sjp161948 g11n_convert_from_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str) 8340Sstevel@tonic-gate { 835*2705Sjp161948 static uint_t initialized = 0; 836*2705Sjp161948 static uint_t do_convert = 0; 837*2705Sjp161948 iconv_t cd; 838*2705Sjp161948 int err; 8390Sstevel@tonic-gate 840*2705Sjp161948 if (!initialized) { 841*2705Sjp161948 /* 842*2705Sjp161948 * iconv_open() fails if the to/from codesets are the 843*2705Sjp161948 * same, and there are aliases of codesets to boot... 844*2705Sjp161948 */ 845*2705Sjp161948 if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 || 846*2705Sjp161948 strcmp("UTF8", nl_langinfo(CODESET)) == 0) { 847*2705Sjp161948 initialized = 1; 848*2705Sjp161948 do_convert = 0; 849*2705Sjp161948 } else { 850*2705Sjp161948 cd = iconv_open(nl_langinfo(CODESET), "UTF-8"); 851*2705Sjp161948 if (cd == (iconv_t)-1) { 852*2705Sjp161948 if (err_ptr) 853*2705Sjp161948 *err_ptr = errno; 854*2705Sjp161948 if (error_str) 855*2705Sjp161948 *error_str = (uchar_t *)"Cannot " 856*2705Sjp161948 "convert UTF-8 strings to the " 857*2705Sjp161948 "local codeset"; 858*2705Sjp161948 } 859*2705Sjp161948 initialized = 1; 860*2705Sjp161948 do_convert = 1; 861*2705Sjp161948 } 8620Sstevel@tonic-gate } 863*2705Sjp161948 864*2705Sjp161948 if (!do_convert) { 865*2705Sjp161948 if ((err = g11n_validate_utf8(str, 0, error_str))) { 866*2705Sjp161948 if (err_ptr) 867*2705Sjp161948 *err_ptr = err; 868*2705Sjp161948 return (NULL); 869*2705Sjp161948 } else 870*2705Sjp161948 return ((uchar_t *)xstrdup((char *)str)); 8710Sstevel@tonic-gate } 8720Sstevel@tonic-gate 873*2705Sjp161948 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 8740Sstevel@tonic-gate } 8750Sstevel@tonic-gate 8760Sstevel@tonic-gate char * 877*2705Sjp161948 g11n_convert_to_ascii(const uchar_t *str, int *err_ptr, uchar_t **error_str) 8780Sstevel@tonic-gate { 879*2705Sjp161948 static uint_t initialized = 0; 880*2705Sjp161948 static uint_t do_convert = 0; 881*2705Sjp161948 iconv_t cd; 8820Sstevel@tonic-gate 883*2705Sjp161948 if (!initialized) { 884*2705Sjp161948 /* 885*2705Sjp161948 * iconv_open() fails if the to/from codesets are the 886*2705Sjp161948 * same, and there are aliases of codesets to boot... 887*2705Sjp161948 */ 888*2705Sjp161948 if (strcmp("646", nl_langinfo(CODESET)) == 0 || 889*2705Sjp161948 strcmp("ASCII", nl_langinfo(CODESET)) == 0 || 890*2705Sjp161948 strcmp("US-ASCII", nl_langinfo(CODESET)) == 0) { 891*2705Sjp161948 initialized = 1; 892*2705Sjp161948 do_convert = 0; 893*2705Sjp161948 } else { 894*2705Sjp161948 cd = iconv_open("646", nl_langinfo(CODESET)); 895*2705Sjp161948 if (cd == (iconv_t)-1) { 896*2705Sjp161948 if (err_ptr) 897*2705Sjp161948 *err_ptr = errno; 898*2705Sjp161948 if (error_str) 899*2705Sjp161948 *error_str = (uchar_t *)"Cannot " 900*2705Sjp161948 "convert UTF-8 strings to the " 901*2705Sjp161948 "local codeset"; 902*2705Sjp161948 } 903*2705Sjp161948 initialized = 1; 904*2705Sjp161948 do_convert = 1; 905*2705Sjp161948 } 9060Sstevel@tonic-gate } 9070Sstevel@tonic-gate 908*2705Sjp161948 if (!do_convert) 909*2705Sjp161948 return (xstrdup((char *)str)); 910*2705Sjp161948 911*2705Sjp161948 return ((char *)do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 9120Sstevel@tonic-gate } 9130Sstevel@tonic-gate 914*2705Sjp161948 uchar_t * 915*2705Sjp161948 g11n_convert_to_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str) 9160Sstevel@tonic-gate { 917*2705Sjp161948 static uint_t initialized = 0; 918*2705Sjp161948 static uint_t do_convert = 0; 919*2705Sjp161948 iconv_t cd; 9200Sstevel@tonic-gate 921*2705Sjp161948 if (!initialized) { 922*2705Sjp161948 /* 923*2705Sjp161948 * iconv_open() fails if the to/from codesets are the 924*2705Sjp161948 * same, and there are aliases of codesets to boot... 925*2705Sjp161948 */ 926*2705Sjp161948 if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 || 927*2705Sjp161948 strcmp("UTF8", nl_langinfo(CODESET)) == 0) { 928*2705Sjp161948 initialized = 1; 929*2705Sjp161948 do_convert = 0; 930*2705Sjp161948 } else { 931*2705Sjp161948 cd = iconv_open("UTF-8", nl_langinfo(CODESET)); 932*2705Sjp161948 if (cd == (iconv_t)-1) { 933*2705Sjp161948 if (err_ptr) 934*2705Sjp161948 *err_ptr = errno; 935*2705Sjp161948 if (error_str) 936*2705Sjp161948 *error_str = (uchar_t *)"Cannot " 937*2705Sjp161948 "convert UTF-8 strings to the " 938*2705Sjp161948 "local codeset"; 939*2705Sjp161948 } 940*2705Sjp161948 initialized = 1; 941*2705Sjp161948 do_convert = 1; 942*2705Sjp161948 } 9430Sstevel@tonic-gate } 9440Sstevel@tonic-gate 945*2705Sjp161948 if (!do_convert) 946*2705Sjp161948 return ((uchar_t *)xstrdup((char *)str)); 947*2705Sjp161948 948*2705Sjp161948 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 9490Sstevel@tonic-gate } 9500Sstevel@tonic-gate 9510Sstevel@tonic-gate 9520Sstevel@tonic-gate /* 9530Sstevel@tonic-gate * Wrapper around iconv() 9540Sstevel@tonic-gate * 9550Sstevel@tonic-gate * The caller is responsible for freeing the result and for handling 9560Sstevel@tonic-gate * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF). 9570Sstevel@tonic-gate */ 958*2705Sjp161948 static uchar_t * 959*2705Sjp161948 do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, uint_t len, 960*2705Sjp161948 uint_t *outlen, int *err, uchar_t **err_str) 961*2705Sjp161948 { 962*2705Sjp161948 size_t inbytesleft, outbytesleft, converted_size; 963*2705Sjp161948 char *outbuf; 964*2705Sjp161948 uchar_t *converted; 965*2705Sjp161948 const char *inbuf; 966*2705Sjp161948 uint_t mul = 0; 9670Sstevel@tonic-gate 968*2705Sjp161948 if (!buf || !(*(char *)buf)) 969*2705Sjp161948 return (NULL); 970*2705Sjp161948 971*2705Sjp161948 if (len == 0) 972*2705Sjp161948 len = strlen(buf); 973*2705Sjp161948 974*2705Sjp161948 /* reset conversion descriptor */ 975*2705Sjp161948 /* XXX Do we need initial shift sequences for UTF-8??? */ 976*2705Sjp161948 (void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft); 977*2705Sjp161948 inbuf = (const char *) buf; 978*2705Sjp161948 979*2705Sjp161948 if (mul_ptr) 980*2705Sjp161948 mul = *mul_ptr; 981*2705Sjp161948 982*2705Sjp161948 converted_size = (len << mul); 983*2705Sjp161948 outbuf = (char *)xmalloc(converted_size + 1); /* for null */ 984*2705Sjp161948 converted = (uchar_t *)outbuf; 985*2705Sjp161948 outbytesleft = len; 9860Sstevel@tonic-gate 987*2705Sjp161948 do { 988*2705Sjp161948 if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) == 989*2705Sjp161948 (size_t)-1) { 990*2705Sjp161948 if (errno == E2BIG) { 991*2705Sjp161948 /* UTF-8 codepoints are at most 8 bytes long */ 992*2705Sjp161948 if (mul > 2) { 993*2705Sjp161948 if (err_str) 994*2705Sjp161948 *err_str = (uchar_t *) 995*2705Sjp161948 "Conversion to UTF-8 failed" 996*2705Sjp161948 " due to preposterous space" 997*2705Sjp161948 " requirements"; 998*2705Sjp161948 if (err) 999*2705Sjp161948 *err = EILSEQ; 1000*2705Sjp161948 return (NULL); 1001*2705Sjp161948 } 10020Sstevel@tonic-gate 1003*2705Sjp161948 /* 1004*2705Sjp161948 * re-alloc output and ensure that the outbuf 1005*2705Sjp161948 * and outbytesleft values are adjusted 1006*2705Sjp161948 */ 1007*2705Sjp161948 converted = xrealloc(converted, 1008*2705Sjp161948 converted_size << 1 + 1); 1009*2705Sjp161948 outbuf = (char *)converted + converted_size - 1010*2705Sjp161948 outbytesleft; 1011*2705Sjp161948 converted_size = (len << ++(mul)); 1012*2705Sjp161948 outbytesleft = converted_size - outbytesleft; 1013*2705Sjp161948 } else { 1014*2705Sjp161948 /* 1015*2705Sjp161948 * let the caller deal with iconv() errors, 1016*2705Sjp161948 * probably by calling fatal(); xfree() does 1017*2705Sjp161948 * not set errno 1018*2705Sjp161948 */ 1019*2705Sjp161948 if (err) 1020*2705Sjp161948 *err = errno; 1021*2705Sjp161948 xfree(converted); 1022*2705Sjp161948 return (NULL); 1023*2705Sjp161948 } 1024*2705Sjp161948 } 1025*2705Sjp161948 } while (inbytesleft); 1026*2705Sjp161948 1027*2705Sjp161948 *outbuf = '\0'; /* ensure null-termination */ 1028*2705Sjp161948 if (outlen) 1029*2705Sjp161948 *outlen = converted_size - outbytesleft; 1030*2705Sjp161948 if (mul_ptr) 1031*2705Sjp161948 *mul_ptr = mul; 1032*2705Sjp161948 1033*2705Sjp161948 return (converted); 10340Sstevel@tonic-gate } 1035