18271SGordon.Ross@Sun.COM /* 28271SGordon.Ross@Sun.COM * CDDL HEADER START 38271SGordon.Ross@Sun.COM * 48271SGordon.Ross@Sun.COM * The contents of this file are subject to the terms of the 58271SGordon.Ross@Sun.COM * Common Development and Distribution License (the "License"). 68271SGordon.Ross@Sun.COM * You may not use this file except in compliance with the License. 78271SGordon.Ross@Sun.COM * 88271SGordon.Ross@Sun.COM * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 98271SGordon.Ross@Sun.COM * or http://www.opensolaris.org/os/licensing. 108271SGordon.Ross@Sun.COM * See the License for the specific language governing permissions 118271SGordon.Ross@Sun.COM * and limitations under the License. 128271SGordon.Ross@Sun.COM * 138271SGordon.Ross@Sun.COM * When distributing Covered Code, include this CDDL HEADER in each 148271SGordon.Ross@Sun.COM * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 158271SGordon.Ross@Sun.COM * If applicable, add the following below this CDDL HEADER, with the 168271SGordon.Ross@Sun.COM * fields enclosed by brackets "[]" replaced with your own identifying 178271SGordon.Ross@Sun.COM * information: Portions Copyright [yyyy] [name of copyright owner] 188271SGordon.Ross@Sun.COM * 198271SGordon.Ross@Sun.COM * CDDL HEADER END 208271SGordon.Ross@Sun.COM */ 218271SGordon.Ross@Sun.COM 228271SGordon.Ross@Sun.COM /* 23*10023SGordon.Ross@Sun.COM * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 248271SGordon.Ross@Sun.COM * Use is subject to license terms. 258271SGordon.Ross@Sun.COM */ 268271SGordon.Ross@Sun.COM 278271SGordon.Ross@Sun.COM /* 288271SGordon.Ross@Sun.COM * Unicode conversions (yet more) 298271SGordon.Ross@Sun.COM */ 308271SGordon.Ross@Sun.COM 318271SGordon.Ross@Sun.COM #include <stdio.h> 328271SGordon.Ross@Sun.COM #include <stdlib.h> 338271SGordon.Ross@Sun.COM #include <string.h> 348271SGordon.Ross@Sun.COM #include <errno.h> 358271SGordon.Ross@Sun.COM #include <iconv.h> 368271SGordon.Ross@Sun.COM #include <libintl.h> 378271SGordon.Ross@Sun.COM 388271SGordon.Ross@Sun.COM #include <sys/u8_textprep.h> 398271SGordon.Ross@Sun.COM 408271SGordon.Ross@Sun.COM #include <netsmb/smb_lib.h> 418271SGordon.Ross@Sun.COM #include "charsets.h" 428271SGordon.Ross@Sun.COM 438271SGordon.Ross@Sun.COM 448271SGordon.Ross@Sun.COM /* 458271SGordon.Ross@Sun.COM * Number of unicode symbols in the string, 468271SGordon.Ross@Sun.COM * not including the 2-byte null terminator. 478271SGordon.Ross@Sun.COM * (multiply by two for storage size) 488271SGordon.Ross@Sun.COM */ 498271SGordon.Ross@Sun.COM size_t 508271SGordon.Ross@Sun.COM unicode_strlen(const uint16_t *us) 518271SGordon.Ross@Sun.COM { 528271SGordon.Ross@Sun.COM size_t len = 0; 538271SGordon.Ross@Sun.COM while (*us++) 548271SGordon.Ross@Sun.COM len++; 558271SGordon.Ross@Sun.COM return (len); 568271SGordon.Ross@Sun.COM } 578271SGordon.Ross@Sun.COM 588271SGordon.Ross@Sun.COM static char *convert_ucs2xx_to_utf8(iconv_t, const uint16_t *); 598271SGordon.Ross@Sun.COM 608271SGordon.Ross@Sun.COM /* 618271SGordon.Ross@Sun.COM * Convert (native) Unicode string to UTF-8. 628271SGordon.Ross@Sun.COM * Returns allocated memory. 638271SGordon.Ross@Sun.COM */ 648271SGordon.Ross@Sun.COM char * 658271SGordon.Ross@Sun.COM convert_unicode_to_utf8(uint16_t *us) 668271SGordon.Ross@Sun.COM { 678271SGordon.Ross@Sun.COM static iconv_t cd1 = (iconv_t)-1; 688271SGordon.Ross@Sun.COM 698271SGordon.Ross@Sun.COM /* Get conversion descriptor (to, from) */ 708271SGordon.Ross@Sun.COM if (cd1 == (iconv_t)-1) 718271SGordon.Ross@Sun.COM cd1 = iconv_open("UTF-8", "UCS-2"); 728271SGordon.Ross@Sun.COM 738271SGordon.Ross@Sun.COM return (convert_ucs2xx_to_utf8(cd1, us)); 748271SGordon.Ross@Sun.COM } 758271SGordon.Ross@Sun.COM 768271SGordon.Ross@Sun.COM /* 778271SGordon.Ross@Sun.COM * Convert little-endian Unicode string to UTF-8. 788271SGordon.Ross@Sun.COM * Returns allocated memory. 798271SGordon.Ross@Sun.COM */ 808271SGordon.Ross@Sun.COM char * 818271SGordon.Ross@Sun.COM convert_leunicode_to_utf8(unsigned short *us) 828271SGordon.Ross@Sun.COM { 838271SGordon.Ross@Sun.COM static iconv_t cd2 = (iconv_t)-1; 848271SGordon.Ross@Sun.COM 858271SGordon.Ross@Sun.COM /* Get conversion descriptor (to, from) */ 868271SGordon.Ross@Sun.COM if (cd2 == (iconv_t)-1) 878271SGordon.Ross@Sun.COM cd2 = iconv_open("UTF-8", "UCS-2LE"); 888271SGordon.Ross@Sun.COM 898271SGordon.Ross@Sun.COM return (convert_ucs2xx_to_utf8(cd2, us)); 908271SGordon.Ross@Sun.COM } 918271SGordon.Ross@Sun.COM 928271SGordon.Ross@Sun.COM static char * 938271SGordon.Ross@Sun.COM convert_ucs2xx_to_utf8(iconv_t cd, const uint16_t *us) 948271SGordon.Ross@Sun.COM { 958271SGordon.Ross@Sun.COM char *obuf, *optr; 968271SGordon.Ross@Sun.COM const char *iptr; 978271SGordon.Ross@Sun.COM size_t ileft, obsize, oleft, ret; 988271SGordon.Ross@Sun.COM 998271SGordon.Ross@Sun.COM if (cd == (iconv_t)-1) { 1008271SGordon.Ross@Sun.COM smb_error(dgettext(TEXT_DOMAIN, 1018271SGordon.Ross@Sun.COM "iconv_open(UTF-8/UCS-2)"), -1); 1028271SGordon.Ross@Sun.COM return (NULL); 1038271SGordon.Ross@Sun.COM } 1048271SGordon.Ross@Sun.COM 1058271SGordon.Ross@Sun.COM iptr = (const char *)us; 1068271SGordon.Ross@Sun.COM ileft = unicode_strlen(us); 1078271SGordon.Ross@Sun.COM ileft *= 2; /* now bytes */ 1088271SGordon.Ross@Sun.COM 1098271SGordon.Ross@Sun.COM /* Worst-case output size is 2x input size. */ 1108271SGordon.Ross@Sun.COM oleft = ileft * 2; 1118271SGordon.Ross@Sun.COM obsize = oleft + 2; /* room for null */ 1128271SGordon.Ross@Sun.COM obuf = malloc(obsize); 1138271SGordon.Ross@Sun.COM if (!obuf) 1148271SGordon.Ross@Sun.COM return (NULL); 1158271SGordon.Ross@Sun.COM optr = obuf; 1168271SGordon.Ross@Sun.COM 1178271SGordon.Ross@Sun.COM ret = iconv(cd, &iptr, &ileft, &optr, &oleft); 1188271SGordon.Ross@Sun.COM *optr = '\0'; 1198271SGordon.Ross@Sun.COM if (ret == (size_t)-1) { 1208271SGordon.Ross@Sun.COM smb_error(dgettext(TEXT_DOMAIN, 1218271SGordon.Ross@Sun.COM "iconv(%s) failed"), errno, obuf); 1228271SGordon.Ross@Sun.COM } 1238271SGordon.Ross@Sun.COM if (ileft) { 1248271SGordon.Ross@Sun.COM smb_error(dgettext(TEXT_DOMAIN, 1258271SGordon.Ross@Sun.COM "iconv(%s) failed"), -1, obuf); 1268271SGordon.Ross@Sun.COM /* 1278271SGordon.Ross@Sun.COM * XXX: What's better? return NULL? 1288271SGordon.Ross@Sun.COM * The truncated string? << for now 1298271SGordon.Ross@Sun.COM */ 1308271SGordon.Ross@Sun.COM } 1318271SGordon.Ross@Sun.COM 1328271SGordon.Ross@Sun.COM return (obuf); 1338271SGordon.Ross@Sun.COM } 1348271SGordon.Ross@Sun.COM 1358271SGordon.Ross@Sun.COM static uint16_t *convert_utf8_to_ucs2xx(iconv_t, const char *); 1368271SGordon.Ross@Sun.COM 1378271SGordon.Ross@Sun.COM /* 1388271SGordon.Ross@Sun.COM * Convert UTF-8 string to Unicode. 1398271SGordon.Ross@Sun.COM * Returns allocated memory. 1408271SGordon.Ross@Sun.COM */ 1418271SGordon.Ross@Sun.COM uint16_t * 1428271SGordon.Ross@Sun.COM convert_utf8_to_unicode(const char *utf8_string) 1438271SGordon.Ross@Sun.COM { 1448271SGordon.Ross@Sun.COM static iconv_t cd3 = (iconv_t)-1; 1458271SGordon.Ross@Sun.COM 1468271SGordon.Ross@Sun.COM /* Get conversion descriptor (to, from) */ 1478271SGordon.Ross@Sun.COM if (cd3 == (iconv_t)-1) 1488271SGordon.Ross@Sun.COM cd3 = iconv_open("UCS-2", "UTF-8"); 1498271SGordon.Ross@Sun.COM return (convert_utf8_to_ucs2xx(cd3, utf8_string)); 1508271SGordon.Ross@Sun.COM } 1518271SGordon.Ross@Sun.COM 1528271SGordon.Ross@Sun.COM /* 1538271SGordon.Ross@Sun.COM * Convert UTF-8 string to little-endian Unicode. 1548271SGordon.Ross@Sun.COM * Returns allocated memory. 1558271SGordon.Ross@Sun.COM */ 1568271SGordon.Ross@Sun.COM uint16_t * 1578271SGordon.Ross@Sun.COM convert_utf8_to_leunicode(const char *utf8_string) 1588271SGordon.Ross@Sun.COM { 1598271SGordon.Ross@Sun.COM static iconv_t cd4 = (iconv_t)-1; 1608271SGordon.Ross@Sun.COM 1618271SGordon.Ross@Sun.COM /* Get conversion descriptor (to, from) */ 1628271SGordon.Ross@Sun.COM if (cd4 == (iconv_t)-1) 1638271SGordon.Ross@Sun.COM cd4 = iconv_open("UCS-2LE", "UTF-8"); 1648271SGordon.Ross@Sun.COM return (convert_utf8_to_ucs2xx(cd4, utf8_string)); 1658271SGordon.Ross@Sun.COM } 1668271SGordon.Ross@Sun.COM 1678271SGordon.Ross@Sun.COM static uint16_t * 1688271SGordon.Ross@Sun.COM convert_utf8_to_ucs2xx(iconv_t cd, const char *utf8_string) 1698271SGordon.Ross@Sun.COM { 1708271SGordon.Ross@Sun.COM uint16_t *obuf, *optr; 1718271SGordon.Ross@Sun.COM const char *iptr; 1728271SGordon.Ross@Sun.COM size_t ileft, obsize, oleft, ret; 1738271SGordon.Ross@Sun.COM 1748271SGordon.Ross@Sun.COM if (cd == (iconv_t)-1) { 1758271SGordon.Ross@Sun.COM smb_error(dgettext(TEXT_DOMAIN, 1768271SGordon.Ross@Sun.COM "iconv_open(UCS-2/UTF-8)"), -1); 1778271SGordon.Ross@Sun.COM return (NULL); 1788271SGordon.Ross@Sun.COM } 1798271SGordon.Ross@Sun.COM 1808271SGordon.Ross@Sun.COM iptr = utf8_string; 1818271SGordon.Ross@Sun.COM ileft = strlen(iptr); 1828271SGordon.Ross@Sun.COM 1838271SGordon.Ross@Sun.COM /* Worst-case output size is 2x input size. */ 1848271SGordon.Ross@Sun.COM oleft = ileft * 2; 1858271SGordon.Ross@Sun.COM obsize = oleft + 2; /* room for null */ 1868271SGordon.Ross@Sun.COM obuf = malloc(obsize); 1878271SGordon.Ross@Sun.COM if (!obuf) 1888271SGordon.Ross@Sun.COM return (NULL); 1898271SGordon.Ross@Sun.COM optr = obuf; 1908271SGordon.Ross@Sun.COM 1918271SGordon.Ross@Sun.COM ret = iconv(cd, &iptr, &ileft, (char **)&optr, &oleft); 1928271SGordon.Ross@Sun.COM *optr = '\0'; 1938271SGordon.Ross@Sun.COM if (ret == (size_t)-1) { 1948271SGordon.Ross@Sun.COM smb_error(dgettext(TEXT_DOMAIN, 1958271SGordon.Ross@Sun.COM "iconv(%s) failed"), errno, utf8_string); 1968271SGordon.Ross@Sun.COM } 1978271SGordon.Ross@Sun.COM if (ileft) { 1988271SGordon.Ross@Sun.COM smb_error(dgettext(TEXT_DOMAIN, 1998271SGordon.Ross@Sun.COM "iconv(%s) failed"), -1, utf8_string); 2008271SGordon.Ross@Sun.COM /* 2018271SGordon.Ross@Sun.COM * XXX: What's better? return NULL? 2028271SGordon.Ross@Sun.COM * The truncated string? << for now 2038271SGordon.Ross@Sun.COM */ 2048271SGordon.Ross@Sun.COM } 2058271SGordon.Ross@Sun.COM 2068271SGordon.Ross@Sun.COM return (obuf); 2078271SGordon.Ross@Sun.COM } 208*10023SGordon.Ross@Sun.COM 209*10023SGordon.Ross@Sun.COM 210*10023SGordon.Ross@Sun.COM /* 211*10023SGordon.Ross@Sun.COM * A simple wrapper around u8_textprep_str() that returns the Unicode 212*10023SGordon.Ross@Sun.COM * upper-case version of some string. Returns memory from malloc. 213*10023SGordon.Ross@Sun.COM * Borrowed from idmapd. 214*10023SGordon.Ross@Sun.COM */ 215*10023SGordon.Ross@Sun.COM static char * 216*10023SGordon.Ross@Sun.COM utf8_str_to_upper_or_lower(const char *s, int upper_lower) 217*10023SGordon.Ross@Sun.COM { 218*10023SGordon.Ross@Sun.COM char *res = NULL; 219*10023SGordon.Ross@Sun.COM char *outs; 220*10023SGordon.Ross@Sun.COM size_t inlen, outlen, inbleft, outbleft; 221*10023SGordon.Ross@Sun.COM int rc, err; 222*10023SGordon.Ross@Sun.COM 223*10023SGordon.Ross@Sun.COM /* 224*10023SGordon.Ross@Sun.COM * u8_textprep_str() does not allocate memory. The input and 225*10023SGordon.Ross@Sun.COM * output buffers may differ in size (though that would be more 226*10023SGordon.Ross@Sun.COM * likely when normalization is done). We have to loop over it... 227*10023SGordon.Ross@Sun.COM * 228*10023SGordon.Ross@Sun.COM * To improve the chances that we can avoid looping we add 10 229*10023SGordon.Ross@Sun.COM * bytes of output buffer room the first go around. 230*10023SGordon.Ross@Sun.COM */ 231*10023SGordon.Ross@Sun.COM inlen = inbleft = strlen(s); 232*10023SGordon.Ross@Sun.COM outlen = outbleft = inlen + 10; 233*10023SGordon.Ross@Sun.COM if ((res = malloc(outlen)) == NULL) 234*10023SGordon.Ross@Sun.COM return (NULL); 235*10023SGordon.Ross@Sun.COM outs = res; 236*10023SGordon.Ross@Sun.COM 237*10023SGordon.Ross@Sun.COM while ((rc = u8_textprep_str((char *)s, &inbleft, outs, 238*10023SGordon.Ross@Sun.COM &outbleft, upper_lower, U8_UNICODE_LATEST, &err)) < 0 && 239*10023SGordon.Ross@Sun.COM err == E2BIG) { 240*10023SGordon.Ross@Sun.COM if ((res = realloc(res, outlen + inbleft)) == NULL) 241*10023SGordon.Ross@Sun.COM return (NULL); 242*10023SGordon.Ross@Sun.COM /* adjust input/output buffer pointers */ 243*10023SGordon.Ross@Sun.COM s += (inlen - inbleft); 244*10023SGordon.Ross@Sun.COM outs = res + outlen - outbleft; 245*10023SGordon.Ross@Sun.COM /* adjust outbleft and outlen */ 246*10023SGordon.Ross@Sun.COM outlen += inbleft; 247*10023SGordon.Ross@Sun.COM outbleft += inbleft; 248*10023SGordon.Ross@Sun.COM } 249*10023SGordon.Ross@Sun.COM 250*10023SGordon.Ross@Sun.COM if (rc < 0) { 251*10023SGordon.Ross@Sun.COM free(res); 252*10023SGordon.Ross@Sun.COM res = NULL; 253*10023SGordon.Ross@Sun.COM return (NULL); 254*10023SGordon.Ross@Sun.COM } 255*10023SGordon.Ross@Sun.COM 256*10023SGordon.Ross@Sun.COM res[outlen - outbleft] = '\0'; 257*10023SGordon.Ross@Sun.COM 258*10023SGordon.Ross@Sun.COM return (res); 259*10023SGordon.Ross@Sun.COM } 260*10023SGordon.Ross@Sun.COM 261*10023SGordon.Ross@Sun.COM char * 262*10023SGordon.Ross@Sun.COM utf8_str_toupper(const char *s) 263*10023SGordon.Ross@Sun.COM { 264*10023SGordon.Ross@Sun.COM return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOUPPER)); 265*10023SGordon.Ross@Sun.COM } 266*10023SGordon.Ross@Sun.COM 267*10023SGordon.Ross@Sun.COM char * 268*10023SGordon.Ross@Sun.COM utf8_str_tolower(const char *s) 269*10023SGordon.Ross@Sun.COM { 270*10023SGordon.Ross@Sun.COM return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOLOWER)); 271*10023SGordon.Ross@Sun.COM } 272