/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Unicode conversions (yet more)
 */

#include <errno.h>
#include <iconv.h>
#include <libintl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sys/u8_textprep.h>

#include <netsmb/smb_lib.h>
#include "charsets.h"


/*
 * Count the 16-bit units in a Unicode string, not including the
 * 2-byte null terminator.  (Multiply by two for the storage size.)
 *
 * A NULL pointer is treated as an empty string and yields 0.
 */
size_t
unicode_strlen(const uint16_t *us)
{
	const uint16_t *p;

	if (us == NULL)
		return (0);

	/* Walk to the terminating zero unit; the distance is the length. */
	for (p = us; *p != 0; p++)
		;

	return ((size_t)(p - us));
}

static char *convert_ucs2xx_to_utf8(iconv_t, const uint16_t *);

/*
 * Convert a (native) Unicode string to UTF-8.
 *
 * us is terminated by a 16-bit zero.  The result is allocated memory
 * that the caller must free(); NULL is returned on failure.
 *
 * The "UCS-2" to "UTF-8" conversion descriptor is opened on first use
 * and cached in a function-local static (no locking is done here).  If
 * iconv_open() fails the descriptor stays (iconv_t)-1, the helper
 * reports the error, and the open is retried on the next call.
 */
char *
convert_unicode_to_utf8(uint16_t *us)
{
	static iconv_t cd1 = (iconv_t)-1;

	/* Get conversion descriptor (to, from) */
	if (cd1 == (iconv_t)-1)
		cd1 = iconv_open("UTF-8", "UCS-2");

	return (convert_ucs2xx_to_utf8(cd1, us));
}

/*
 * Convert a little-endian Unicode string to UTF-8.
 *
 * us is terminated by a 16-bit zero.  The result is allocated memory
 * that the caller must free(); NULL is returned on failure.
 *
 * The "UCS-2LE" to "UTF-8" conversion descriptor is opened on first use
 * and cached in a function-local static (no locking is done here).  If
 * iconv_open() fails the descriptor stays (iconv_t)-1, the helper
 * reports the error, and the open is retried on the next call.
 */
char *
convert_leunicode_to_utf8(unsigned short *us)
{
	static iconv_t cd2 = (iconv_t)-1;

	/* Get conversion descriptor (to, from) */
	if (cd2 == (iconv_t)-1)
		cd2 = iconv_open("UTF-8", "UCS-2LE");

	return (convert_ucs2xx_to_utf8(cd2, us));
}

928271SGordon.Ross@Sun.COM static char *
convert_ucs2xx_to_utf8(iconv_t cd,const uint16_t * us)938271SGordon.Ross@Sun.COM convert_ucs2xx_to_utf8(iconv_t cd, const uint16_t *us)
948271SGordon.Ross@Sun.COM {
958271SGordon.Ross@Sun.COM char *obuf, *optr;
968271SGordon.Ross@Sun.COM const char *iptr;
978271SGordon.Ross@Sun.COM size_t ileft, obsize, oleft, ret;
988271SGordon.Ross@Sun.COM
998271SGordon.Ross@Sun.COM if (cd == (iconv_t)-1) {
1008271SGordon.Ross@Sun.COM smb_error(dgettext(TEXT_DOMAIN,
1018271SGordon.Ross@Sun.COM "iconv_open(UTF-8/UCS-2)"), -1);
1028271SGordon.Ross@Sun.COM return (NULL);
1038271SGordon.Ross@Sun.COM }
1048271SGordon.Ross@Sun.COM
1058271SGordon.Ross@Sun.COM iptr = (const char *)us;
1068271SGordon.Ross@Sun.COM ileft = unicode_strlen(us);
1078271SGordon.Ross@Sun.COM ileft *= 2; /* now bytes */
1088271SGordon.Ross@Sun.COM
1098271SGordon.Ross@Sun.COM /* Worst-case output size is 2x input size. */
1108271SGordon.Ross@Sun.COM oleft = ileft * 2;
1118271SGordon.Ross@Sun.COM obsize = oleft + 2; /* room for null */
1128271SGordon.Ross@Sun.COM obuf = malloc(obsize);
1138271SGordon.Ross@Sun.COM if (!obuf)
1148271SGordon.Ross@Sun.COM return (NULL);
1158271SGordon.Ross@Sun.COM optr = obuf;
1168271SGordon.Ross@Sun.COM
1178271SGordon.Ross@Sun.COM ret = iconv(cd, &iptr, &ileft, &optr, &oleft);
1188271SGordon.Ross@Sun.COM *optr = '\0';
1198271SGordon.Ross@Sun.COM if (ret == (size_t)-1) {
1208271SGordon.Ross@Sun.COM smb_error(dgettext(TEXT_DOMAIN,
1218271SGordon.Ross@Sun.COM "iconv(%s) failed"), errno, obuf);
1228271SGordon.Ross@Sun.COM }
1238271SGordon.Ross@Sun.COM if (ileft) {
1248271SGordon.Ross@Sun.COM smb_error(dgettext(TEXT_DOMAIN,
1258271SGordon.Ross@Sun.COM "iconv(%s) failed"), -1, obuf);
1268271SGordon.Ross@Sun.COM /*
1278271SGordon.Ross@Sun.COM * XXX: What's better? return NULL?
1288271SGordon.Ross@Sun.COM * The truncated string? << for now
1298271SGordon.Ross@Sun.COM */
1308271SGordon.Ross@Sun.COM }
1318271SGordon.Ross@Sun.COM
1328271SGordon.Ross@Sun.COM return (obuf);
1338271SGordon.Ross@Sun.COM }
1348271SGordon.Ross@Sun.COM
static uint16_t *convert_utf8_to_ucs2xx(iconv_t, const char *);

/*
 * Convert a UTF-8 string to Unicode.
 *
 * The result is allocated memory that the caller must free(); NULL is
 * returned on failure.
 *
 * The "UTF-8" to "UCS-2" conversion descriptor is opened on first use
 * and cached in a function-local static (no locking is done here).  If
 * iconv_open() fails the descriptor stays (iconv_t)-1, the helper
 * reports the error, and the open is retried on the next call.
 */
uint16_t *
convert_utf8_to_unicode(const char *utf8_string)
{
	static iconv_t cd3 = (iconv_t)-1;

	/* Get conversion descriptor (to, from) */
	if (cd3 == (iconv_t)-1)
		cd3 = iconv_open("UCS-2", "UTF-8");

	return (convert_utf8_to_ucs2xx(cd3, utf8_string));
}

/*
 * Convert a UTF-8 string to little-endian Unicode.
 *
 * The result is allocated memory that the caller must free(); NULL is
 * returned on failure.
 *
 * The "UTF-8" to "UCS-2LE" conversion descriptor is opened on first use
 * and cached in a function-local static (no locking is done here).  If
 * iconv_open() fails the descriptor stays (iconv_t)-1, the helper
 * reports the error, and the open is retried on the next call.
 */
uint16_t *
convert_utf8_to_leunicode(const char *utf8_string)
{
	static iconv_t cd4 = (iconv_t)-1;

	/* Get conversion descriptor (to, from) */
	if (cd4 == (iconv_t)-1)
		cd4 = iconv_open("UCS-2LE", "UTF-8");

	return (convert_utf8_to_ucs2xx(cd4, utf8_string));
}

1678271SGordon.Ross@Sun.COM static uint16_t *
convert_utf8_to_ucs2xx(iconv_t cd,const char * utf8_string)1688271SGordon.Ross@Sun.COM convert_utf8_to_ucs2xx(iconv_t cd, const char *utf8_string)
1698271SGordon.Ross@Sun.COM {
1708271SGordon.Ross@Sun.COM uint16_t *obuf, *optr;
1718271SGordon.Ross@Sun.COM const char *iptr;
1728271SGordon.Ross@Sun.COM size_t ileft, obsize, oleft, ret;
1738271SGordon.Ross@Sun.COM
1748271SGordon.Ross@Sun.COM if (cd == (iconv_t)-1) {
1758271SGordon.Ross@Sun.COM smb_error(dgettext(TEXT_DOMAIN,
1768271SGordon.Ross@Sun.COM "iconv_open(UCS-2/UTF-8)"), -1);
1778271SGordon.Ross@Sun.COM return (NULL);
1788271SGordon.Ross@Sun.COM }
1798271SGordon.Ross@Sun.COM
1808271SGordon.Ross@Sun.COM iptr = utf8_string;
1818271SGordon.Ross@Sun.COM ileft = strlen(iptr);
1828271SGordon.Ross@Sun.COM
1838271SGordon.Ross@Sun.COM /* Worst-case output size is 2x input size. */
1848271SGordon.Ross@Sun.COM oleft = ileft * 2;
1858271SGordon.Ross@Sun.COM obsize = oleft + 2; /* room for null */
1868271SGordon.Ross@Sun.COM obuf = malloc(obsize);
1878271SGordon.Ross@Sun.COM if (!obuf)
1888271SGordon.Ross@Sun.COM return (NULL);
1898271SGordon.Ross@Sun.COM optr = obuf;
1908271SGordon.Ross@Sun.COM
1918271SGordon.Ross@Sun.COM ret = iconv(cd, &iptr, &ileft, (char **)&optr, &oleft);
1928271SGordon.Ross@Sun.COM *optr = '\0';
1938271SGordon.Ross@Sun.COM if (ret == (size_t)-1) {
1948271SGordon.Ross@Sun.COM smb_error(dgettext(TEXT_DOMAIN,
1958271SGordon.Ross@Sun.COM "iconv(%s) failed"), errno, utf8_string);
1968271SGordon.Ross@Sun.COM }
1978271SGordon.Ross@Sun.COM if (ileft) {
1988271SGordon.Ross@Sun.COM smb_error(dgettext(TEXT_DOMAIN,
1998271SGordon.Ross@Sun.COM "iconv(%s) failed"), -1, utf8_string);
2008271SGordon.Ross@Sun.COM /*
2018271SGordon.Ross@Sun.COM * XXX: What's better? return NULL?
2028271SGordon.Ross@Sun.COM * The truncated string? << for now
2038271SGordon.Ross@Sun.COM */
2048271SGordon.Ross@Sun.COM }
2058271SGordon.Ross@Sun.COM
2068271SGordon.Ross@Sun.COM return (obuf);
2078271SGordon.Ross@Sun.COM }
208*10023SGordon.Ross@Sun.COM
209*10023SGordon.Ross@Sun.COM
210*10023SGordon.Ross@Sun.COM /*
211*10023SGordon.Ross@Sun.COM * A simple wrapper around u8_textprep_str() that returns the Unicode
212*10023SGordon.Ross@Sun.COM * upper-case version of some string. Returns memory from malloc.
213*10023SGordon.Ross@Sun.COM * Borrowed from idmapd.
214*10023SGordon.Ross@Sun.COM */
215*10023SGordon.Ross@Sun.COM static char *
utf8_str_to_upper_or_lower(const char * s,int upper_lower)216*10023SGordon.Ross@Sun.COM utf8_str_to_upper_or_lower(const char *s, int upper_lower)
217*10023SGordon.Ross@Sun.COM {
218*10023SGordon.Ross@Sun.COM char *res = NULL;
219*10023SGordon.Ross@Sun.COM char *outs;
220*10023SGordon.Ross@Sun.COM size_t inlen, outlen, inbleft, outbleft;
221*10023SGordon.Ross@Sun.COM int rc, err;
222*10023SGordon.Ross@Sun.COM
223*10023SGordon.Ross@Sun.COM /*
224*10023SGordon.Ross@Sun.COM * u8_textprep_str() does not allocate memory. The input and
225*10023SGordon.Ross@Sun.COM * output buffers may differ in size (though that would be more
226*10023SGordon.Ross@Sun.COM * likely when normalization is done). We have to loop over it...
227*10023SGordon.Ross@Sun.COM *
228*10023SGordon.Ross@Sun.COM * To improve the chances that we can avoid looping we add 10
229*10023SGordon.Ross@Sun.COM * bytes of output buffer room the first go around.
230*10023SGordon.Ross@Sun.COM */
231*10023SGordon.Ross@Sun.COM inlen = inbleft = strlen(s);
232*10023SGordon.Ross@Sun.COM outlen = outbleft = inlen + 10;
233*10023SGordon.Ross@Sun.COM if ((res = malloc(outlen)) == NULL)
234*10023SGordon.Ross@Sun.COM return (NULL);
235*10023SGordon.Ross@Sun.COM outs = res;
236*10023SGordon.Ross@Sun.COM
237*10023SGordon.Ross@Sun.COM while ((rc = u8_textprep_str((char *)s, &inbleft, outs,
238*10023SGordon.Ross@Sun.COM &outbleft, upper_lower, U8_UNICODE_LATEST, &err)) < 0 &&
239*10023SGordon.Ross@Sun.COM err == E2BIG) {
240*10023SGordon.Ross@Sun.COM if ((res = realloc(res, outlen + inbleft)) == NULL)
241*10023SGordon.Ross@Sun.COM return (NULL);
242*10023SGordon.Ross@Sun.COM /* adjust input/output buffer pointers */
243*10023SGordon.Ross@Sun.COM s += (inlen - inbleft);
244*10023SGordon.Ross@Sun.COM outs = res + outlen - outbleft;
245*10023SGordon.Ross@Sun.COM /* adjust outbleft and outlen */
246*10023SGordon.Ross@Sun.COM outlen += inbleft;
247*10023SGordon.Ross@Sun.COM outbleft += inbleft;
248*10023SGordon.Ross@Sun.COM }
249*10023SGordon.Ross@Sun.COM
250*10023SGordon.Ross@Sun.COM if (rc < 0) {
251*10023SGordon.Ross@Sun.COM free(res);
252*10023SGordon.Ross@Sun.COM res = NULL;
253*10023SGordon.Ross@Sun.COM return (NULL);
254*10023SGordon.Ross@Sun.COM }
255*10023SGordon.Ross@Sun.COM
256*10023SGordon.Ross@Sun.COM res[outlen - outbleft] = '\0';
257*10023SGordon.Ross@Sun.COM
258*10023SGordon.Ross@Sun.COM return (res);
259*10023SGordon.Ross@Sun.COM }
260*10023SGordon.Ross@Sun.COM
261*10023SGordon.Ross@Sun.COM char *
utf8_str_toupper(const char * s)262*10023SGordon.Ross@Sun.COM utf8_str_toupper(const char *s)
263*10023SGordon.Ross@Sun.COM {
264*10023SGordon.Ross@Sun.COM return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOUPPER));
265*10023SGordon.Ross@Sun.COM }
266*10023SGordon.Ross@Sun.COM
267*10023SGordon.Ross@Sun.COM char *
utf8_str_tolower(const char * s)268*10023SGordon.Ross@Sun.COM utf8_str_tolower(const char *s)
269*10023SGordon.Ross@Sun.COM {
270*10023SGordon.Ross@Sun.COM return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOLOWER));
271*10023SGordon.Ross@Sun.COM }
272