1*8271SGordon.Ross@Sun.COM /*
2*8271SGordon.Ross@Sun.COM  * CDDL HEADER START
3*8271SGordon.Ross@Sun.COM  *
4*8271SGordon.Ross@Sun.COM  * The contents of this file are subject to the terms of the
5*8271SGordon.Ross@Sun.COM  * Common Development and Distribution License (the "License").
6*8271SGordon.Ross@Sun.COM  * You may not use this file except in compliance with the License.
7*8271SGordon.Ross@Sun.COM  *
8*8271SGordon.Ross@Sun.COM  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*8271SGordon.Ross@Sun.COM  * or http://www.opensolaris.org/os/licensing.
10*8271SGordon.Ross@Sun.COM  * See the License for the specific language governing permissions
11*8271SGordon.Ross@Sun.COM  * and limitations under the License.
12*8271SGordon.Ross@Sun.COM  *
13*8271SGordon.Ross@Sun.COM  * When distributing Covered Code, include this CDDL HEADER in each
14*8271SGordon.Ross@Sun.COM  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*8271SGordon.Ross@Sun.COM  * If applicable, add the following below this CDDL HEADER, with the
16*8271SGordon.Ross@Sun.COM  * fields enclosed by brackets "[]" replaced with your own identifying
17*8271SGordon.Ross@Sun.COM  * information: Portions Copyright [yyyy] [name of copyright owner]
18*8271SGordon.Ross@Sun.COM  *
19*8271SGordon.Ross@Sun.COM  * CDDL HEADER END
20*8271SGordon.Ross@Sun.COM  */
21*8271SGordon.Ross@Sun.COM 
22*8271SGordon.Ross@Sun.COM /*
23*8271SGordon.Ross@Sun.COM  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24*8271SGordon.Ross@Sun.COM  * Use is subject to license terms.
25*8271SGordon.Ross@Sun.COM  */
26*8271SGordon.Ross@Sun.COM 
27*8271SGordon.Ross@Sun.COM /*
28*8271SGordon.Ross@Sun.COM  * Unicode conversions (yet more)
29*8271SGordon.Ross@Sun.COM  */
30*8271SGordon.Ross@Sun.COM 
31*8271SGordon.Ross@Sun.COM #include <stdio.h>
32*8271SGordon.Ross@Sun.COM #include <stdlib.h>
33*8271SGordon.Ross@Sun.COM #include <string.h>
34*8271SGordon.Ross@Sun.COM #include <errno.h>
35*8271SGordon.Ross@Sun.COM #include <iconv.h>
36*8271SGordon.Ross@Sun.COM #include <libintl.h>
37*8271SGordon.Ross@Sun.COM 
38*8271SGordon.Ross@Sun.COM #include <sys/u8_textprep.h>
39*8271SGordon.Ross@Sun.COM 
40*8271SGordon.Ross@Sun.COM #include <netsmb/smb_lib.h>
41*8271SGordon.Ross@Sun.COM #include "charsets.h"
42*8271SGordon.Ross@Sun.COM 
43*8271SGordon.Ross@Sun.COM 
/*
 * Count the 16-bit units in a zero-terminated Unicode string.
 * The terminating zero unit is not included in the count.
 * The storage size in bytes (without the terminator) is
 * twice the returned value.
 */
size_t
unicode_strlen(const uint16_t *us)
{
	const uint16_t *end;

	/* Walk forward to the terminating zero unit. */
	for (end = us; *end != 0; end++)
		;

	return ((size_t)(end - us));
}
57*8271SGordon.Ross@Sun.COM 
58*8271SGordon.Ross@Sun.COM static char *convert_ucs2xx_to_utf8(iconv_t, const uint16_t *);
59*8271SGordon.Ross@Sun.COM 
/*
 * Convert a zero-terminated Unicode string in "UCS-2" form
 * (the native form, per the original comment) to UTF-8.
 *
 * The iconv descriptor is opened on first use and kept in a
 * function-local static for later calls.  If iconv_open() fails,
 * the descriptor stays (iconv_t)-1 and the open is tried again
 * on the next call.
 *
 * Returns whatever convert_ucs2xx_to_utf8() returns:
 * allocated memory, or NULL on failure.
 */
char *
convert_unicode_to_utf8(uint16_t *us)
{
	static iconv_t native_cd = (iconv_t)-1;

	/* Lazily create the descriptor; arguments are (to, from). */
	if (native_cd == (iconv_t)-1) {
		native_cd = iconv_open("UTF-8", "UCS-2");
	}

	return (convert_ucs2xx_to_utf8(native_cd, us));
}
75*8271SGordon.Ross@Sun.COM 
/*
 * Convert a zero-terminated little-endian Unicode ("UCS-2LE")
 * string to UTF-8.
 *
 * The iconv descriptor is opened on first use and kept in a
 * function-local static for later calls.  If iconv_open() fails,
 * the descriptor stays (iconv_t)-1 and the open is tried again
 * on the next call.
 *
 * Returns whatever convert_ucs2xx_to_utf8() returns:
 * allocated memory, or NULL on failure.
 */
char *
convert_leunicode_to_utf8(unsigned short *us)
{
	static iconv_t le_cd = (iconv_t)-1;

	/* Lazily create the descriptor; arguments are (to, from). */
	if (le_cd == (iconv_t)-1) {
		le_cd = iconv_open("UTF-8", "UCS-2LE");
	}

	return (convert_ucs2xx_to_utf8(le_cd, us));
}
91*8271SGordon.Ross@Sun.COM 
92*8271SGordon.Ross@Sun.COM static char *
93*8271SGordon.Ross@Sun.COM convert_ucs2xx_to_utf8(iconv_t cd, const uint16_t *us)
94*8271SGordon.Ross@Sun.COM {
95*8271SGordon.Ross@Sun.COM 	char *obuf, *optr;
96*8271SGordon.Ross@Sun.COM 	const char *iptr;
97*8271SGordon.Ross@Sun.COM 	size_t  ileft, obsize, oleft, ret;
98*8271SGordon.Ross@Sun.COM 
99*8271SGordon.Ross@Sun.COM 	if (cd == (iconv_t)-1) {
100*8271SGordon.Ross@Sun.COM 		smb_error(dgettext(TEXT_DOMAIN,
101*8271SGordon.Ross@Sun.COM 		    "iconv_open(UTF-8/UCS-2)"), -1);
102*8271SGordon.Ross@Sun.COM 		return (NULL);
103*8271SGordon.Ross@Sun.COM 	}
104*8271SGordon.Ross@Sun.COM 
105*8271SGordon.Ross@Sun.COM 	iptr = (const char *)us;
106*8271SGordon.Ross@Sun.COM 	ileft = unicode_strlen(us);
107*8271SGordon.Ross@Sun.COM 	ileft *= 2; /* now bytes */
108*8271SGordon.Ross@Sun.COM 
109*8271SGordon.Ross@Sun.COM 	/* Worst-case output size is 2x input size. */
110*8271SGordon.Ross@Sun.COM 	oleft = ileft * 2;
111*8271SGordon.Ross@Sun.COM 	obsize = oleft + 2; /* room for null */
112*8271SGordon.Ross@Sun.COM 	obuf = malloc(obsize);
113*8271SGordon.Ross@Sun.COM 	if (!obuf)
114*8271SGordon.Ross@Sun.COM 		return (NULL);
115*8271SGordon.Ross@Sun.COM 	optr = obuf;
116*8271SGordon.Ross@Sun.COM 
117*8271SGordon.Ross@Sun.COM 	ret = iconv(cd, &iptr, &ileft, &optr, &oleft);
118*8271SGordon.Ross@Sun.COM 	*optr = '\0';
119*8271SGordon.Ross@Sun.COM 	if (ret == (size_t)-1) {
120*8271SGordon.Ross@Sun.COM 		smb_error(dgettext(TEXT_DOMAIN,
121*8271SGordon.Ross@Sun.COM 		    "iconv(%s) failed"), errno, obuf);
122*8271SGordon.Ross@Sun.COM 	}
123*8271SGordon.Ross@Sun.COM 	if (ileft) {
124*8271SGordon.Ross@Sun.COM 		smb_error(dgettext(TEXT_DOMAIN,
125*8271SGordon.Ross@Sun.COM 		    "iconv(%s) failed"), -1, obuf);
126*8271SGordon.Ross@Sun.COM 		/*
127*8271SGordon.Ross@Sun.COM 		 * XXX: What's better?  return NULL?
128*8271SGordon.Ross@Sun.COM 		 * The truncated string? << for now
129*8271SGordon.Ross@Sun.COM 		 */
130*8271SGordon.Ross@Sun.COM 	}
131*8271SGordon.Ross@Sun.COM 
132*8271SGordon.Ross@Sun.COM 	return (obuf);
133*8271SGordon.Ross@Sun.COM }
134*8271SGordon.Ross@Sun.COM 
135*8271SGordon.Ross@Sun.COM static uint16_t *convert_utf8_to_ucs2xx(iconv_t, const char *);
136*8271SGordon.Ross@Sun.COM 
/*
 * Convert a NUL-terminated UTF-8 string to Unicode in "UCS-2" form.
 *
 * The iconv descriptor is opened on first use and kept in a
 * function-local static for later calls.  If iconv_open() fails,
 * the descriptor stays (iconv_t)-1 and the open is tried again
 * on the next call.
 *
 * Returns whatever convert_utf8_to_ucs2xx() returns:
 * allocated memory, or NULL on failure.
 */
uint16_t *
convert_utf8_to_unicode(const char *utf8_string)
{
	static iconv_t native_cd = (iconv_t)-1;

	/* Lazily create the descriptor; arguments are (to, from). */
	if (native_cd == (iconv_t)-1) {
		native_cd = iconv_open("UCS-2", "UTF-8");
	}

	return (convert_utf8_to_ucs2xx(native_cd, utf8_string));
}
151*8271SGordon.Ross@Sun.COM 
/*
 * Convert a NUL-terminated UTF-8 string to little-endian
 * Unicode ("UCS-2LE").
 *
 * The iconv descriptor is opened on first use and kept in a
 * function-local static for later calls.  If iconv_open() fails,
 * the descriptor stays (iconv_t)-1 and the open is tried again
 * on the next call.
 *
 * Returns whatever convert_utf8_to_ucs2xx() returns:
 * allocated memory, or NULL on failure.
 */
uint16_t *
convert_utf8_to_leunicode(const char *utf8_string)
{
	static iconv_t le_cd = (iconv_t)-1;

	/* Lazily create the descriptor; arguments are (to, from). */
	if (le_cd == (iconv_t)-1) {
		le_cd = iconv_open("UCS-2LE", "UTF-8");
	}

	return (convert_utf8_to_ucs2xx(le_cd, utf8_string));
}
166*8271SGordon.Ross@Sun.COM 
167*8271SGordon.Ross@Sun.COM static uint16_t *
168*8271SGordon.Ross@Sun.COM convert_utf8_to_ucs2xx(iconv_t cd, const char *utf8_string)
169*8271SGordon.Ross@Sun.COM {
170*8271SGordon.Ross@Sun.COM 	uint16_t *obuf, *optr;
171*8271SGordon.Ross@Sun.COM 	const char *iptr;
172*8271SGordon.Ross@Sun.COM 	size_t  ileft, obsize, oleft, ret;
173*8271SGordon.Ross@Sun.COM 
174*8271SGordon.Ross@Sun.COM 	if (cd == (iconv_t)-1) {
175*8271SGordon.Ross@Sun.COM 		smb_error(dgettext(TEXT_DOMAIN,
176*8271SGordon.Ross@Sun.COM 		    "iconv_open(UCS-2/UTF-8)"), -1);
177*8271SGordon.Ross@Sun.COM 		return (NULL);
178*8271SGordon.Ross@Sun.COM 	}
179*8271SGordon.Ross@Sun.COM 
180*8271SGordon.Ross@Sun.COM 	iptr = utf8_string;
181*8271SGordon.Ross@Sun.COM 	ileft = strlen(iptr);
182*8271SGordon.Ross@Sun.COM 
183*8271SGordon.Ross@Sun.COM 	/* Worst-case output size is 2x input size. */
184*8271SGordon.Ross@Sun.COM 	oleft = ileft * 2;
185*8271SGordon.Ross@Sun.COM 	obsize = oleft + 2; /* room for null */
186*8271SGordon.Ross@Sun.COM 	obuf = malloc(obsize);
187*8271SGordon.Ross@Sun.COM 	if (!obuf)
188*8271SGordon.Ross@Sun.COM 		return (NULL);
189*8271SGordon.Ross@Sun.COM 	optr = obuf;
190*8271SGordon.Ross@Sun.COM 
191*8271SGordon.Ross@Sun.COM 	ret = iconv(cd, &iptr, &ileft, (char **)&optr, &oleft);
192*8271SGordon.Ross@Sun.COM 	*optr = '\0';
193*8271SGordon.Ross@Sun.COM 	if (ret == (size_t)-1) {
194*8271SGordon.Ross@Sun.COM 		smb_error(dgettext(TEXT_DOMAIN,
195*8271SGordon.Ross@Sun.COM 		    "iconv(%s) failed"), errno, utf8_string);
196*8271SGordon.Ross@Sun.COM 	}
197*8271SGordon.Ross@Sun.COM 	if (ileft) {
198*8271SGordon.Ross@Sun.COM 		smb_error(dgettext(TEXT_DOMAIN,
199*8271SGordon.Ross@Sun.COM 		    "iconv(%s) failed"), -1, utf8_string);
200*8271SGordon.Ross@Sun.COM 		/*
201*8271SGordon.Ross@Sun.COM 		 * XXX: What's better?  return NULL?
202*8271SGordon.Ross@Sun.COM 		 * The truncated string? << for now
203*8271SGordon.Ross@Sun.COM 		 */
204*8271SGordon.Ross@Sun.COM 	}
205*8271SGordon.Ross@Sun.COM 
206*8271SGordon.Ross@Sun.COM 	return (obuf);
207*8271SGordon.Ross@Sun.COM }
208