1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Multibyte/wide-char conversion routines. Wide-char encoding provides
28 * a fixed size character encoding that maps to the Unicode 16-bit
29 * (UCS-2) character set standard. Multibyte or UCS transformation
30 * format (UTF) encoding is a variable length character encoding scheme
 * that is compatible with existing ASCII characters and guarantees that
32 * the resultant strings do not contain embedded null characters. Both
33 * types of encoding provide a null terminator: single byte for UTF-8
34 * and a wide-char null for Unicode. See RFC 2044.
35 *
36 * The table below illustrates the UTF-8 encoding scheme. The letter x
37 * indicates bits available for encoding the character value.
38 *
39 * UCS-2 UTF-8 octet sequence (binary)
40 * 0x0000-0x007F 0xxxxxxx
41 * 0x0080-0x07FF 110xxxxx 10xxxxxx
42 * 0x0800-0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
43 *
44 * RFC 2044
 * UTF-8, a transformation format of UNICODE and ISO 10646
46 * F. Yergeau
47 * Alis Technologies
48 * October 1996
49 */
50
51 #ifdef _KERNEL
52 #include <sys/types.h>
53 #include <sys/sunddi.h>
54 #else
55 #include <stdio.h>
56 #include <stdlib.h>
57 #include <assert.h>
58 #include <strings.h>
59 #endif
60 #include <smbsrv/string.h>
61
62
63 /*
64 * mbstowcs
65 *
66 * The mbstowcs() function converts a multibyte character string
67 * mbstring into a wide character string wcstring. No more than
68 * nwchars wide characters are stored. A terminating null wide
69 * character is appended if there is room.
70 *
71 * Returns the number of wide characters converted, not counting
72 * any terminating null wide character. Returns -1 if an invalid
73 * multibyte character is encountered.
74 */
75 size_t
smb_mbstowcs(smb_wchar_t * wcstring,const char * mbstring,size_t nwchars)76 smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars)
77 {
78 int len;
79 smb_wchar_t *start = wcstring;
80
81 while (nwchars--) {
82 len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
83 if (len < 0) {
84 *wcstring = 0;
85 return ((size_t)-1);
86 }
87
88 if (*mbstring == 0)
89 break;
90
91 ++wcstring;
92 mbstring += len;
93 }
94
95 return (wcstring - start);
96 }
97
98
99 /*
100 * mbtowc
101 *
102 * The mbtowc() function converts a multibyte character mbchar into
103 * a wide character and stores the result in the object pointed to
104 * by wcharp. Up to nbytes bytes are examined.
105 *
106 * If mbchar is NULL, mbtowc() returns zero to indicate that shift
107 * states are not supported. Shift states are used to switch between
108 * representation modes using reserved bytes to signal shifting
109 * without them being interpreted as characters. If mbchar is null
110 * mbtowc should return non-zero if the current locale requires shift
111 * states. Otherwise it should be return 0.
112 *
113 * If mbchar is non-null, returns the number of bytes processed in
114 * mbchar. If mbchar is invalid, returns -1.
115 */
116 int /*ARGSUSED*/
smb_mbtowc(smb_wchar_t * wcharp,const char * mbchar,size_t nbytes)117 smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes)
118 {
119 unsigned char mbyte;
120 smb_wchar_t wide_char;
121 int count;
122 int bytes_left;
123
124 if (mbchar == NULL)
125 return (0); /* no shift states */
126
127 /* 0xxxxxxx -> 1 byte ASCII encoding */
128 if (((mbyte = *mbchar++) & 0x80) == 0) {
129 if (wcharp)
130 *wcharp = (smb_wchar_t)mbyte;
131
132 return (mbyte ? 1 : 0);
133 }
134
135 /* 10xxxxxx -> invalid first byte */
136 if ((mbyte & 0x40) == 0)
137 return (-1);
138
139 wide_char = mbyte;
140 if ((mbyte & 0x20) == 0) {
141 wide_char &= 0x1f;
142 bytes_left = 1;
143 } else if ((mbyte & 0x10) == 0) {
144 wide_char &= 0x0f;
145 bytes_left = 2;
146 } else {
147 return (-1);
148 }
149
150 count = 1;
151 while (bytes_left--) {
152 if (((mbyte = *mbchar++) & 0xc0) != 0x80)
153 return (-1);
154
155 count++;
156 wide_char = (wide_char << 6) | (mbyte & 0x3f);
157 }
158
159 if (wcharp)
160 *wcharp = wide_char;
161
162 return (count);
163 }
164
165
166 /*
167 * wctomb
168 *
169 * The wctomb() function converts a wide character wchar into a multibyte
170 * character and stores the result in mbchar. The object pointed to by
171 * mbchar must be large enough to accommodate the multibyte character.
172 *
173 * Returns the numberof bytes written to mbchar.
174 */
175 int
smb_wctomb(char * mbchar,smb_wchar_t wchar)176 smb_wctomb(char *mbchar, smb_wchar_t wchar)
177 {
178 if ((wchar & ~0x7f) == 0) {
179 *mbchar = (char)wchar;
180 return (1);
181 }
182
183 if ((wchar & ~0x7ff) == 0) {
184 *mbchar++ = (wchar >> 6) | 0xc0;
185 *mbchar = (wchar & 0x3f) | 0x80;
186 return (2);
187 }
188
189 *mbchar++ = (wchar >> 12) | 0xe0;
190 *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
191 *mbchar = (wchar & 0x3f) | 0x80;
192 return (3);
193 }
194
195
196 /*
197 * wcstombs
198 *
199 * The wcstombs() function converts a wide character string wcstring
200 * into a multibyte character string mbstring. Up to nbytes bytes are
201 * stored in mbstring. Partial multibyte characters at the end of the
202 * string are not stored. The multibyte character string is null
203 * terminated if there is room.
204 *
205 * Returns the number of bytes converted, not counting the terminating
206 * null byte.
207 */
208 size_t
smb_wcstombs(char * mbstring,const smb_wchar_t * wcstring,size_t nbytes)209 smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes)
210 {
211 char *start = mbstring;
212 const smb_wchar_t *wcp = wcstring;
213 smb_wchar_t wide_char;
214 char buf[4];
215 size_t len;
216
217 if ((mbstring == NULL) || (wcstring == NULL))
218 return (0);
219
220 while (nbytes > MTS_MB_CHAR_MAX) {
221 wide_char = *wcp++;
222 len = smb_wctomb(mbstring, wide_char);
223
224 if (wide_char == 0)
225 /*LINTED E_PTRDIFF_OVERFLOW*/
226 return (mbstring - start);
227
228 mbstring += len;
229 nbytes -= len;
230 }
231
232 while (wide_char && nbytes) {
233 wide_char = *wcp++;
234 if ((len = smb_wctomb(buf, wide_char)) > nbytes) {
235 *mbstring = 0;
236 break;
237 }
238
239 bcopy(buf, mbstring, len);
240 mbstring += len;
241 nbytes -= len;
242 }
243
244 /*LINTED E_PTRDIFF_OVERFLOW*/
245 return (mbstring - start);
246 }
247
248
249 /*
250 * Returns the number of bytes that would be written if the multi-
251 * byte string mbs was converted to a wide character string, not
252 * counting the terminating null wide character.
253 */
254 size_t
smb_wcequiv_strlen(const char * mbs)255 smb_wcequiv_strlen(const char *mbs)
256 {
257 smb_wchar_t wide_char;
258 size_t bytes;
259 size_t len = 0;
260
261 while (*mbs) {
262 bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
263 if (bytes == ((size_t)-1))
264 return ((size_t)-1);
265
266 len += sizeof (smb_wchar_t);
267 mbs += bytes;
268 }
269
270 return (len);
271 }
272
273
274 /*
275 * Returns the number of bytes that would be written if the multi-
276 * byte string mbs was converted to a single byte character string,
277 * not counting the terminating null character.
278 */
279 size_t
smb_sbequiv_strlen(const char * mbs)280 smb_sbequiv_strlen(const char *mbs)
281 {
282 smb_wchar_t wide_char;
283 size_t nbytes;
284 size_t len = 0;
285
286 while (*mbs) {
287 nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
288 if (nbytes == ((size_t)-1))
289 return ((size_t)-1);
290
291 if (wide_char & 0xFF00)
292 len += sizeof (smb_wchar_t);
293 else
294 ++len;
295
296 mbs += nbytes;
297 }
298
299 return (len);
300 }
301
302
303 /*
304 * stombs
305 *
306 * Convert a regular null terminated string 'string' to a UTF-8 encoded
307 * null terminated multi-byte string 'mbstring'. Only full converted
308 * UTF-8 characters will be written 'mbstring'. If a character will not
309 * fit within the remaining buffer space or 'mbstring' will overflow
310 * max_mblen, the conversion process will be terminated and 'mbstring'
311 * will be null terminated.
312 *
313 * Returns the number of bytes written to 'mbstring', excluding the
314 * terminating null character.
315 *
316 * If either mbstring or string is a null pointer, -1 is returned.
317 */
318 int
smb_stombs(char * mbstring,char * string,int max_mblen)319 smb_stombs(char *mbstring, char *string, int max_mblen)
320 {
321 char *start = mbstring;
322 unsigned char *p = (unsigned char *)string;
323 int space_left = max_mblen;
324 int len;
325 smb_wchar_t wide_char;
326 char buf[4];
327
328 if (!mbstring || !string)
329 return (-1);
330
331 while (*p && space_left > 2) {
332 wide_char = *p++;
333 len = smb_wctomb(mbstring, wide_char);
334 mbstring += len;
335 space_left -= len;
336 }
337
338 if (*p) {
339 wide_char = *p;
340 if ((len = smb_wctomb(buf, wide_char)) < 2) {
341 *mbstring = *buf;
342 mbstring += len;
343 space_left -= len;
344 }
345 }
346
347 *mbstring = '\0';
348
349 /*LINTED E_PTRDIFF_OVERFLOW*/
350 return (mbstring - start);
351 }
352
353
354 /*
355 * mbstos
356 *
357 * Convert a null terminated multi-byte string 'mbstring' to a regular
358 * null terminated string 'string'. A 1-byte character in 'mbstring'
359 * maps to a 1-byte character in 'string'. A 2-byte character in
360 * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
361 * Otherwise the upper byte null will be discarded to ensure that the
362 * output stream does not contain embedded null characters.
363 *
364 * If the input stream contains invalid multi-byte characters, a value
365 * of -1 will be returned. Otherwise the length of 'string', excluding
366 * the terminating null character, is returned.
367 *
368 * If either mbstring or string is a null pointer, -1 is returned.
369 */
370 int
smb_mbstos(char * string,const char * mbstring)371 smb_mbstos(char *string, const char *mbstring)
372 {
373 smb_wchar_t wc;
374 unsigned char *start = (unsigned char *)string;
375 int len;
376
377 if (string == NULL || mbstring == NULL)
378 return (-1);
379
380 while (*mbstring) {
381 if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
382 *string = 0;
383 return (-1);
384 }
385
386 if (wc & 0xFF00) {
387 /*LINTED E_BAD_PTR_CAST_ALIGN*/
388 *((smb_wchar_t *)string) = wc;
389 string += sizeof (smb_wchar_t);
390 }
391 else
392 {
393 *string = (unsigned char)wc;
394 string++;
395 }
396
397 mbstring += len;
398 }
399
400 *string = 0;
401
402 /*LINTED E_PTRDIFF_OVERFLOW*/
403 return ((unsigned char *)string - start);
404 }
405