xref: /onnv-gate/usr/src/common/smbsrv/smb_utf8.c (revision 10966:37e5dcdf36d3)
15331Samw /*
25331Samw  * CDDL HEADER START
35331Samw  *
45331Samw  * The contents of this file are subject to the terms of the
55331Samw  * Common Development and Distribution License (the "License").
65331Samw  * You may not use this file except in compliance with the License.
75331Samw  *
85331Samw  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
95331Samw  * or http://www.opensolaris.org/os/licensing.
105331Samw  * See the License for the specific language governing permissions
115331Samw  * and limitations under the License.
125331Samw  *
135331Samw  * When distributing Covered Code, include this CDDL HEADER in each
145331Samw  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
155331Samw  * If applicable, add the following below this CDDL HEADER, with the
165331Samw  * fields enclosed by brackets "[]" replaced with your own identifying
175331Samw  * information: Portions Copyright [yyyy] [name of copyright owner]
185331Samw  *
195331Samw  * CDDL HEADER END
205331Samw  */
215331Samw /*
22*10966SJordan.Brown@Sun.COM  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
235331Samw  * Use is subject to license terms.
245331Samw  */
255331Samw 
/*
 * Multibyte/wide-char conversion routines. Wide-char encoding provides
 * a fixed size character encoding that maps to the Unicode 16-bit
 * (UCS-2) character set standard. Multibyte or UCS transformation
 * format (UTF) encoding is a variable length character encoding scheme
 * that is compatible with existing ASCII characters and guarantees that
 * the resultant strings do not contain embedded null characters. Both
 * types of encoding provide a null terminator: single byte for UTF-8
 * and a wide-char null for Unicode. See RFC 2044.
 *
 * The table below illustrates the UTF-8 encoding scheme. The letter x
 * indicates bits available for encoding the character value.
 *
 *	UCS-2			UTF-8 octet sequence (binary)
 *	0x0000-0x007F	0xxxxxxx
 *	0x0080-0x07FF	110xxxxx 10xxxxxx
 *	0x0800-0xFFFF	1110xxxx 10xxxxxx 10xxxxxx
 *
 * RFC 2044
 * UTF-8, a transformation format of UNICODE and ISO 10646
 * F. Yergeau
 * Alis Technologies
 * October 1996
 */
505331Samw 
515331Samw #ifdef _KERNEL
525331Samw #include <sys/types.h>
535331Samw #include <sys/sunddi.h>
545331Samw #else
555331Samw #include <stdio.h>
565331Samw #include <stdlib.h>
575331Samw #include <assert.h>
585331Samw #include <strings.h>
595331Samw #endif
605331Samw #include <smbsrv/string.h>
615331Samw 
625331Samw 
635331Samw /*
645331Samw  * mbstowcs
655331Samw  *
665331Samw  * The mbstowcs() function converts a multibyte character string
675331Samw  * mbstring into a wide character string wcstring. No more than
685331Samw  * nwchars wide characters are stored. A terminating null wide
695331Samw  * character is appended if there is room.
705331Samw  *
715331Samw  * Returns the number of wide characters converted, not counting
725331Samw  * any terminating null wide character. Returns -1 if an invalid
735331Samw  * multibyte character is encountered.
745331Samw  */
755331Samw size_t
smb_mbstowcs(smb_wchar_t * wcstring,const char * mbstring,size_t nwchars)76*10966SJordan.Brown@Sun.COM smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars)
775331Samw {
785331Samw 	int len;
79*10966SJordan.Brown@Sun.COM 	smb_wchar_t	*start = wcstring;
805331Samw 
815331Samw 	while (nwchars--) {
82*10966SJordan.Brown@Sun.COM 		len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
835331Samw 		if (len < 0) {
845331Samw 			*wcstring = 0;
855331Samw 			return ((size_t)-1);
865331Samw 		}
875331Samw 
885331Samw 		if (*mbstring == 0)
895331Samw 			break;
905331Samw 
915331Samw 		++wcstring;
925331Samw 		mbstring += len;
935331Samw 	}
945331Samw 
955331Samw 	return (wcstring - start);
965331Samw }
975331Samw 
985331Samw 
995331Samw /*
1005331Samw  * mbtowc
1015331Samw  *
1025331Samw  * The mbtowc() function converts a multibyte character mbchar into
1035331Samw  * a wide character and stores the result in the object pointed to
1045331Samw  * by wcharp. Up to nbytes bytes are examined.
1055331Samw  *
1065331Samw  * If mbchar is NULL, mbtowc() returns zero to indicate that shift
1075521Sas200622  * states are not supported.  Shift states are used to switch between
1085521Sas200622  * representation modes using reserved bytes to signal shifting
1095521Sas200622  * without them being interpreted as characters.  If mbchar is null
1105521Sas200622  * mbtowc should return non-zero if the current locale requires shift
1115521Sas200622  * states.  Otherwise it should be return 0.
1125521Sas200622  *
1135521Sas200622  * If mbchar is non-null, returns the number of bytes processed in
1145521Sas200622  * mbchar.  If mbchar is invalid, returns -1.
1155331Samw  */
1165331Samw int /*ARGSUSED*/
smb_mbtowc(smb_wchar_t * wcharp,const char * mbchar,size_t nbytes)117*10966SJordan.Brown@Sun.COM smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes)
1185331Samw {
1195331Samw 	unsigned char mbyte;
120*10966SJordan.Brown@Sun.COM 	smb_wchar_t wide_char;
1215331Samw 	int count;
1225331Samw 	int bytes_left;
1235331Samw 
1245521Sas200622 	if (mbchar == NULL)
1255521Sas200622 		return (0); /* no shift states */
1265331Samw 
1275331Samw 	/* 0xxxxxxx -> 1 byte ASCII encoding */
1285331Samw 	if (((mbyte = *mbchar++) & 0x80) == 0) {
1295331Samw 		if (wcharp)
130*10966SJordan.Brown@Sun.COM 			*wcharp = (smb_wchar_t)mbyte;
1315331Samw 
1325331Samw 		return (mbyte ? 1 : 0);
1335331Samw 	}
1345331Samw 
1355331Samw 	/* 10xxxxxx -> invalid first byte */
1365521Sas200622 	if ((mbyte & 0x40) == 0)
1375331Samw 		return (-1);
1385331Samw 
1395331Samw 	wide_char = mbyte;
1405331Samw 	if ((mbyte & 0x20) == 0) {
1415331Samw 		wide_char &= 0x1f;
1425331Samw 		bytes_left = 1;
1435331Samw 	} else if ((mbyte & 0x10) == 0) {
1445331Samw 		wide_char &= 0x0f;
1455331Samw 		bytes_left = 2;
1465331Samw 	} else {
1475331Samw 		return (-1);
1485331Samw 	}
1495331Samw 
1505331Samw 	count = 1;
1515331Samw 	while (bytes_left--) {
1525521Sas200622 		if (((mbyte = *mbchar++) & 0xc0) != 0x80)
1535331Samw 			return (-1);
1545331Samw 
1555331Samw 		count++;
1565331Samw 		wide_char = (wide_char << 6) | (mbyte & 0x3f);
1575331Samw 	}
1585331Samw 
1595331Samw 	if (wcharp)
1605331Samw 		*wcharp = wide_char;
1615331Samw 
1625331Samw 	return (count);
1635331Samw }
1645331Samw 
1655331Samw 
1665331Samw /*
1675331Samw  * wctomb
1685331Samw  *
1695331Samw  * The wctomb() function converts a wide character wchar into a multibyte
1705331Samw  * character and stores the result in mbchar. The object pointed to by
1715331Samw  * mbchar must be large enough to accommodate the multibyte character.
1725331Samw  *
1735331Samw  * Returns the numberof bytes written to mbchar.
1745331Samw  */
1755331Samw int
smb_wctomb(char * mbchar,smb_wchar_t wchar)176*10966SJordan.Brown@Sun.COM smb_wctomb(char *mbchar, smb_wchar_t wchar)
1775331Samw {
1785331Samw 	if ((wchar & ~0x7f) == 0) {
1795331Samw 		*mbchar = (char)wchar;
1805331Samw 		return (1);
1815331Samw 	}
1825331Samw 
1835331Samw 	if ((wchar & ~0x7ff) == 0) {
1845331Samw 		*mbchar++ = (wchar >> 6) | 0xc0;
1855331Samw 		*mbchar = (wchar & 0x3f) | 0x80;
1865331Samw 		return (2);
1875331Samw 	}
1885331Samw 
1895331Samw 	*mbchar++ = (wchar >> 12) | 0xe0;
1905331Samw 	*mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
1915331Samw 	*mbchar = (wchar & 0x3f) | 0x80;
1925331Samw 	return (3);
1935331Samw }
1945331Samw 
1955331Samw 
1965331Samw /*
1975331Samw  * wcstombs
1985331Samw  *
1995331Samw  * The wcstombs() function converts a wide character string wcstring
2005331Samw  * into a multibyte character string mbstring. Up to nbytes bytes are
2015331Samw  * stored in mbstring. Partial multibyte characters at the end of the
2025331Samw  * string are not stored. The multibyte character string is null
2035331Samw  * terminated if there is room.
2045331Samw  *
2055331Samw  * Returns the number of bytes converted, not counting the terminating
2065331Samw  * null byte.
2075331Samw  */
2085331Samw size_t
smb_wcstombs(char * mbstring,const smb_wchar_t * wcstring,size_t nbytes)209*10966SJordan.Brown@Sun.COM smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes)
2105331Samw {
2115331Samw 	char *start = mbstring;
212*10966SJordan.Brown@Sun.COM 	const smb_wchar_t *wcp = wcstring;
213*10966SJordan.Brown@Sun.COM 	smb_wchar_t wide_char;
2145331Samw 	char buf[4];
2155331Samw 	size_t len;
2165331Samw 
2175521Sas200622 	if ((mbstring == NULL) || (wcstring == NULL))
2185331Samw 		return (0);
2195331Samw 
2205331Samw 	while (nbytes > MTS_MB_CHAR_MAX) {
2215331Samw 		wide_char = *wcp++;
222*10966SJordan.Brown@Sun.COM 		len = smb_wctomb(mbstring, wide_char);
2235331Samw 
2245331Samw 		if (wide_char == 0)
2255331Samw 			/*LINTED E_PTRDIFF_OVERFLOW*/
2265331Samw 			return (mbstring - start);
2275331Samw 
2285331Samw 		mbstring += len;
2295331Samw 		nbytes -= len;
2305331Samw 	}
2315331Samw 
2325331Samw 	while (wide_char && nbytes) {
2335331Samw 		wide_char = *wcp++;
234*10966SJordan.Brown@Sun.COM 		if ((len = smb_wctomb(buf, wide_char)) > nbytes) {
2355331Samw 			*mbstring = 0;
2365331Samw 			break;
2375331Samw 		}
2385331Samw 
2395331Samw 		bcopy(buf, mbstring, len);
2405331Samw 		mbstring += len;
2415331Samw 		nbytes -= len;
2425331Samw 	}
2435331Samw 
2445331Samw 	/*LINTED E_PTRDIFF_OVERFLOW*/
2455331Samw 	return (mbstring - start);
2465331Samw }
2475331Samw 
2485331Samw 
2495331Samw /*
2505331Samw  * Returns the number of bytes that would be written if the multi-
2515331Samw  * byte string mbs was converted to a wide character string, not
2525331Samw  * counting the terminating null wide character.
2535331Samw  */
2545331Samw size_t
smb_wcequiv_strlen(const char * mbs)255*10966SJordan.Brown@Sun.COM smb_wcequiv_strlen(const char *mbs)
2565331Samw {
257*10966SJordan.Brown@Sun.COM 	smb_wchar_t	wide_char;
2585331Samw 	size_t bytes;
2595331Samw 	size_t len = 0;
2605331Samw 
2615331Samw 	while (*mbs) {
262*10966SJordan.Brown@Sun.COM 		bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
2635331Samw 		if (bytes == ((size_t)-1))
2645331Samw 			return ((size_t)-1);
2655331Samw 
266*10966SJordan.Brown@Sun.COM 		len += sizeof (smb_wchar_t);
2675331Samw 		mbs += bytes;
2685331Samw 	}
2695331Samw 
2705331Samw 	return (len);
2715331Samw }
2725331Samw 
2735331Samw 
2745331Samw /*
2755331Samw  * Returns the number of bytes that would be written if the multi-
2765331Samw  * byte string mbs was converted to a single byte character string,
2775331Samw  * not counting the terminating null character.
2785331Samw  */
2795331Samw size_t
smb_sbequiv_strlen(const char * mbs)280*10966SJordan.Brown@Sun.COM smb_sbequiv_strlen(const char *mbs)
2815331Samw {
282*10966SJordan.Brown@Sun.COM 	smb_wchar_t	wide_char;
2835331Samw 	size_t nbytes;
2845331Samw 	size_t len = 0;
2855331Samw 
2865331Samw 	while (*mbs) {
287*10966SJordan.Brown@Sun.COM 		nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
2885331Samw 		if (nbytes == ((size_t)-1))
2895331Samw 			return ((size_t)-1);
2905331Samw 
2915331Samw 		if (wide_char & 0xFF00)
292*10966SJordan.Brown@Sun.COM 			len += sizeof (smb_wchar_t);
2935331Samw 		else
2945331Samw 			++len;
2955331Samw 
2965331Samw 		mbs += nbytes;
2975331Samw 	}
2985331Samw 
2995331Samw 	return (len);
3005331Samw }
3015331Samw 
3025331Samw 
3035331Samw /*
3045331Samw  * stombs
3055331Samw  *
3065331Samw  * Convert a regular null terminated string 'string' to a UTF-8 encoded
3075331Samw  * null terminated multi-byte string 'mbstring'. Only full converted
3085331Samw  * UTF-8 characters will be written 'mbstring'. If a character will not
3095331Samw  * fit within the remaining buffer space or 'mbstring' will overflow
3105331Samw  * max_mblen, the conversion process will be terminated and 'mbstring'
3115331Samw  * will be null terminated.
3125331Samw  *
3135331Samw  * Returns the number of bytes written to 'mbstring', excluding the
3145331Samw  * terminating null character.
3155331Samw  *
3165331Samw  * If either mbstring or string is a null pointer, -1 is returned.
3175331Samw  */
3185331Samw int
smb_stombs(char * mbstring,char * string,int max_mblen)319*10966SJordan.Brown@Sun.COM smb_stombs(char *mbstring, char *string, int max_mblen)
3205331Samw {
3215331Samw 	char *start = mbstring;
3225331Samw 	unsigned char *p = (unsigned char *)string;
3235331Samw 	int space_left = max_mblen;
3245331Samw 	int	len;
325*10966SJordan.Brown@Sun.COM 	smb_wchar_t	wide_char;
3265331Samw 	char buf[4];
3275331Samw 
3285331Samw 	if (!mbstring || !string)
3295331Samw 		return (-1);
3305331Samw 
3315331Samw 	while (*p && space_left > 2) {
3325331Samw 		wide_char = *p++;
333*10966SJordan.Brown@Sun.COM 		len = smb_wctomb(mbstring, wide_char);
3345331Samw 		mbstring += len;
3355331Samw 		space_left -= len;
3365331Samw 	}
3375331Samw 
3385331Samw 	if (*p) {
3395331Samw 		wide_char = *p;
340*10966SJordan.Brown@Sun.COM 		if ((len = smb_wctomb(buf, wide_char)) < 2) {
3415331Samw 			*mbstring = *buf;
3425331Samw 			mbstring += len;
3435331Samw 			space_left -= len;
3445331Samw 		}
3455331Samw 	}
3465331Samw 
3475331Samw 	*mbstring = '\0';
3485331Samw 
3495331Samw 	/*LINTED E_PTRDIFF_OVERFLOW*/
3505331Samw 	return (mbstring - start);
3515331Samw }
3525331Samw 
3535331Samw 
3545331Samw /*
3555331Samw  * mbstos
3565331Samw  *
3575331Samw  * Convert a null terminated multi-byte string 'mbstring' to a regular
3585331Samw  * null terminated string 'string'.  A 1-byte character in 'mbstring'
3595331Samw  * maps to a 1-byte character in 'string'. A 2-byte character in
3605331Samw  * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
3615331Samw  * Otherwise the upper byte null will be discarded to ensure that the
3625331Samw  * output stream does not contain embedded null characters.
3635331Samw  *
3645331Samw  * If the input stream contains invalid multi-byte characters, a value
3655331Samw  * of -1 will be returned. Otherwise the length of 'string', excluding
3665331Samw  * the terminating null character, is returned.
3675331Samw  *
3685331Samw  * If either mbstring or string is a null pointer, -1 is returned.
3695331Samw  */
3705331Samw int
smb_mbstos(char * string,const char * mbstring)371*10966SJordan.Brown@Sun.COM smb_mbstos(char *string, const char *mbstring)
3725331Samw {
373*10966SJordan.Brown@Sun.COM 	smb_wchar_t wc;
3745331Samw 	unsigned char *start = (unsigned char *)string;
3755331Samw 	int len;
3765331Samw 
3775521Sas200622 	if (string == NULL || mbstring == NULL)
3785331Samw 		return (-1);
3795331Samw 
3805331Samw 	while (*mbstring) {
381*10966SJordan.Brown@Sun.COM 		if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
3825331Samw 			*string = 0;
3835331Samw 			return (-1);
3845331Samw 		}
3855331Samw 
3865331Samw 		if (wc & 0xFF00) {
3875331Samw 			/*LINTED E_BAD_PTR_CAST_ALIGN*/
388*10966SJordan.Brown@Sun.COM 			*((smb_wchar_t *)string) = wc;
389*10966SJordan.Brown@Sun.COM 			string += sizeof (smb_wchar_t);
3905331Samw 		}
3915331Samw 		else
3925331Samw 		{
3935331Samw 			*string = (unsigned char)wc;
3945331Samw 			string++;
3955331Samw 		}
3965331Samw 
3975331Samw 		mbstring += len;
3985331Samw 	}
3995331Samw 
4005331Samw 	*string = 0;
4015331Samw 
4025331Samw 	/*LINTED E_PTRDIFF_OVERFLOW*/
4035331Samw 	return ((unsigned char *)string - start);
4045331Samw }
405