xref: /onnv-gate/usr/src/common/unicode/uconv.c (revision 6008:3a1c10482cf2)
15049Sis /*
25049Sis  * CDDL HEADER START
35049Sis  *
45049Sis  * The contents of this file are subject to the terms of the
55049Sis  * Common Development and Distribution License (the "License").
65049Sis  * You may not use this file except in compliance with the License.
75049Sis  *
85049Sis  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
95049Sis  * or http://www.opensolaris.org/os/licensing.
105049Sis  * See the License for the specific language governing permissions
115049Sis  * and limitations under the License.
125049Sis  *
135049Sis  * When distributing Covered Code, include this CDDL HEADER in each
145049Sis  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
155049Sis  * If applicable, add the following below this CDDL HEADER, with the
165049Sis  * fields enclosed by brackets "[]" replaced with your own identifying
175049Sis  * information: Portions Copyright [yyyy] [name of copyright owner]
185049Sis  *
195049Sis  * CDDL HEADER END
205049Sis  */
215049Sis /*
22*6008Syy154373  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
235049Sis  * Use is subject to license terms.
245049Sis  */
255049Sis 
265049Sis #pragma ident	"%Z%%M%	%I%	%E% SMI"
275049Sis 
/*
 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
 * the section 3C man pages.
 * Interface stability: Committed
 */
365049Sis 
375049Sis #include <sys/types.h>
385049Sis #ifdef	_KERNEL
395049Sis #include <sys/param.h>
405049Sis #include <sys/sysmacros.h>
415049Sis #include <sys/systm.h>
425049Sis #include <sys/debug.h>
435049Sis #include <sys/kmem.h>
445049Sis #include <sys/sunddi.h>
455049Sis #else
465049Sis #include <sys/u8_textprep.h>
475049Sis #endif	/* _KERNEL */
485049Sis #include <sys/byteorder.h>
495049Sis #include <sys/errno.h>
505049Sis 
515049Sis 
/*
 * The max and min values of high and low surrogate pairs of UTF-16,
 * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
 *
 * UCONV_U16_BIT_SHIFT is 0x400 (1 << 10); the conversion routines in this
 * file use it as a multiplier/divisor to combine or split the two 10-bit
 * halves carried by a surrogate pair.
 */
#define	UCONV_U16_HI_MIN	(0xd800U)
#define	UCONV_U16_HI_MAX	(0xdbffU)
#define	UCONV_U16_LO_MIN	(0xdc00U)
#define	UCONV_U16_LO_MAX	(0xdfffU)
#define	UCONV_U16_BIT_SHIFT	(0x0400U)
#define	UCONV_U16_BIT_MASK	(0x0fffffU)
#define	UCONV_U16_START		(0x010000U)

/* The maximum value of Unicode coding space and ASCII coding space. */
#define	UCONV_UNICODE_MAX	(0x10ffffU)
#define	UCONV_ASCII_MAX		(0x7fU)

/* The mask values for input and output endians. */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Native and reversed endian macros: "NAT" names the flag bit matching the
 * byte order this file is compiled for, "REV" the opposite one.
 */
#ifdef	_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
#endif	/* _BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) character in normal and reversed byte orderings.
 * UCONV_BOM_SWAPPED is the 16-bit byte-swapped form and UCONV_BOM_SWAPPED_32
 * is the 32-bit byte-swapped form of UCONV_BOM_NORMAL.
 */
#define	UCONV_BOM_NORMAL	(0xfeffU)
#define	UCONV_BOM_SWAPPED	(0xfffeU)
#define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)

/*
 * UTF-32 boundaries based on UTF-8 character byte lengths: each value is
 * the largest character value that fits in that many UTF-8 bytes.
 */
#define	UCONV_U8_ONE_BYTE	(0x7fU)
#define	UCONV_U8_TWO_BYTES	(0x7ffU)
#define	UCONV_U8_THREE_BYTES	(0xffffU)
#define	UCONV_U8_FOUR_BYTES	(0x10ffffU)

/* The common minimum and maximum values at the UTF-8 character bytes. */
#define	UCONV_U8_BYTE_MIN	(0x80U)
#define	UCONV_U8_BYTE_MAX	(0xbfU)

/*
 * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
 * UTF-8 character bytes.
 */
#define	UCONV_U8_BIT_SHIFT	6
#define	UCONV_U8_BIT_MASK	0x3f
1065049Sis 
1075049Sis /*
1085049Sis  * The following vector shows remaining bytes in a UTF-8 character.
1095049Sis  * Index will be the first byte of the character.
1105049Sis  */
1115049Sis static const uchar_t remaining_bytes_tbl[0x100] = {
1125049Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1135049Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1145049Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1155049Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1165049Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1175049Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1185049Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1195049Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1205049Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1215049Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1225049Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1235049Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1245049Sis 
1255049Sis /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
1265049Sis 	0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
1275049Sis 
1285049Sis /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
1295049Sis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
1305049Sis 
1315049Sis /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
1325049Sis 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
1335049Sis 
1345049Sis /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
1355049Sis 	3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
1365049Sis };
1375049Sis 
1385049Sis /*
1395049Sis  * The following is a vector of bit-masks to get used bits in
1405049Sis  * the first byte of a UTF-8 character.  Index is remaining bytes at above of
1415049Sis  * the character.
1425049Sis  */
143*6008Syy154373 #ifdef	_KERNEL
144*6008Syy154373 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
145*6008Syy154373 #else
146*6008Syy154373 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
147*6008Syy154373 #endif	/* _KERNEL */
1485049Sis 
1495049Sis /*
1505049Sis  * The following two vectors are to provide valid minimum and
1515049Sis  * maximum values for the 2'nd byte of a multibyte UTF-8 character for
1525049Sis  * better illegal sequence checking. The index value must be the value of
1535049Sis  * the first byte of the UTF-8 character.
1545049Sis  */
1555049Sis static const uchar_t valid_min_2nd_byte[0x100] = {
1565049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1575049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1585049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1595049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1605049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1615049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1625049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1635049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1645049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1655049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1665049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1675049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1685049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1695049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1705049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1715049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1725049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1735049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1745049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1755049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1765049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1775049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1785049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1795049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
1805049Sis 
1815049Sis /*	C0    C1    C2    C3    C4    C5    C6    C7 */
1825049Sis 	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1835049Sis 
1845049Sis /*	C8    C9    CA    CB    CC    CD    CE    CF */
1855049Sis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1865049Sis 
1875049Sis /*	D0    D1    D2    D3    D4    D5    D6    D7 */
1885049Sis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1895049Sis 
1905049Sis /*	D8    D9    DA    DB    DC    DD    DE    DF */
1915049Sis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1925049Sis 
1935049Sis /*	E0    E1    E2    E3    E4    E5    E6    E7 */
1945049Sis 	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1955049Sis 
1965049Sis /*	E8    E9    EA    EB    EC    ED    EE    EF */
1975049Sis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1985049Sis 
1995049Sis /*	F0    F1    F2    F3    F4    F5    F6    F7 */
2005049Sis 	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
2015049Sis 
2025049Sis 	0,    0,    0,    0,    0,    0,    0,    0
2035049Sis };
2045049Sis 
2055049Sis static const uchar_t valid_max_2nd_byte[0x100] = {
2065049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2075049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2085049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2095049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2105049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2115049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2125049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2135049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2145049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2155049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2165049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2175049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2185049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2195049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2205049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2215049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2225049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2235049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2245049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2255049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2265049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2275049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2285049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2295049Sis 	0,    0,    0,    0,    0,    0,    0,    0,
2305049Sis 
2315049Sis /*	C0    C1    C2    C3    C4    C5    C6    C7 */
2325049Sis 	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2335049Sis 
2345049Sis /*	C8    C9    CA    CB    CC    CD    CE    CF */
2355049Sis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2365049Sis 
2375049Sis /*	D0    D1    D2    D3    D4    D5    D6    D7 */
2385049Sis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2395049Sis 
2405049Sis /*	D8    D9    DA    DB    DC    DD    DE    DF */
2415049Sis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2425049Sis 
2435049Sis /*	E0    E1    E2    E3    E4    E5    E6    E7 */
2445049Sis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2455049Sis 
2465049Sis /*	E8    E9    EA    EB    EC    ED    EE    EF */
2475049Sis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
2485049Sis 
2495049Sis /*	F0    F1    F2    F3    F4    F5    F6    F7 */
2505049Sis 	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
2515049Sis 
2525049Sis 	0,    0,    0,    0,    0,    0,    0,    0
2535049Sis };
2545049Sis 
2555049Sis 
2565049Sis static int
check_endian(int flag,int * in,int * out)2575049Sis check_endian(int flag, int *in, int *out)
2585049Sis {
2595049Sis 	*in = flag & UCONV_IN_ENDIAN_MASKS;
2605049Sis 
2615049Sis 	/* You cannot have both. */
2625049Sis 	if (*in == UCONV_IN_ENDIAN_MASKS)
2635049Sis 		return (EBADF);
2645049Sis 
2655049Sis 	if (*in == 0)
2665049Sis 		*in = UCONV_IN_NAT_ENDIAN;
2675049Sis 
2685049Sis 	*out = flag & UCONV_OUT_ENDIAN_MASKS;
2695049Sis 
2705049Sis 	/* You cannot have both. */
2715049Sis 	if (*out == UCONV_OUT_ENDIAN_MASKS)
2725049Sis 		return (EBADF);
2735049Sis 
2745049Sis 	if (*out == 0)
2755049Sis 		*out = UCONV_OUT_NAT_ENDIAN;
2765049Sis 
2775049Sis 	return (0);
2785049Sis }
2795049Sis 
2805049Sis static boolean_t
check_bom16(const uint16_t * u16s,size_t u16l,int * in)2815049Sis check_bom16(const uint16_t *u16s, size_t u16l, int *in)
2825049Sis {
2835049Sis 	if (u16l > 0) {
2845049Sis 		if (*u16s == UCONV_BOM_NORMAL) {
2855049Sis 			*in = UCONV_IN_NAT_ENDIAN;
2865049Sis 			return (B_TRUE);
2875049Sis 		}
2885049Sis 		if (*u16s == UCONV_BOM_SWAPPED) {
2895049Sis 			*in = UCONV_IN_REV_ENDIAN;
2905049Sis 			return (B_TRUE);
2915049Sis 		}
2925049Sis 	}
2935049Sis 
2945049Sis 	return (B_FALSE);
2955049Sis }
2965049Sis 
2975049Sis static boolean_t
check_bom32(const uint32_t * u32s,size_t u32l,int * in)2985049Sis check_bom32(const uint32_t *u32s, size_t u32l, int *in)
2995049Sis {
3005049Sis 	if (u32l > 0) {
3015049Sis 		if (*u32s == UCONV_BOM_NORMAL) {
3025049Sis 			*in = UCONV_IN_NAT_ENDIAN;
3035049Sis 			return (B_TRUE);
3045049Sis 		}
3055049Sis 		if (*u32s == UCONV_BOM_SWAPPED_32) {
3065049Sis 			*in = UCONV_IN_REV_ENDIAN;
3075049Sis 			return (B_TRUE);
3085049Sis 		}
3095049Sis 	}
3105049Sis 
3115049Sis 	return (B_FALSE);
3125049Sis }
3135049Sis 
3145049Sis int
uconv_u16tou32(const uint16_t * u16s,size_t * utf16len,uint32_t * u32s,size_t * utf32len,int flag)3155049Sis uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
3165049Sis     uint32_t *u32s, size_t *utf32len, int flag)
3175049Sis {
3185049Sis 	int inendian;
3195049Sis 	int outendian;
3205049Sis 	size_t u16l;
3215049Sis 	size_t u32l;
3225049Sis 	uint32_t hi;
3235049Sis 	uint32_t lo;
3245049Sis 	boolean_t do_not_ignore_null;
3255049Sis 
3265049Sis 	/*
3275049Sis 	 * Do preliminary validity checks on parameters and collect info on
3285049Sis 	 * endians.
3295049Sis 	 */
3305049Sis 	if (u16s == NULL || utf16len == NULL)
3315049Sis 		return (EILSEQ);
3325049Sis 
3335049Sis 	if (u32s == NULL || utf32len == NULL)
3345049Sis 		return (E2BIG);
3355049Sis 
3365049Sis 	if (check_endian(flag, &inendian, &outendian) != 0)
3375049Sis 		return (EBADF);
3385049Sis 
3395049Sis 	/*
3405049Sis 	 * Initialize input and output parameter buffer indices and
3415049Sis 	 * temporary variables.
3425049Sis 	 */
3435049Sis 	u16l = u32l = 0;
3445049Sis 	hi = 0;
3455049Sis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
3465049Sis 
3475049Sis 	/*
3485049Sis 	 * Check on the BOM at the beginning of the input buffer if required
3495049Sis 	 * and if there is indeed one, process it.
3505049Sis 	 */
3515049Sis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
3525049Sis 	    check_bom16(u16s, *utf16len, &inendian))
3535049Sis 		u16l++;
3545049Sis 
3555049Sis 	/*
3565049Sis 	 * Reset inendian and outendian so that after this point, those can be
3575049Sis 	 * used as condition values.
3585049Sis 	 */
3595049Sis 	inendian &= UCONV_IN_NAT_ENDIAN;
3605049Sis 	outendian &= UCONV_OUT_NAT_ENDIAN;
3615049Sis 
3625049Sis 	/*
3635049Sis 	 * If there is something in the input buffer and if necessary and
3645049Sis 	 * requested, save the BOM at the output buffer.
3655049Sis 	 */
3665049Sis 	if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
3675049Sis 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
3685049Sis 		    UCONV_BOM_SWAPPED_32;
3695049Sis 
3705049Sis 	/*
3715049Sis 	 * Do conversion; if encounter a surrogate pair, assemble high and
3725049Sis 	 * low pair values to form a UTF-32 character. If a half of a pair
3735049Sis 	 * exists alone, then, either it is an illegal (EILSEQ) or
3745049Sis 	 * invalid (EINVAL) value.
3755049Sis 	 */
3765049Sis 	for (; u16l < *utf16len; u16l++) {
3775049Sis 		if (u16s[u16l] == 0 && do_not_ignore_null)
3785049Sis 			break;
3795049Sis 
3805049Sis 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
3815049Sis 
3825049Sis 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
3835049Sis 			if (hi)
3845049Sis 				return (EILSEQ);
3855049Sis 			hi = lo;
3865049Sis 			continue;
3875049Sis 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
3885049Sis 			if (! hi)
3895049Sis 				return (EILSEQ);
3905049Sis 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
3915049Sis 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
3925049Sis 			    + UCONV_U16_START;
3935049Sis 			hi = 0;
3945049Sis 		} else if (hi) {
3955049Sis 			return (EILSEQ);
3965049Sis 		}
3975049Sis 
3985049Sis 		if (u32l >= *utf32len)
3995049Sis 			return (E2BIG);
4005049Sis 
4015049Sis 		u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
4025049Sis 	}
4035049Sis 
4045049Sis 	/*
4055049Sis 	 * If high half didn't see low half, then, it's most likely the input
4065049Sis 	 * parameter is incomplete.
4075049Sis 	 */
4085049Sis 	if (hi)
4095049Sis 		return (EINVAL);
4105049Sis 
4115049Sis 	/*
4125049Sis 	 * Save the number of consumed and saved characters. They do not
4135049Sis 	 * include terminating NULL character (U+0000) at the end of
4145049Sis 	 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
4155049Sis 	 * the input buffer length is big enough to include the terminating
4165049Sis 	 * NULL character).
4175049Sis 	 */
4185049Sis 	*utf16len = u16l;
4195049Sis 	*utf32len = u32l;
4205049Sis 
4215049Sis 	return (0);
4225049Sis }
4235049Sis 
4245049Sis int
uconv_u16tou8(const uint16_t * u16s,size_t * utf16len,uchar_t * u8s,size_t * utf8len,int flag)4255049Sis uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
4265049Sis     uchar_t *u8s, size_t *utf8len, int flag)
4275049Sis {
4285049Sis 	int inendian;
4295049Sis 	int outendian;
4305049Sis 	size_t u16l;
4315049Sis 	size_t u8l;
4325049Sis 	uint32_t hi;
4335049Sis 	uint32_t lo;
4345049Sis 	boolean_t do_not_ignore_null;
4355049Sis 
4365049Sis 	if (u16s == NULL || utf16len == NULL)
4375049Sis 		return (EILSEQ);
4385049Sis 
4395049Sis 	if (u8s == NULL || utf8len == NULL)
4405049Sis 		return (E2BIG);
4415049Sis 
4425049Sis 	if (check_endian(flag, &inendian, &outendian) != 0)
4435049Sis 		return (EBADF);
4445049Sis 
4455049Sis 	u16l = u8l = 0;
4465049Sis 	hi = 0;
4475049Sis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
4485049Sis 
4495049Sis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
4505049Sis 	    check_bom16(u16s, *utf16len, &inendian))
4515049Sis 		u16l++;
4525049Sis 
4535049Sis 	inendian &= UCONV_IN_NAT_ENDIAN;
4545049Sis 
4555049Sis 	for (; u16l < *utf16len; u16l++) {
4565049Sis 		if (u16s[u16l] == 0 && do_not_ignore_null)
4575049Sis 			break;
4585049Sis 
4595049Sis 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
4605049Sis 
4615049Sis 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
4625049Sis 			if (hi)
4635049Sis 				return (EILSEQ);
4645049Sis 			hi = lo;
4655049Sis 			continue;
4665049Sis 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
4675049Sis 			if (! hi)
4685049Sis 				return (EILSEQ);
4695049Sis 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
4705049Sis 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
4715049Sis 			    + UCONV_U16_START;
4725049Sis 			hi = 0;
4735049Sis 		} else if (hi) {
4745049Sis 			return (EILSEQ);
4755049Sis 		}
4765049Sis 
4775049Sis 		/*
4785049Sis 		 * Now we convert a UTF-32 character into a UTF-8 character.
4795049Sis 		 * Unicode coding space is between U+0000 and U+10FFFF;
4805049Sis 		 * anything bigger is an illegal character.
4815049Sis 		 */
4825049Sis 		if (lo <= UCONV_U8_ONE_BYTE) {
4835049Sis 			if (u8l >= *utf8len)
4845049Sis 				return (E2BIG);
4855049Sis 			u8s[u8l++] = (uchar_t)lo;
4865049Sis 		} else if (lo <= UCONV_U8_TWO_BYTES) {
4875049Sis 			if ((u8l + 1) >= *utf8len)
4885049Sis 				return (E2BIG);
4895049Sis 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
4905049Sis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
4915049Sis 		} else if (lo <= UCONV_U8_THREE_BYTES) {
4925049Sis 			if ((u8l + 2) >= *utf8len)
4935049Sis 				return (E2BIG);
4945049Sis 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
4955049Sis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
4965049Sis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
4975049Sis 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
4985049Sis 			if ((u8l + 3) >= *utf8len)
4995049Sis 				return (E2BIG);
5005049Sis 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
5015049Sis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
5025049Sis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
5035049Sis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
5045049Sis 		} else {
5055049Sis 			return (EILSEQ);
5065049Sis 		}
5075049Sis 	}
5085049Sis 
5095049Sis 	if (hi)
5105049Sis 		return (EINVAL);
5115049Sis 
5125049Sis 	*utf16len = u16l;
5135049Sis 	*utf8len = u8l;
5145049Sis 
5155049Sis 	return (0);
5165049Sis }
5175049Sis 
5185049Sis int
uconv_u32tou16(const uint32_t * u32s,size_t * utf32len,uint16_t * u16s,size_t * utf16len,int flag)5195049Sis uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
5205049Sis     uint16_t *u16s, size_t *utf16len, int flag)
5215049Sis {
5225049Sis 	int inendian;
5235049Sis 	int outendian;
5245049Sis 	size_t u16l;
5255049Sis 	size_t u32l;
5265049Sis 	uint32_t hi;
5275049Sis 	uint32_t lo;
5285049Sis 	boolean_t do_not_ignore_null;
5295049Sis 
5305049Sis 	if (u32s == NULL || utf32len == NULL)
5315049Sis 		return (EILSEQ);
5325049Sis 
5335049Sis 	if (u16s == NULL || utf16len == NULL)
5345049Sis 		return (E2BIG);
5355049Sis 
5365049Sis 	if (check_endian(flag, &inendian, &outendian) != 0)
5375049Sis 		return (EBADF);
5385049Sis 
5395049Sis 	u16l = u32l = 0;
5405049Sis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
5415049Sis 
5425049Sis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
5435049Sis 	    check_bom32(u32s, *utf32len, &inendian))
5445049Sis 		u32l++;
5455049Sis 
5465049Sis 	inendian &= UCONV_IN_NAT_ENDIAN;
5475049Sis 	outendian &= UCONV_OUT_NAT_ENDIAN;
5485049Sis 
5495049Sis 	if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
5505049Sis 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
5515049Sis 		    UCONV_BOM_SWAPPED;
5525049Sis 
5535049Sis 	for (; u32l < *utf32len; u32l++) {
5545049Sis 		if (u32s[u32l] == 0 && do_not_ignore_null)
5555049Sis 			break;
5565049Sis 
5575049Sis 		hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
5585049Sis 
5595049Sis 		/*
5605049Sis 		 * Anything bigger than the Unicode coding space, i.e.,
5615049Sis 		 * Unicode scalar value bigger than U+10FFFF, is an illegal
5625049Sis 		 * character.
5635049Sis 		 */
5645049Sis 		if (hi > UCONV_UNICODE_MAX)
5655049Sis 			return (EILSEQ);
5665049Sis 
5675049Sis 		/*
5685049Sis 		 * Anything bigger than U+FFFF must be converted into
5695049Sis 		 * a surrogate pair in UTF-16.
5705049Sis 		 */
5715049Sis 		if (hi >= UCONV_U16_START) {
5725049Sis 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
5735049Sis 			    UCONV_U16_LO_MIN;
5745049Sis 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
5755049Sis 			    UCONV_U16_HI_MIN;
5765049Sis 
5775049Sis 			if ((u16l + 1) >= *utf16len)
5785049Sis 				return (E2BIG);
5795049Sis 
5805049Sis 			if (outendian) {
5815049Sis 				u16s[u16l++] = (uint16_t)hi;
5825049Sis 				u16s[u16l++] = (uint16_t)lo;
5835049Sis 			} else {
5845049Sis 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
5855049Sis 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
5865049Sis 			}
5875049Sis 		} else {
5885049Sis 			if (u16l >= *utf16len)
5895049Sis 				return (E2BIG);
5905049Sis 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
5915049Sis 			    BSWAP_16(((uint16_t)hi));
5925049Sis 		}
5935049Sis 	}
5945049Sis 
5955049Sis 	*utf16len = u16l;
5965049Sis 	*utf32len = u32l;
5975049Sis 
5985049Sis 	return (0);
5995049Sis }
6005049Sis 
6015049Sis int
uconv_u32tou8(const uint32_t * u32s,size_t * utf32len,uchar_t * u8s,size_t * utf8len,int flag)6025049Sis uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
6035049Sis     uchar_t *u8s, size_t *utf8len, int flag)
6045049Sis {
6055049Sis 	int inendian;
6065049Sis 	int outendian;
6075049Sis 	size_t u32l;
6085049Sis 	size_t u8l;
6095049Sis 	uint32_t lo;
6105049Sis 	boolean_t do_not_ignore_null;
6115049Sis 
6125049Sis 	if (u32s == NULL || utf32len == NULL)
6135049Sis 		return (EILSEQ);
6145049Sis 
6155049Sis 	if (u8s == NULL || utf8len == NULL)
6165049Sis 		return (E2BIG);
6175049Sis 
6185049Sis 	if (check_endian(flag, &inendian, &outendian) != 0)
6195049Sis 		return (EBADF);
6205049Sis 
6215049Sis 	u32l = u8l = 0;
6225049Sis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
6235049Sis 
6245049Sis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
6255049Sis 	    check_bom32(u32s, *utf32len, &inendian))
6265049Sis 		u32l++;
6275049Sis 
6285049Sis 	inendian &= UCONV_IN_NAT_ENDIAN;
6295049Sis 
6305049Sis 	for (; u32l < *utf32len; u32l++) {
6315049Sis 		if (u32s[u32l] == 0 && do_not_ignore_null)
6325049Sis 			break;
6335049Sis 
6345049Sis 		lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
6355049Sis 
6365049Sis 		if (lo <= UCONV_U8_ONE_BYTE) {
6375049Sis 			if (u8l >= *utf8len)
6385049Sis 				return (E2BIG);
6395049Sis 			u8s[u8l++] = (uchar_t)lo;
6405049Sis 		} else if (lo <= UCONV_U8_TWO_BYTES) {
6415049Sis 			if ((u8l + 1) >= *utf8len)
6425049Sis 				return (E2BIG);
6435049Sis 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
6445049Sis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
6455049Sis 		} else if (lo <= UCONV_U8_THREE_BYTES) {
6465049Sis 			if ((u8l + 2) >= *utf8len)
6475049Sis 				return (E2BIG);
6485049Sis 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
6495049Sis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
6505049Sis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
6515049Sis 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
6525049Sis 			if ((u8l + 3) >= *utf8len)
6535049Sis 				return (E2BIG);
6545049Sis 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
6555049Sis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
6565049Sis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
6575049Sis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
6585049Sis 		} else {
6595049Sis 			return (EILSEQ);
6605049Sis 		}
6615049Sis 	}
6625049Sis 
6635049Sis 	*utf32len = u32l;
6645049Sis 	*utf8len = u8l;
6655049Sis 
6665049Sis 	return (0);
6675049Sis }
6685049Sis 
6695049Sis int
uconv_u8tou16(const uchar_t * u8s,size_t * utf8len,uint16_t * u16s,size_t * utf16len,int flag)6705049Sis uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
6715049Sis     uint16_t *u16s, size_t *utf16len, int flag)
6725049Sis {
6735049Sis 	int inendian;
6745049Sis 	int outendian;
6755049Sis 	size_t u16l;
6765049Sis 	size_t u8l;
6775049Sis 	uint32_t hi;
6785049Sis 	uint32_t lo;
6795049Sis 	int remaining_bytes;
6805049Sis 	int first_b;
6815049Sis 	boolean_t do_not_ignore_null;
6825049Sis 
6835049Sis 	if (u8s == NULL || utf8len == NULL)
6845049Sis 		return (EILSEQ);
6855049Sis 
6865049Sis 	if (u16s == NULL || utf16len == NULL)
6875049Sis 		return (E2BIG);
6885049Sis 
6895049Sis 	if (check_endian(flag, &inendian, &outendian) != 0)
6905049Sis 		return (EBADF);
6915049Sis 
6925049Sis 	u16l = u8l = 0;
6935049Sis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
6945049Sis 
6955049Sis 	outendian &= UCONV_OUT_NAT_ENDIAN;
6965049Sis 
6975049Sis 	if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
6985049Sis 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
6995049Sis 		    UCONV_BOM_SWAPPED;
7005049Sis 
7015049Sis 	for (; u8l < *utf8len; ) {
7025049Sis 		if (u8s[u8l] == 0 && do_not_ignore_null)
7035049Sis 			break;
7045049Sis 
7055049Sis 		/*
7065049Sis 		 * Collect a UTF-8 character and convert it to a UTF-32
7075049Sis 		 * character. In doing so, we screen out illegally formed
7085049Sis 		 * UTF-8 characters and treat such as illegal characters.
7095049Sis 		 * The algorithm at below also screens out anything bigger
7105049Sis 		 * than the U+10FFFF.
7115049Sis 		 *
7125049Sis 		 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
7135049Sis 		 * more details on the illegal values of UTF-8 character
7145049Sis 		 * bytes.
7155049Sis 		 */
7165049Sis 		hi = (uint32_t)u8s[u8l++];
7175049Sis 
7185049Sis 		if (hi > UCONV_ASCII_MAX) {
7195049Sis 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
7205049Sis 				return (EILSEQ);
7215049Sis 
7225049Sis 			first_b = hi;
723*6008Syy154373 			hi = hi & u8_masks_tbl[remaining_bytes];
7245049Sis 
7255049Sis 			for (; remaining_bytes > 0; remaining_bytes--) {
7265049Sis 				/*
7275049Sis 				 * If we have no more bytes, the current
7285049Sis 				 * UTF-8 character is incomplete.
7295049Sis 				 */
7305049Sis 				if (u8l >= *utf8len)
7315049Sis 					return (EINVAL);
7325049Sis 
7335049Sis 				lo = (uint32_t)u8s[u8l++];
7345049Sis 
7355049Sis 				if (first_b) {
7365049Sis 					if (lo < valid_min_2nd_byte[first_b] ||
7375049Sis 					    lo > valid_max_2nd_byte[first_b])
7385049Sis 						return (EILSEQ);
7395049Sis 					first_b = 0;
7405049Sis 				} else if (lo < UCONV_U8_BYTE_MIN ||
7415049Sis 				    lo > UCONV_U8_BYTE_MAX) {
7425049Sis 					return (EILSEQ);
7435049Sis 				}
7445049Sis 				hi = (hi << UCONV_U8_BIT_SHIFT) |
7455049Sis 				    (lo & UCONV_U8_BIT_MASK);
7465049Sis 			}
7475049Sis 		}
7485049Sis 
7495049Sis 		if (hi >= UCONV_U16_START) {
7505049Sis 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
7515049Sis 			    UCONV_U16_LO_MIN;
7525049Sis 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
7535049Sis 			    UCONV_U16_HI_MIN;
7545049Sis 
7555049Sis 			if ((u16l + 1) >= *utf16len)
7565049Sis 				return (E2BIG);
7575049Sis 
7585049Sis 			if (outendian) {
7595049Sis 				u16s[u16l++] = (uint16_t)hi;
7605049Sis 				u16s[u16l++] = (uint16_t)lo;
7615049Sis 			} else {
7625049Sis 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
7635049Sis 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
7645049Sis 			}
7655049Sis 		} else {
7665049Sis 			if (u16l >= *utf16len)
7675049Sis 				return (E2BIG);
7685049Sis 
7695049Sis 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
7705049Sis 			    BSWAP_16(((uint16_t)hi));
7715049Sis 		}
7725049Sis 	}
7735049Sis 
7745049Sis 	*utf16len = u16l;
7755049Sis 	*utf8len = u8l;
7765049Sis 
7775049Sis 	return (0);
7785049Sis }
7795049Sis 
7805049Sis int
uconv_u8tou32(const uchar_t * u8s,size_t * utf8len,uint32_t * u32s,size_t * utf32len,int flag)7815049Sis uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
7825049Sis     uint32_t *u32s, size_t *utf32len, int flag)
7835049Sis {
7845049Sis 	int inendian;
7855049Sis 	int outendian;
7865049Sis 	size_t u32l;
7875049Sis 	size_t u8l;
7885049Sis 	uint32_t hi;
7895049Sis 	uint32_t c;
7905049Sis 	int remaining_bytes;
7915049Sis 	int first_b;
7925049Sis 	boolean_t do_not_ignore_null;
7935049Sis 
7945049Sis 	if (u8s == NULL || utf8len == NULL)
7955049Sis 		return (EILSEQ);
7965049Sis 
7975049Sis 	if (u32s == NULL || utf32len == NULL)
7985049Sis 		return (E2BIG);
7995049Sis 
8005049Sis 	if (check_endian(flag, &inendian, &outendian) != 0)
8015049Sis 		return (EBADF);
8025049Sis 
8035049Sis 	u32l = u8l = 0;
8045049Sis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
8055049Sis 
8065049Sis 	outendian &= UCONV_OUT_NAT_ENDIAN;
8075049Sis 
8085049Sis 	if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
8095049Sis 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
8105049Sis 		    UCONV_BOM_SWAPPED_32;
8115049Sis 
8125049Sis 	for (; u8l < *utf8len; ) {
8135049Sis 		if (u8s[u8l] == 0 && do_not_ignore_null)
8145049Sis 			break;
8155049Sis 
8165049Sis 		hi = (uint32_t)u8s[u8l++];
8175049Sis 
8185049Sis 		if (hi > UCONV_ASCII_MAX) {
8195049Sis 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
8205049Sis 				return (EILSEQ);
8215049Sis 
8225049Sis 			first_b = hi;
823*6008Syy154373 			hi = hi & u8_masks_tbl[remaining_bytes];
8245049Sis 
8255049Sis 			for (; remaining_bytes > 0; remaining_bytes--) {
8265049Sis 				if (u8l >= *utf8len)
8275049Sis 					return (EINVAL);
8285049Sis 
8295049Sis 				c = (uint32_t)u8s[u8l++];
8305049Sis 
8315049Sis 				if (first_b) {
8325049Sis 					if (c < valid_min_2nd_byte[first_b] ||
8335049Sis 					    c > valid_max_2nd_byte[first_b])
8345049Sis 						return (EILSEQ);
8355049Sis 					first_b = 0;
8365049Sis 				} else if (c < UCONV_U8_BYTE_MIN ||
8375049Sis 				    c > UCONV_U8_BYTE_MAX) {
8385049Sis 					return (EILSEQ);
8395049Sis 				}
8405049Sis 				hi = (hi << UCONV_U8_BIT_SHIFT) |
8415049Sis 				    (c & UCONV_U8_BIT_MASK);
8425049Sis 			}
8435049Sis 		}
8445049Sis 
8455049Sis 		if (u32l >= *utf32len)
8465049Sis 			return (E2BIG);
8475049Sis 
8485049Sis 		u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
8495049Sis 	}
8505049Sis 
8515049Sis 	*utf32len = u32l;
8525049Sis 	*utf8len = u8l;
8535049Sis 
8545049Sis 	return (0);
8555049Sis }
856