15049Sis /*
25049Sis * CDDL HEADER START
35049Sis *
45049Sis * The contents of this file are subject to the terms of the
55049Sis * Common Development and Distribution License (the "License").
65049Sis * You may not use this file except in compliance with the License.
75049Sis *
85049Sis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
95049Sis * or http://www.opensolaris.org/os/licensing.
105049Sis * See the License for the specific language governing permissions
115049Sis * and limitations under the License.
125049Sis *
135049Sis * When distributing Covered Code, include this CDDL HEADER in each
145049Sis * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
155049Sis * If applicable, add the following below this CDDL HEADER, with the
165049Sis * fields enclosed by brackets "[]" replaced with your own identifying
175049Sis * information: Portions Copyright [yyyy] [name of copyright owner]
185049Sis *
195049Sis * CDDL HEADER END
205049Sis */
215049Sis /*
22*6008Syy154373 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
235049Sis * Use is subject to license terms.
245049Sis */
255049Sis
265049Sis #pragma ident "%Z%%M% %I% %E% SMI"
275049Sis
285049Sis /*
295049Sis * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
305049Sis * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
315049Sis * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
325049Sis * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
335049Sis * the section 3C man pages.
345049Sis * Interface stability: Committed
355049Sis */
365049Sis
375049Sis #include <sys/types.h>
385049Sis #ifdef _KERNEL
395049Sis #include <sys/param.h>
405049Sis #include <sys/sysmacros.h>
415049Sis #include <sys/systm.h>
425049Sis #include <sys/debug.h>
435049Sis #include <sys/kmem.h>
445049Sis #include <sys/sunddi.h>
455049Sis #else
465049Sis #include <sys/u8_textprep.h>
475049Sis #endif /* _KERNEL */
485049Sis #include <sys/byteorder.h>
495049Sis #include <sys/errno.h>
505049Sis
515049Sis
/*
 * UTF-16 surrogate handling.  High (leading) surrogates occupy
 * 0xd800..0xdbff and low (trailing) surrogates occupy 0xdc00..0xdfff.
 *
 * Despite its name, UCONV_U16_BIT_SHIFT is used by the converters as a
 * multiplier and divisor (0x400 == 1 << 10), not as a shift count.
 * UCONV_U16_BIT_MASK keeps the 20 bits a surrogate pair can carry and
 * UCONV_U16_START is the first code point that needs a surrogate pair.
 */
#define	UCONV_U16_HI_MIN	(0xd800U)
#define	UCONV_U16_HI_MAX	(0xdbffU)
#define	UCONV_U16_LO_MIN	(0xdc00U)
#define	UCONV_U16_LO_MAX	(0xdfffU)
#define	UCONV_U16_BIT_SHIFT	(0x0400U)
#define	UCONV_U16_BIT_MASK	(0x0fffffU)
#define	UCONV_U16_START		(0x010000U)

/* Largest code point of the Unicode and of the ASCII coding spaces. */
#define	UCONV_UNICODE_MAX	(0x10ffffU)
#define	UCONV_ASCII_MAX		(0x7fU)

/*
 * All input endian bits and all output endian bits of the flag argument.
 * A flag carrying both bits of one direction is rejected by
 * check_endian().
 */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Map "native" and "reversed" byte order onto the big/little endian flag
 * bits according to the byte order of the machine being compiled for.
 */
#ifdef	_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
#endif	/* _BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM), U+FEFF, as it reads in native byte order and
 * as it reads when a 16-bit or a 32-bit unit has its bytes reversed.
 */
#define	UCONV_BOM_NORMAL	(0xfeffU)
#define	UCONV_BOM_SWAPPED	(0xfffeU)
#define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)

/* Largest code point that fits in a UTF-8 character of 1, 2, 3, 4 bytes. */
#define	UCONV_U8_ONE_BYTE	(0x7fU)
#define	UCONV_U8_TWO_BYTES	(0x7ffU)
#define	UCONV_U8_THREE_BYTES	(0xffffU)
#define	UCONV_U8_FOUR_BYTES	(0x10ffffU)

/* Generic bounds for the second and later bytes of a UTF-8 character. */
#define	UCONV_U8_BYTE_MIN	(0x80U)
#define	UCONV_U8_BYTE_MAX	(0xbfU)

/*
 * A non-leading UTF-8 byte has the bit pattern "10xx xxxx": it carries
 * six payload bits, hence the shift of 6 and the mask of 0x3f.
 */
#define	UCONV_U8_BIT_SHIFT	6
#define	UCONV_U8_BIT_MASK	0x3f
1065049Sis
1075049Sis /*
1085049Sis * The following vector shows remaining bytes in a UTF-8 character.
1095049Sis * Index will be the first byte of the character.
1105049Sis */
1115049Sis static const uchar_t remaining_bytes_tbl[0x100] = {
1125049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1135049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1145049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1155049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1165049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1175049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1185049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1195049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1205049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1215049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1225049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1235049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1245049Sis
1255049Sis /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
1265049Sis 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1275049Sis
1285049Sis /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
1295049Sis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1305049Sis
1315049Sis /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
1325049Sis 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1335049Sis
1345049Sis /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
1355049Sis 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1365049Sis };
1375049Sis
1385049Sis /*
1395049Sis * The following is a vector of bit-masks to get used bits in
1405049Sis * the first byte of a UTF-8 character. Index is remaining bytes at above of
1415049Sis * the character.
1425049Sis */
143*6008Syy154373 #ifdef _KERNEL
144*6008Syy154373 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
145*6008Syy154373 #else
146*6008Syy154373 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
147*6008Syy154373 #endif /* _KERNEL */
1485049Sis
1495049Sis /*
1505049Sis * The following two vectors are to provide valid minimum and
1515049Sis * maximum values for the 2'nd byte of a multibyte UTF-8 character for
1525049Sis * better illegal sequence checking. The index value must be the value of
1535049Sis * the first byte of the UTF-8 character.
1545049Sis */
1555049Sis static const uchar_t valid_min_2nd_byte[0x100] = {
1565049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1575049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1585049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1595049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1605049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1615049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1625049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1635049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1645049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1655049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1665049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1675049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1685049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1695049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1705049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1715049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1725049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1735049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1745049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1755049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1765049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1775049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1785049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1795049Sis 0, 0, 0, 0, 0, 0, 0, 0,
1805049Sis
1815049Sis /* C0 C1 C2 C3 C4 C5 C6 C7 */
1825049Sis 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1835049Sis
1845049Sis /* C8 C9 CA CB CC CD CE CF */
1855049Sis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1865049Sis
1875049Sis /* D0 D1 D2 D3 D4 D5 D6 D7 */
1885049Sis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1895049Sis
1905049Sis /* D8 D9 DA DB DC DD DE DF */
1915049Sis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1925049Sis
1935049Sis /* E0 E1 E2 E3 E4 E5 E6 E7 */
1945049Sis 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1955049Sis
1965049Sis /* E8 E9 EA EB EC ED EE EF */
1975049Sis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1985049Sis
1995049Sis /* F0 F1 F2 F3 F4 F5 F6 F7 */
2005049Sis 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
2015049Sis
2025049Sis 0, 0, 0, 0, 0, 0, 0, 0
2035049Sis };
2045049Sis
2055049Sis static const uchar_t valid_max_2nd_byte[0x100] = {
2065049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2075049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2085049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2095049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2105049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2115049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2125049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2135049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2145049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2155049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2165049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2175049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2185049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2195049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2205049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2215049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2225049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2235049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2245049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2255049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2265049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2275049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2285049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2295049Sis 0, 0, 0, 0, 0, 0, 0, 0,
2305049Sis
2315049Sis /* C0 C1 C2 C3 C4 C5 C6 C7 */
2325049Sis 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2335049Sis
2345049Sis /* C8 C9 CA CB CC CD CE CF */
2355049Sis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2365049Sis
2375049Sis /* D0 D1 D2 D3 D4 D5 D6 D7 */
2385049Sis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2395049Sis
2405049Sis /* D8 D9 DA DB DC DD DE DF */
2415049Sis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2425049Sis
2435049Sis /* E0 E1 E2 E3 E4 E5 E6 E7 */
2445049Sis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2455049Sis
2465049Sis /* E8 E9 EA EB EC ED EE EF */
2475049Sis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
2485049Sis
2495049Sis /* F0 F1 F2 F3 F4 F5 F6 F7 */
2505049Sis 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
2515049Sis
2525049Sis 0, 0, 0, 0, 0, 0, 0, 0
2535049Sis };
2545049Sis
2555049Sis
2565049Sis static int
check_endian(int flag,int * in,int * out)2575049Sis check_endian(int flag, int *in, int *out)
2585049Sis {
2595049Sis *in = flag & UCONV_IN_ENDIAN_MASKS;
2605049Sis
2615049Sis /* You cannot have both. */
2625049Sis if (*in == UCONV_IN_ENDIAN_MASKS)
2635049Sis return (EBADF);
2645049Sis
2655049Sis if (*in == 0)
2665049Sis *in = UCONV_IN_NAT_ENDIAN;
2675049Sis
2685049Sis *out = flag & UCONV_OUT_ENDIAN_MASKS;
2695049Sis
2705049Sis /* You cannot have both. */
2715049Sis if (*out == UCONV_OUT_ENDIAN_MASKS)
2725049Sis return (EBADF);
2735049Sis
2745049Sis if (*out == 0)
2755049Sis *out = UCONV_OUT_NAT_ENDIAN;
2765049Sis
2775049Sis return (0);
2785049Sis }
2795049Sis
2805049Sis static boolean_t
check_bom16(const uint16_t * u16s,size_t u16l,int * in)2815049Sis check_bom16(const uint16_t *u16s, size_t u16l, int *in)
2825049Sis {
2835049Sis if (u16l > 0) {
2845049Sis if (*u16s == UCONV_BOM_NORMAL) {
2855049Sis *in = UCONV_IN_NAT_ENDIAN;
2865049Sis return (B_TRUE);
2875049Sis }
2885049Sis if (*u16s == UCONV_BOM_SWAPPED) {
2895049Sis *in = UCONV_IN_REV_ENDIAN;
2905049Sis return (B_TRUE);
2915049Sis }
2925049Sis }
2935049Sis
2945049Sis return (B_FALSE);
2955049Sis }
2965049Sis
2975049Sis static boolean_t
check_bom32(const uint32_t * u32s,size_t u32l,int * in)2985049Sis check_bom32(const uint32_t *u32s, size_t u32l, int *in)
2995049Sis {
3005049Sis if (u32l > 0) {
3015049Sis if (*u32s == UCONV_BOM_NORMAL) {
3025049Sis *in = UCONV_IN_NAT_ENDIAN;
3035049Sis return (B_TRUE);
3045049Sis }
3055049Sis if (*u32s == UCONV_BOM_SWAPPED_32) {
3065049Sis *in = UCONV_IN_REV_ENDIAN;
3075049Sis return (B_TRUE);
3085049Sis }
3095049Sis }
3105049Sis
3115049Sis return (B_FALSE);
3125049Sis }
3135049Sis
3145049Sis int
uconv_u16tou32(const uint16_t * u16s,size_t * utf16len,uint32_t * u32s,size_t * utf32len,int flag)3155049Sis uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
3165049Sis uint32_t *u32s, size_t *utf32len, int flag)
3175049Sis {
3185049Sis int inendian;
3195049Sis int outendian;
3205049Sis size_t u16l;
3215049Sis size_t u32l;
3225049Sis uint32_t hi;
3235049Sis uint32_t lo;
3245049Sis boolean_t do_not_ignore_null;
3255049Sis
3265049Sis /*
3275049Sis * Do preliminary validity checks on parameters and collect info on
3285049Sis * endians.
3295049Sis */
3305049Sis if (u16s == NULL || utf16len == NULL)
3315049Sis return (EILSEQ);
3325049Sis
3335049Sis if (u32s == NULL || utf32len == NULL)
3345049Sis return (E2BIG);
3355049Sis
3365049Sis if (check_endian(flag, &inendian, &outendian) != 0)
3375049Sis return (EBADF);
3385049Sis
3395049Sis /*
3405049Sis * Initialize input and output parameter buffer indices and
3415049Sis * temporary variables.
3425049Sis */
3435049Sis u16l = u32l = 0;
3445049Sis hi = 0;
3455049Sis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
3465049Sis
3475049Sis /*
3485049Sis * Check on the BOM at the beginning of the input buffer if required
3495049Sis * and if there is indeed one, process it.
3505049Sis */
3515049Sis if ((flag & UCONV_IN_ACCEPT_BOM) &&
3525049Sis check_bom16(u16s, *utf16len, &inendian))
3535049Sis u16l++;
3545049Sis
3555049Sis /*
3565049Sis * Reset inendian and outendian so that after this point, those can be
3575049Sis * used as condition values.
3585049Sis */
3595049Sis inendian &= UCONV_IN_NAT_ENDIAN;
3605049Sis outendian &= UCONV_OUT_NAT_ENDIAN;
3615049Sis
3625049Sis /*
3635049Sis * If there is something in the input buffer and if necessary and
3645049Sis * requested, save the BOM at the output buffer.
3655049Sis */
3665049Sis if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
3675049Sis u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
3685049Sis UCONV_BOM_SWAPPED_32;
3695049Sis
3705049Sis /*
3715049Sis * Do conversion; if encounter a surrogate pair, assemble high and
3725049Sis * low pair values to form a UTF-32 character. If a half of a pair
3735049Sis * exists alone, then, either it is an illegal (EILSEQ) or
3745049Sis * invalid (EINVAL) value.
3755049Sis */
3765049Sis for (; u16l < *utf16len; u16l++) {
3775049Sis if (u16s[u16l] == 0 && do_not_ignore_null)
3785049Sis break;
3795049Sis
3805049Sis lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
3815049Sis
3825049Sis if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
3835049Sis if (hi)
3845049Sis return (EILSEQ);
3855049Sis hi = lo;
3865049Sis continue;
3875049Sis } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
3885049Sis if (! hi)
3895049Sis return (EILSEQ);
3905049Sis lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
3915049Sis lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
3925049Sis + UCONV_U16_START;
3935049Sis hi = 0;
3945049Sis } else if (hi) {
3955049Sis return (EILSEQ);
3965049Sis }
3975049Sis
3985049Sis if (u32l >= *utf32len)
3995049Sis return (E2BIG);
4005049Sis
4015049Sis u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
4025049Sis }
4035049Sis
4045049Sis /*
4055049Sis * If high half didn't see low half, then, it's most likely the input
4065049Sis * parameter is incomplete.
4075049Sis */
4085049Sis if (hi)
4095049Sis return (EINVAL);
4105049Sis
4115049Sis /*
4125049Sis * Save the number of consumed and saved characters. They do not
4135049Sis * include terminating NULL character (U+0000) at the end of
4145049Sis * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
4155049Sis * the input buffer length is big enough to include the terminating
4165049Sis * NULL character).
4175049Sis */
4185049Sis *utf16len = u16l;
4195049Sis *utf32len = u32l;
4205049Sis
4215049Sis return (0);
4225049Sis }
4235049Sis
4245049Sis int
uconv_u16tou8(const uint16_t * u16s,size_t * utf16len,uchar_t * u8s,size_t * utf8len,int flag)4255049Sis uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
4265049Sis uchar_t *u8s, size_t *utf8len, int flag)
4275049Sis {
4285049Sis int inendian;
4295049Sis int outendian;
4305049Sis size_t u16l;
4315049Sis size_t u8l;
4325049Sis uint32_t hi;
4335049Sis uint32_t lo;
4345049Sis boolean_t do_not_ignore_null;
4355049Sis
4365049Sis if (u16s == NULL || utf16len == NULL)
4375049Sis return (EILSEQ);
4385049Sis
4395049Sis if (u8s == NULL || utf8len == NULL)
4405049Sis return (E2BIG);
4415049Sis
4425049Sis if (check_endian(flag, &inendian, &outendian) != 0)
4435049Sis return (EBADF);
4445049Sis
4455049Sis u16l = u8l = 0;
4465049Sis hi = 0;
4475049Sis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
4485049Sis
4495049Sis if ((flag & UCONV_IN_ACCEPT_BOM) &&
4505049Sis check_bom16(u16s, *utf16len, &inendian))
4515049Sis u16l++;
4525049Sis
4535049Sis inendian &= UCONV_IN_NAT_ENDIAN;
4545049Sis
4555049Sis for (; u16l < *utf16len; u16l++) {
4565049Sis if (u16s[u16l] == 0 && do_not_ignore_null)
4575049Sis break;
4585049Sis
4595049Sis lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
4605049Sis
4615049Sis if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
4625049Sis if (hi)
4635049Sis return (EILSEQ);
4645049Sis hi = lo;
4655049Sis continue;
4665049Sis } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
4675049Sis if (! hi)
4685049Sis return (EILSEQ);
4695049Sis lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
4705049Sis lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
4715049Sis + UCONV_U16_START;
4725049Sis hi = 0;
4735049Sis } else if (hi) {
4745049Sis return (EILSEQ);
4755049Sis }
4765049Sis
4775049Sis /*
4785049Sis * Now we convert a UTF-32 character into a UTF-8 character.
4795049Sis * Unicode coding space is between U+0000 and U+10FFFF;
4805049Sis * anything bigger is an illegal character.
4815049Sis */
4825049Sis if (lo <= UCONV_U8_ONE_BYTE) {
4835049Sis if (u8l >= *utf8len)
4845049Sis return (E2BIG);
4855049Sis u8s[u8l++] = (uchar_t)lo;
4865049Sis } else if (lo <= UCONV_U8_TWO_BYTES) {
4875049Sis if ((u8l + 1) >= *utf8len)
4885049Sis return (E2BIG);
4895049Sis u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
4905049Sis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
4915049Sis } else if (lo <= UCONV_U8_THREE_BYTES) {
4925049Sis if ((u8l + 2) >= *utf8len)
4935049Sis return (E2BIG);
4945049Sis u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
4955049Sis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
4965049Sis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
4975049Sis } else if (lo <= UCONV_U8_FOUR_BYTES) {
4985049Sis if ((u8l + 3) >= *utf8len)
4995049Sis return (E2BIG);
5005049Sis u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
5015049Sis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
5025049Sis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
5035049Sis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
5045049Sis } else {
5055049Sis return (EILSEQ);
5065049Sis }
5075049Sis }
5085049Sis
5095049Sis if (hi)
5105049Sis return (EINVAL);
5115049Sis
5125049Sis *utf16len = u16l;
5135049Sis *utf8len = u8l;
5145049Sis
5155049Sis return (0);
5165049Sis }
5175049Sis
5185049Sis int
uconv_u32tou16(const uint32_t * u32s,size_t * utf32len,uint16_t * u16s,size_t * utf16len,int flag)5195049Sis uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
5205049Sis uint16_t *u16s, size_t *utf16len, int flag)
5215049Sis {
5225049Sis int inendian;
5235049Sis int outendian;
5245049Sis size_t u16l;
5255049Sis size_t u32l;
5265049Sis uint32_t hi;
5275049Sis uint32_t lo;
5285049Sis boolean_t do_not_ignore_null;
5295049Sis
5305049Sis if (u32s == NULL || utf32len == NULL)
5315049Sis return (EILSEQ);
5325049Sis
5335049Sis if (u16s == NULL || utf16len == NULL)
5345049Sis return (E2BIG);
5355049Sis
5365049Sis if (check_endian(flag, &inendian, &outendian) != 0)
5375049Sis return (EBADF);
5385049Sis
5395049Sis u16l = u32l = 0;
5405049Sis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
5415049Sis
5425049Sis if ((flag & UCONV_IN_ACCEPT_BOM) &&
5435049Sis check_bom32(u32s, *utf32len, &inendian))
5445049Sis u32l++;
5455049Sis
5465049Sis inendian &= UCONV_IN_NAT_ENDIAN;
5475049Sis outendian &= UCONV_OUT_NAT_ENDIAN;
5485049Sis
5495049Sis if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
5505049Sis u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
5515049Sis UCONV_BOM_SWAPPED;
5525049Sis
5535049Sis for (; u32l < *utf32len; u32l++) {
5545049Sis if (u32s[u32l] == 0 && do_not_ignore_null)
5555049Sis break;
5565049Sis
5575049Sis hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
5585049Sis
5595049Sis /*
5605049Sis * Anything bigger than the Unicode coding space, i.e.,
5615049Sis * Unicode scalar value bigger than U+10FFFF, is an illegal
5625049Sis * character.
5635049Sis */
5645049Sis if (hi > UCONV_UNICODE_MAX)
5655049Sis return (EILSEQ);
5665049Sis
5675049Sis /*
5685049Sis * Anything bigger than U+FFFF must be converted into
5695049Sis * a surrogate pair in UTF-16.
5705049Sis */
5715049Sis if (hi >= UCONV_U16_START) {
5725049Sis lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
5735049Sis UCONV_U16_LO_MIN;
5745049Sis hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
5755049Sis UCONV_U16_HI_MIN;
5765049Sis
5775049Sis if ((u16l + 1) >= *utf16len)
5785049Sis return (E2BIG);
5795049Sis
5805049Sis if (outendian) {
5815049Sis u16s[u16l++] = (uint16_t)hi;
5825049Sis u16s[u16l++] = (uint16_t)lo;
5835049Sis } else {
5845049Sis u16s[u16l++] = BSWAP_16(((uint16_t)hi));
5855049Sis u16s[u16l++] = BSWAP_16(((uint16_t)lo));
5865049Sis }
5875049Sis } else {
5885049Sis if (u16l >= *utf16len)
5895049Sis return (E2BIG);
5905049Sis u16s[u16l++] = (outendian) ? (uint16_t)hi :
5915049Sis BSWAP_16(((uint16_t)hi));
5925049Sis }
5935049Sis }
5945049Sis
5955049Sis *utf16len = u16l;
5965049Sis *utf32len = u32l;
5975049Sis
5985049Sis return (0);
5995049Sis }
6005049Sis
6015049Sis int
uconv_u32tou8(const uint32_t * u32s,size_t * utf32len,uchar_t * u8s,size_t * utf8len,int flag)6025049Sis uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
6035049Sis uchar_t *u8s, size_t *utf8len, int flag)
6045049Sis {
6055049Sis int inendian;
6065049Sis int outendian;
6075049Sis size_t u32l;
6085049Sis size_t u8l;
6095049Sis uint32_t lo;
6105049Sis boolean_t do_not_ignore_null;
6115049Sis
6125049Sis if (u32s == NULL || utf32len == NULL)
6135049Sis return (EILSEQ);
6145049Sis
6155049Sis if (u8s == NULL || utf8len == NULL)
6165049Sis return (E2BIG);
6175049Sis
6185049Sis if (check_endian(flag, &inendian, &outendian) != 0)
6195049Sis return (EBADF);
6205049Sis
6215049Sis u32l = u8l = 0;
6225049Sis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
6235049Sis
6245049Sis if ((flag & UCONV_IN_ACCEPT_BOM) &&
6255049Sis check_bom32(u32s, *utf32len, &inendian))
6265049Sis u32l++;
6275049Sis
6285049Sis inendian &= UCONV_IN_NAT_ENDIAN;
6295049Sis
6305049Sis for (; u32l < *utf32len; u32l++) {
6315049Sis if (u32s[u32l] == 0 && do_not_ignore_null)
6325049Sis break;
6335049Sis
6345049Sis lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
6355049Sis
6365049Sis if (lo <= UCONV_U8_ONE_BYTE) {
6375049Sis if (u8l >= *utf8len)
6385049Sis return (E2BIG);
6395049Sis u8s[u8l++] = (uchar_t)lo;
6405049Sis } else if (lo <= UCONV_U8_TWO_BYTES) {
6415049Sis if ((u8l + 1) >= *utf8len)
6425049Sis return (E2BIG);
6435049Sis u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
6445049Sis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
6455049Sis } else if (lo <= UCONV_U8_THREE_BYTES) {
6465049Sis if ((u8l + 2) >= *utf8len)
6475049Sis return (E2BIG);
6485049Sis u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
6495049Sis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
6505049Sis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
6515049Sis } else if (lo <= UCONV_U8_FOUR_BYTES) {
6525049Sis if ((u8l + 3) >= *utf8len)
6535049Sis return (E2BIG);
6545049Sis u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
6555049Sis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
6565049Sis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
6575049Sis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
6585049Sis } else {
6595049Sis return (EILSEQ);
6605049Sis }
6615049Sis }
6625049Sis
6635049Sis *utf32len = u32l;
6645049Sis *utf8len = u8l;
6655049Sis
6665049Sis return (0);
6675049Sis }
6685049Sis
6695049Sis int
uconv_u8tou16(const uchar_t * u8s,size_t * utf8len,uint16_t * u16s,size_t * utf16len,int flag)6705049Sis uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
6715049Sis uint16_t *u16s, size_t *utf16len, int flag)
6725049Sis {
6735049Sis int inendian;
6745049Sis int outendian;
6755049Sis size_t u16l;
6765049Sis size_t u8l;
6775049Sis uint32_t hi;
6785049Sis uint32_t lo;
6795049Sis int remaining_bytes;
6805049Sis int first_b;
6815049Sis boolean_t do_not_ignore_null;
6825049Sis
6835049Sis if (u8s == NULL || utf8len == NULL)
6845049Sis return (EILSEQ);
6855049Sis
6865049Sis if (u16s == NULL || utf16len == NULL)
6875049Sis return (E2BIG);
6885049Sis
6895049Sis if (check_endian(flag, &inendian, &outendian) != 0)
6905049Sis return (EBADF);
6915049Sis
6925049Sis u16l = u8l = 0;
6935049Sis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
6945049Sis
6955049Sis outendian &= UCONV_OUT_NAT_ENDIAN;
6965049Sis
6975049Sis if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
6985049Sis u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
6995049Sis UCONV_BOM_SWAPPED;
7005049Sis
7015049Sis for (; u8l < *utf8len; ) {
7025049Sis if (u8s[u8l] == 0 && do_not_ignore_null)
7035049Sis break;
7045049Sis
7055049Sis /*
7065049Sis * Collect a UTF-8 character and convert it to a UTF-32
7075049Sis * character. In doing so, we screen out illegally formed
7085049Sis * UTF-8 characters and treat such as illegal characters.
7095049Sis * The algorithm at below also screens out anything bigger
7105049Sis * than the U+10FFFF.
7115049Sis *
7125049Sis * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
7135049Sis * more details on the illegal values of UTF-8 character
7145049Sis * bytes.
7155049Sis */
7165049Sis hi = (uint32_t)u8s[u8l++];
7175049Sis
7185049Sis if (hi > UCONV_ASCII_MAX) {
7195049Sis if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
7205049Sis return (EILSEQ);
7215049Sis
7225049Sis first_b = hi;
723*6008Syy154373 hi = hi & u8_masks_tbl[remaining_bytes];
7245049Sis
7255049Sis for (; remaining_bytes > 0; remaining_bytes--) {
7265049Sis /*
7275049Sis * If we have no more bytes, the current
7285049Sis * UTF-8 character is incomplete.
7295049Sis */
7305049Sis if (u8l >= *utf8len)
7315049Sis return (EINVAL);
7325049Sis
7335049Sis lo = (uint32_t)u8s[u8l++];
7345049Sis
7355049Sis if (first_b) {
7365049Sis if (lo < valid_min_2nd_byte[first_b] ||
7375049Sis lo > valid_max_2nd_byte[first_b])
7385049Sis return (EILSEQ);
7395049Sis first_b = 0;
7405049Sis } else if (lo < UCONV_U8_BYTE_MIN ||
7415049Sis lo > UCONV_U8_BYTE_MAX) {
7425049Sis return (EILSEQ);
7435049Sis }
7445049Sis hi = (hi << UCONV_U8_BIT_SHIFT) |
7455049Sis (lo & UCONV_U8_BIT_MASK);
7465049Sis }
7475049Sis }
7485049Sis
7495049Sis if (hi >= UCONV_U16_START) {
7505049Sis lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
7515049Sis UCONV_U16_LO_MIN;
7525049Sis hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
7535049Sis UCONV_U16_HI_MIN;
7545049Sis
7555049Sis if ((u16l + 1) >= *utf16len)
7565049Sis return (E2BIG);
7575049Sis
7585049Sis if (outendian) {
7595049Sis u16s[u16l++] = (uint16_t)hi;
7605049Sis u16s[u16l++] = (uint16_t)lo;
7615049Sis } else {
7625049Sis u16s[u16l++] = BSWAP_16(((uint16_t)hi));
7635049Sis u16s[u16l++] = BSWAP_16(((uint16_t)lo));
7645049Sis }
7655049Sis } else {
7665049Sis if (u16l >= *utf16len)
7675049Sis return (E2BIG);
7685049Sis
7695049Sis u16s[u16l++] = (outendian) ? (uint16_t)hi :
7705049Sis BSWAP_16(((uint16_t)hi));
7715049Sis }
7725049Sis }
7735049Sis
7745049Sis *utf16len = u16l;
7755049Sis *utf8len = u8l;
7765049Sis
7775049Sis return (0);
7785049Sis }
7795049Sis
7805049Sis int
uconv_u8tou32(const uchar_t * u8s,size_t * utf8len,uint32_t * u32s,size_t * utf32len,int flag)7815049Sis uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
7825049Sis uint32_t *u32s, size_t *utf32len, int flag)
7835049Sis {
7845049Sis int inendian;
7855049Sis int outendian;
7865049Sis size_t u32l;
7875049Sis size_t u8l;
7885049Sis uint32_t hi;
7895049Sis uint32_t c;
7905049Sis int remaining_bytes;
7915049Sis int first_b;
7925049Sis boolean_t do_not_ignore_null;
7935049Sis
7945049Sis if (u8s == NULL || utf8len == NULL)
7955049Sis return (EILSEQ);
7965049Sis
7975049Sis if (u32s == NULL || utf32len == NULL)
7985049Sis return (E2BIG);
7995049Sis
8005049Sis if (check_endian(flag, &inendian, &outendian) != 0)
8015049Sis return (EBADF);
8025049Sis
8035049Sis u32l = u8l = 0;
8045049Sis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
8055049Sis
8065049Sis outendian &= UCONV_OUT_NAT_ENDIAN;
8075049Sis
8085049Sis if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
8095049Sis u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
8105049Sis UCONV_BOM_SWAPPED_32;
8115049Sis
8125049Sis for (; u8l < *utf8len; ) {
8135049Sis if (u8s[u8l] == 0 && do_not_ignore_null)
8145049Sis break;
8155049Sis
8165049Sis hi = (uint32_t)u8s[u8l++];
8175049Sis
8185049Sis if (hi > UCONV_ASCII_MAX) {
8195049Sis if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
8205049Sis return (EILSEQ);
8215049Sis
8225049Sis first_b = hi;
823*6008Syy154373 hi = hi & u8_masks_tbl[remaining_bytes];
8245049Sis
8255049Sis for (; remaining_bytes > 0; remaining_bytes--) {
8265049Sis if (u8l >= *utf8len)
8275049Sis return (EINVAL);
8285049Sis
8295049Sis c = (uint32_t)u8s[u8l++];
8305049Sis
8315049Sis if (first_b) {
8325049Sis if (c < valid_min_2nd_byte[first_b] ||
8335049Sis c > valid_max_2nd_byte[first_b])
8345049Sis return (EILSEQ);
8355049Sis first_b = 0;
8365049Sis } else if (c < UCONV_U8_BYTE_MIN ||
8375049Sis c > UCONV_U8_BYTE_MAX) {
8385049Sis return (EILSEQ);
8395049Sis }
8405049Sis hi = (hi << UCONV_U8_BIT_SHIFT) |
8415049Sis (c & UCONV_U8_BIT_MASK);
8425049Sis }
8435049Sis }
8445049Sis
8455049Sis if (u32l >= *utf32len)
8465049Sis return (E2BIG);
8475049Sis
8485049Sis u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
8495049Sis }
8505049Sis
8515049Sis *utf32len = u32l;
8525049Sis *utf8len = u8l;
8535049Sis
8545049Sis return (0);
8555049Sis }
856