15049Sis /* 25049Sis * CDDL HEADER START 35049Sis * 45049Sis * The contents of this file are subject to the terms of the 55049Sis * Common Development and Distribution License (the "License"). 65049Sis * You may not use this file except in compliance with the License. 75049Sis * 85049Sis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 95049Sis * or http://www.opensolaris.org/os/licensing. 105049Sis * See the License for the specific language governing permissions 115049Sis * and limitations under the License. 125049Sis * 135049Sis * When distributing Covered Code, include this CDDL HEADER in each 145049Sis * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 155049Sis * If applicable, add the following below this CDDL HEADER, with the 165049Sis * fields enclosed by brackets "[]" replaced with your own identifying 175049Sis * information: Portions Copyright [yyyy] [name of copyright owner] 185049Sis * 195049Sis * CDDL HEADER END 205049Sis */ 215049Sis /* 22*6008Syy154373 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 235049Sis * Use is subject to license terms. 245049Sis */ 255049Sis 265049Sis #pragma ident "%Z%%M% %I% %E% SMI" 275049Sis 285049Sis /* 295049Sis * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32. 305049Sis * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517) 315049Sis * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F), 325049Sis * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also 335049Sis * the section 3C man pages. 345049Sis * Interface stability: Committed 355049Sis */ 365049Sis 375049Sis #include <sys/types.h> 385049Sis #ifdef _KERNEL 395049Sis #include <sys/param.h> 405049Sis #include <sys/sysmacros.h> 415049Sis #include <sys/systm.h> 425049Sis #include <sys/debug.h> 435049Sis #include <sys/kmem.h> 445049Sis #include <sys/sunddi.h> 455049Sis #else 465049Sis #include <sys/u8_textprep.h> 475049Sis #endif /* _KERNEL */ 485049Sis #include <sys/byteorder.h> 495049Sis #include <sys/errno.h> 505049Sis 515049Sis 525049Sis /* 535049Sis * The max and min values of high and low surrogate pairs of UTF-16, 545049Sis * UTF-16 bit shift value, bit mask, and starting value outside of BMP. 555049Sis */ 565049Sis #define UCONV_U16_HI_MIN (0xd800U) 575049Sis #define UCONV_U16_HI_MAX (0xdbffU) 585049Sis #define UCONV_U16_LO_MIN (0xdc00U) 595049Sis #define UCONV_U16_LO_MAX (0xdfffU) 605049Sis #define UCONV_U16_BIT_SHIFT (0x0400U) 615049Sis #define UCONV_U16_BIT_MASK (0x0fffffU) 625049Sis #define UCONV_U16_START (0x010000U) 635049Sis 645049Sis /* The maximum value of Unicode coding space and ASCII coding space. */ 655049Sis #define UCONV_UNICODE_MAX (0x10ffffU) 665049Sis #define UCONV_ASCII_MAX (0x7fU) 675049Sis 685049Sis /* The mask values for input and output endians. */ 695049Sis #define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN) 705049Sis #define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN) 715049Sis 725049Sis /* Native and reversed endian macros. */ 735049Sis #ifdef _BIG_ENDIAN 745049Sis #define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN 755049Sis #define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN 765049Sis #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN 775049Sis #define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN 785049Sis #else 795049Sis #define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN 805049Sis #define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN 815049Sis #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN 825049Sis #define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN 835049Sis #endif /* _BIG_ENDIAN */ 845049Sis 855049Sis /* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */ 865049Sis #define UCONV_BOM_NORMAL (0xfeffU) 875049Sis #define UCONV_BOM_SWAPPED (0xfffeU) 885049Sis #define UCONV_BOM_SWAPPED_32 (0xfffe0000U) 895049Sis 905049Sis /* UTF-32 boundaries based on UTF-8 character byte lengths. */ 915049Sis #define UCONV_U8_ONE_BYTE (0x7fU) 925049Sis #define UCONV_U8_TWO_BYTES (0x7ffU) 935049Sis #define UCONV_U8_THREE_BYTES (0xffffU) 945049Sis #define UCONV_U8_FOUR_BYTES (0x10ffffU) 955049Sis 965049Sis /* The common minimum and maximum values at the UTF-8 character bytes. */ 975049Sis #define UCONV_U8_BYTE_MIN (0x80U) 985049Sis #define UCONV_U8_BYTE_MAX (0xbfU) 995049Sis 1005049Sis /* 1015049Sis * The following "6" and "0x3f" came from "10xx xxxx" bit representation of 1025049Sis * UTF-8 character bytes. 1035049Sis */ 1045049Sis #define UCONV_U8_BIT_SHIFT 6 1055049Sis #define UCONV_U8_BIT_MASK 0x3f 1065049Sis 1075049Sis /* 1085049Sis * The following vector shows remaining bytes in a UTF-8 character. 1095049Sis * Index will be the first byte of the character. 1105049Sis */ 1115049Sis static const uchar_t remaining_bytes_tbl[0x100] = { 1125049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1135049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1145049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1155049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1165049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1175049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1185049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1195049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1205049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1215049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1225049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1235049Sis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1245049Sis 1255049Sis /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ 1265049Sis 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1275049Sis 1285049Sis /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ 1295049Sis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1305049Sis 1315049Sis /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ 1325049Sis 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1335049Sis 1345049Sis /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ 1355049Sis 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 1365049Sis }; 1375049Sis 1385049Sis /* 1395049Sis * The following is a vector of bit-masks to get used bits in 1405049Sis * the first byte of a UTF-8 character. Index is remaining bytes at above of 1415049Sis * the character. 1425049Sis */ 143*6008Syy154373 #ifdef _KERNEL 144*6008Syy154373 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 145*6008Syy154373 #else 146*6008Syy154373 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 147*6008Syy154373 #endif /* _KERNEL */ 1485049Sis 1495049Sis /* 1505049Sis * The following two vectors are to provide valid minimum and 1515049Sis * maximum values for the 2'nd byte of a multibyte UTF-8 character for 1525049Sis * better illegal sequence checking. The index value must be the value of 1535049Sis * the first byte of the UTF-8 character. 1545049Sis */ 1555049Sis static const uchar_t valid_min_2nd_byte[0x100] = { 1565049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1575049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1585049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1595049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1605049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1615049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1625049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1635049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1645049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1655049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1665049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1675049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1685049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1695049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1705049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1715049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1725049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1735049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1745049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1755049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1765049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1775049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1785049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1795049Sis 0, 0, 0, 0, 0, 0, 0, 0, 1805049Sis 1815049Sis /* C0 C1 C2 C3 C4 C5 C6 C7 */ 1825049Sis 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 1835049Sis 1845049Sis /* C8 C9 CA CB CC CD CE CF */ 1855049Sis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 1865049Sis 1875049Sis /* D0 D1 D2 D3 D4 D5 D6 D7 */ 1885049Sis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 1895049Sis 1905049Sis /* D8 D9 DA DB DC DD DE DF */ 1915049Sis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 1925049Sis 1935049Sis /* E0 E1 E2 E3 E4 E5 E6 E7 */ 1945049Sis 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 1955049Sis 1965049Sis /* E8 E9 EA EB EC ED EE EF */ 1975049Sis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 1985049Sis 1995049Sis /* F0 F1 F2 F3 F4 F5 F6 F7 */ 2005049Sis 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, 2015049Sis 2025049Sis 0, 0, 0, 0, 0, 0, 0, 0 2035049Sis }; 2045049Sis 2055049Sis static const uchar_t valid_max_2nd_byte[0x100] = { 2065049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2075049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2085049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2095049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2105049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2115049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2125049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2135049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2145049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2155049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2165049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2175049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2185049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2195049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2205049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2215049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2225049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2235049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2245049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2255049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2265049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2275049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2285049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2295049Sis 0, 0, 0, 0, 0, 0, 0, 0, 2305049Sis 2315049Sis /* C0 C1 C2 C3 C4 C5 C6 C7 */ 2325049Sis 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 2335049Sis 2345049Sis /* C8 C9 CA CB CC CD CE CF */ 2355049Sis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 2365049Sis 2375049Sis /* D0 D1 D2 D3 D4 D5 D6 D7 */ 2385049Sis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 2395049Sis 2405049Sis /* D8 D9 DA DB DC DD DE DF */ 2415049Sis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 2425049Sis 2435049Sis /* E0 E1 E2 E3 E4 E5 E6 E7 */ 2445049Sis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 2455049Sis 2465049Sis /* E8 E9 EA EB EC ED EE EF */ 2475049Sis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, 2485049Sis 2495049Sis /* F0 F1 F2 F3 F4 F5 F6 F7 */ 2505049Sis 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, 2515049Sis 2525049Sis 0, 0, 0, 0, 0, 0, 0, 0 2535049Sis }; 2545049Sis 2555049Sis 2565049Sis static int 2575049Sis check_endian(int flag, int *in, int *out) 2585049Sis { 2595049Sis *in = flag & UCONV_IN_ENDIAN_MASKS; 2605049Sis 2615049Sis /* You cannot have both. */ 2625049Sis if (*in == UCONV_IN_ENDIAN_MASKS) 2635049Sis return (EBADF); 2645049Sis 2655049Sis if (*in == 0) 2665049Sis *in = UCONV_IN_NAT_ENDIAN; 2675049Sis 2685049Sis *out = flag & UCONV_OUT_ENDIAN_MASKS; 2695049Sis 2705049Sis /* You cannot have both. */ 2715049Sis if (*out == UCONV_OUT_ENDIAN_MASKS) 2725049Sis return (EBADF); 2735049Sis 2745049Sis if (*out == 0) 2755049Sis *out = UCONV_OUT_NAT_ENDIAN; 2765049Sis 2775049Sis return (0); 2785049Sis } 2795049Sis 2805049Sis static boolean_t 2815049Sis check_bom16(const uint16_t *u16s, size_t u16l, int *in) 2825049Sis { 2835049Sis if (u16l > 0) { 2845049Sis if (*u16s == UCONV_BOM_NORMAL) { 2855049Sis *in = UCONV_IN_NAT_ENDIAN; 2865049Sis return (B_TRUE); 2875049Sis } 2885049Sis if (*u16s == UCONV_BOM_SWAPPED) { 2895049Sis *in = UCONV_IN_REV_ENDIAN; 2905049Sis return (B_TRUE); 2915049Sis } 2925049Sis } 2935049Sis 2945049Sis return (B_FALSE); 2955049Sis } 2965049Sis 2975049Sis static boolean_t 2985049Sis check_bom32(const uint32_t *u32s, size_t u32l, int *in) 2995049Sis { 3005049Sis if (u32l > 0) { 3015049Sis if (*u32s == UCONV_BOM_NORMAL) { 3025049Sis *in = UCONV_IN_NAT_ENDIAN; 3035049Sis return (B_TRUE); 3045049Sis } 3055049Sis if (*u32s == UCONV_BOM_SWAPPED_32) { 3065049Sis *in = UCONV_IN_REV_ENDIAN; 3075049Sis return (B_TRUE); 3085049Sis } 3095049Sis } 3105049Sis 3115049Sis return (B_FALSE); 3125049Sis } 3135049Sis 3145049Sis int 3155049Sis uconv_u16tou32(const uint16_t *u16s, size_t *utf16len, 3165049Sis uint32_t *u32s, size_t *utf32len, int flag) 3175049Sis { 3185049Sis int inendian; 3195049Sis int outendian; 3205049Sis size_t u16l; 3215049Sis size_t u32l; 3225049Sis uint32_t hi; 3235049Sis uint32_t lo; 3245049Sis boolean_t do_not_ignore_null; 3255049Sis 3265049Sis /* 3275049Sis * Do preliminary validity checks on parameters and collect info on 3285049Sis * endians. 3295049Sis */ 3305049Sis if (u16s == NULL || utf16len == NULL) 3315049Sis return (EILSEQ); 3325049Sis 3335049Sis if (u32s == NULL || utf32len == NULL) 3345049Sis return (E2BIG); 3355049Sis 3365049Sis if (check_endian(flag, &inendian, &outendian) != 0) 3375049Sis return (EBADF); 3385049Sis 3395049Sis /* 3405049Sis * Initialize input and output parameter buffer indices and 3415049Sis * temporary variables. 3425049Sis */ 3435049Sis u16l = u32l = 0; 3445049Sis hi = 0; 3455049Sis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 3465049Sis 3475049Sis /* 3485049Sis * Check on the BOM at the beginning of the input buffer if required 3495049Sis * and if there is indeed one, process it. 3505049Sis */ 3515049Sis if ((flag & UCONV_IN_ACCEPT_BOM) && 3525049Sis check_bom16(u16s, *utf16len, &inendian)) 3535049Sis u16l++; 3545049Sis 3555049Sis /* 3565049Sis * Reset inendian and outendian so that after this point, those can be 3575049Sis * used as condition values. 3585049Sis */ 3595049Sis inendian &= UCONV_IN_NAT_ENDIAN; 3605049Sis outendian &= UCONV_OUT_NAT_ENDIAN; 3615049Sis 3625049Sis /* 3635049Sis * If there is something in the input buffer and if necessary and 3645049Sis * requested, save the BOM at the output buffer. 3655049Sis */ 3665049Sis if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 3675049Sis u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : 3685049Sis UCONV_BOM_SWAPPED_32; 3695049Sis 3705049Sis /* 3715049Sis * Do conversion; if encounter a surrogate pair, assemble high and 3725049Sis * low pair values to form a UTF-32 character. If a half of a pair 3735049Sis * exists alone, then, either it is an illegal (EILSEQ) or 3745049Sis * invalid (EINVAL) value. 3755049Sis */ 3765049Sis for (; u16l < *utf16len; u16l++) { 3775049Sis if (u16s[u16l] == 0 && do_not_ignore_null) 3785049Sis break; 3795049Sis 3805049Sis lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); 3815049Sis 3825049Sis if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { 3835049Sis if (hi) 3845049Sis return (EILSEQ); 3855049Sis hi = lo; 3865049Sis continue; 3875049Sis } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { 3885049Sis if (! hi) 3895049Sis return (EILSEQ); 3905049Sis lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + 3915049Sis lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) 3925049Sis + UCONV_U16_START; 3935049Sis hi = 0; 3945049Sis } else if (hi) { 3955049Sis return (EILSEQ); 3965049Sis } 3975049Sis 3985049Sis if (u32l >= *utf32len) 3995049Sis return (E2BIG); 4005049Sis 4015049Sis u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo); 4025049Sis } 4035049Sis 4045049Sis /* 4055049Sis * If high half didn't see low half, then, it's most likely the input 4065049Sis * parameter is incomplete. 4075049Sis */ 4085049Sis if (hi) 4095049Sis return (EINVAL); 4105049Sis 4115049Sis /* 4125049Sis * Save the number of consumed and saved characters. They do not 4135049Sis * include terminating NULL character (U+0000) at the end of 4145049Sis * the input buffer (even when UCONV_IGNORE_NULL isn't specified and 4155049Sis * the input buffer length is big enough to include the terminating 4165049Sis * NULL character). 4175049Sis */ 4185049Sis *utf16len = u16l; 4195049Sis *utf32len = u32l; 4205049Sis 4215049Sis return (0); 4225049Sis } 4235049Sis 4245049Sis int 4255049Sis uconv_u16tou8(const uint16_t *u16s, size_t *utf16len, 4265049Sis uchar_t *u8s, size_t *utf8len, int flag) 4275049Sis { 4285049Sis int inendian; 4295049Sis int outendian; 4305049Sis size_t u16l; 4315049Sis size_t u8l; 4325049Sis uint32_t hi; 4335049Sis uint32_t lo; 4345049Sis boolean_t do_not_ignore_null; 4355049Sis 4365049Sis if (u16s == NULL || utf16len == NULL) 4375049Sis return (EILSEQ); 4385049Sis 4395049Sis if (u8s == NULL || utf8len == NULL) 4405049Sis return (E2BIG); 4415049Sis 4425049Sis if (check_endian(flag, &inendian, &outendian) != 0) 4435049Sis return (EBADF); 4445049Sis 4455049Sis u16l = u8l = 0; 4465049Sis hi = 0; 4475049Sis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 4485049Sis 4495049Sis if ((flag & UCONV_IN_ACCEPT_BOM) && 4505049Sis check_bom16(u16s, *utf16len, &inendian)) 4515049Sis u16l++; 4525049Sis 4535049Sis inendian &= UCONV_IN_NAT_ENDIAN; 4545049Sis 4555049Sis for (; u16l < *utf16len; u16l++) { 4565049Sis if (u16s[u16l] == 0 && do_not_ignore_null) 4575049Sis break; 4585049Sis 4595049Sis lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); 4605049Sis 4615049Sis if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { 4625049Sis if (hi) 4635049Sis return (EILSEQ); 4645049Sis hi = lo; 4655049Sis continue; 4665049Sis } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { 4675049Sis if (! hi) 4685049Sis return (EILSEQ); 4695049Sis lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + 4705049Sis lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) 4715049Sis + UCONV_U16_START; 4725049Sis hi = 0; 4735049Sis } else if (hi) { 4745049Sis return (EILSEQ); 4755049Sis } 4765049Sis 4775049Sis /* 4785049Sis * Now we convert a UTF-32 character into a UTF-8 character. 4795049Sis * Unicode coding space is between U+0000 and U+10FFFF; 4805049Sis * anything bigger is an illegal character. 4815049Sis */ 4825049Sis if (lo <= UCONV_U8_ONE_BYTE) { 4835049Sis if (u8l >= *utf8len) 4845049Sis return (E2BIG); 4855049Sis u8s[u8l++] = (uchar_t)lo; 4865049Sis } else if (lo <= UCONV_U8_TWO_BYTES) { 4875049Sis if ((u8l + 1) >= *utf8len) 4885049Sis return (E2BIG); 4895049Sis u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); 4905049Sis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); 4915049Sis } else if (lo <= UCONV_U8_THREE_BYTES) { 4925049Sis if ((u8l + 2) >= *utf8len) 4935049Sis return (E2BIG); 4945049Sis u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); 4955049Sis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); 4965049Sis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); 4975049Sis } else if (lo <= UCONV_U8_FOUR_BYTES) { 4985049Sis if ((u8l + 3) >= *utf8len) 4995049Sis return (E2BIG); 5005049Sis u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); 5015049Sis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); 5025049Sis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); 5035049Sis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); 5045049Sis } else { 5055049Sis return (EILSEQ); 5065049Sis } 5075049Sis } 5085049Sis 5095049Sis if (hi) 5105049Sis return (EINVAL); 5115049Sis 5125049Sis *utf16len = u16l; 5135049Sis *utf8len = u8l; 5145049Sis 5155049Sis return (0); 5165049Sis } 5175049Sis 5185049Sis int 5195049Sis uconv_u32tou16(const uint32_t *u32s, size_t *utf32len, 5205049Sis uint16_t *u16s, size_t *utf16len, int flag) 5215049Sis { 5225049Sis int inendian; 5235049Sis int outendian; 5245049Sis size_t u16l; 5255049Sis size_t u32l; 5265049Sis uint32_t hi; 5275049Sis uint32_t lo; 5285049Sis boolean_t do_not_ignore_null; 5295049Sis 5305049Sis if (u32s == NULL || utf32len == NULL) 5315049Sis return (EILSEQ); 5325049Sis 5335049Sis if (u16s == NULL || utf16len == NULL) 5345049Sis return (E2BIG); 5355049Sis 5365049Sis if (check_endian(flag, &inendian, &outendian) != 0) 5375049Sis return (EBADF); 5385049Sis 5395049Sis u16l = u32l = 0; 5405049Sis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 5415049Sis 5425049Sis if ((flag & UCONV_IN_ACCEPT_BOM) && 5435049Sis check_bom32(u32s, *utf32len, &inendian)) 5445049Sis u32l++; 5455049Sis 5465049Sis inendian &= UCONV_IN_NAT_ENDIAN; 5475049Sis outendian &= UCONV_OUT_NAT_ENDIAN; 5485049Sis 5495049Sis if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 5505049Sis u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : 5515049Sis UCONV_BOM_SWAPPED; 5525049Sis 5535049Sis for (; u32l < *utf32len; u32l++) { 5545049Sis if (u32s[u32l] == 0 && do_not_ignore_null) 5555049Sis break; 5565049Sis 5575049Sis hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); 5585049Sis 5595049Sis /* 5605049Sis * Anything bigger than the Unicode coding space, i.e., 5615049Sis * Unicode scalar value bigger than U+10FFFF, is an illegal 5625049Sis * character. 5635049Sis */ 5645049Sis if (hi > UCONV_UNICODE_MAX) 5655049Sis return (EILSEQ); 5665049Sis 5675049Sis /* 5685049Sis * Anything bigger than U+FFFF must be converted into 5695049Sis * a surrogate pair in UTF-16. 5705049Sis */ 5715049Sis if (hi >= UCONV_U16_START) { 5725049Sis lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + 5735049Sis UCONV_U16_LO_MIN; 5745049Sis hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + 5755049Sis UCONV_U16_HI_MIN; 5765049Sis 5775049Sis if ((u16l + 1) >= *utf16len) 5785049Sis return (E2BIG); 5795049Sis 5805049Sis if (outendian) { 5815049Sis u16s[u16l++] = (uint16_t)hi; 5825049Sis u16s[u16l++] = (uint16_t)lo; 5835049Sis } else { 5845049Sis u16s[u16l++] = BSWAP_16(((uint16_t)hi)); 5855049Sis u16s[u16l++] = BSWAP_16(((uint16_t)lo)); 5865049Sis } 5875049Sis } else { 5885049Sis if (u16l >= *utf16len) 5895049Sis return (E2BIG); 5905049Sis u16s[u16l++] = (outendian) ? (uint16_t)hi : 5915049Sis BSWAP_16(((uint16_t)hi)); 5925049Sis } 5935049Sis } 5945049Sis 5955049Sis *utf16len = u16l; 5965049Sis *utf32len = u32l; 5975049Sis 5985049Sis return (0); 5995049Sis } 6005049Sis 6015049Sis int 6025049Sis uconv_u32tou8(const uint32_t *u32s, size_t *utf32len, 6035049Sis uchar_t *u8s, size_t *utf8len, int flag) 6045049Sis { 6055049Sis int inendian; 6065049Sis int outendian; 6075049Sis size_t u32l; 6085049Sis size_t u8l; 6095049Sis uint32_t lo; 6105049Sis boolean_t do_not_ignore_null; 6115049Sis 6125049Sis if (u32s == NULL || utf32len == NULL) 6135049Sis return (EILSEQ); 6145049Sis 6155049Sis if (u8s == NULL || utf8len == NULL) 6165049Sis return (E2BIG); 6175049Sis 6185049Sis if (check_endian(flag, &inendian, &outendian) != 0) 6195049Sis return (EBADF); 6205049Sis 6215049Sis u32l = u8l = 0; 6225049Sis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 6235049Sis 6245049Sis if ((flag & UCONV_IN_ACCEPT_BOM) && 6255049Sis check_bom32(u32s, *utf32len, &inendian)) 6265049Sis u32l++; 6275049Sis 6285049Sis inendian &= UCONV_IN_NAT_ENDIAN; 6295049Sis 6305049Sis for (; u32l < *utf32len; u32l++) { 6315049Sis if (u32s[u32l] == 0 && do_not_ignore_null) 6325049Sis break; 6335049Sis 6345049Sis lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); 6355049Sis 6365049Sis if (lo <= UCONV_U8_ONE_BYTE) { 6375049Sis if (u8l >= *utf8len) 6385049Sis return (E2BIG); 6395049Sis u8s[u8l++] = (uchar_t)lo; 6405049Sis } else if (lo <= UCONV_U8_TWO_BYTES) { 6415049Sis if ((u8l + 1) >= *utf8len) 6425049Sis return (E2BIG); 6435049Sis u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); 6445049Sis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); 6455049Sis } else if (lo <= UCONV_U8_THREE_BYTES) { 6465049Sis if ((u8l + 2) >= *utf8len) 6475049Sis return (E2BIG); 6485049Sis u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); 6495049Sis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); 6505049Sis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); 6515049Sis } else if (lo <= UCONV_U8_FOUR_BYTES) { 6525049Sis if ((u8l + 3) >= *utf8len) 6535049Sis return (E2BIG); 6545049Sis u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); 6555049Sis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); 6565049Sis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); 6575049Sis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); 6585049Sis } else { 6595049Sis return (EILSEQ); 6605049Sis } 6615049Sis } 6625049Sis 6635049Sis *utf32len = u32l; 6645049Sis *utf8len = u8l; 6655049Sis 6665049Sis return (0); 6675049Sis } 6685049Sis 6695049Sis int 6705049Sis uconv_u8tou16(const uchar_t *u8s, size_t *utf8len, 6715049Sis uint16_t *u16s, size_t *utf16len, int flag) 6725049Sis { 6735049Sis int inendian; 6745049Sis int outendian; 6755049Sis size_t u16l; 6765049Sis size_t u8l; 6775049Sis uint32_t hi; 6785049Sis uint32_t lo; 6795049Sis int remaining_bytes; 6805049Sis int first_b; 6815049Sis boolean_t do_not_ignore_null; 6825049Sis 6835049Sis if (u8s == NULL || utf8len == NULL) 6845049Sis return (EILSEQ); 6855049Sis 6865049Sis if (u16s == NULL || utf16len == NULL) 6875049Sis return (E2BIG); 6885049Sis 6895049Sis if (check_endian(flag, &inendian, &outendian) != 0) 6905049Sis return (EBADF); 6915049Sis 6925049Sis u16l = u8l = 0; 6935049Sis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 6945049Sis 6955049Sis outendian &= UCONV_OUT_NAT_ENDIAN; 6965049Sis 6975049Sis if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 6985049Sis u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : 6995049Sis UCONV_BOM_SWAPPED; 7005049Sis 7015049Sis for (; u8l < *utf8len; ) { 7025049Sis if (u8s[u8l] == 0 && do_not_ignore_null) 7035049Sis break; 7045049Sis 7055049Sis /* 7065049Sis * Collect a UTF-8 character and convert it to a UTF-32 7075049Sis * character. In doing so, we screen out illegally formed 7085049Sis * UTF-8 characters and treat such as illegal characters. 7095049Sis * The algorithm at below also screens out anything bigger 7105049Sis * than the U+10FFFF. 7115049Sis * 7125049Sis * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for 7135049Sis * more details on the illegal values of UTF-8 character 7145049Sis * bytes. 7155049Sis */ 7165049Sis hi = (uint32_t)u8s[u8l++]; 7175049Sis 7185049Sis if (hi > UCONV_ASCII_MAX) { 7195049Sis if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) 7205049Sis return (EILSEQ); 7215049Sis 7225049Sis first_b = hi; 723*6008Syy154373 hi = hi & u8_masks_tbl[remaining_bytes]; 7245049Sis 7255049Sis for (; remaining_bytes > 0; remaining_bytes--) { 7265049Sis /* 7275049Sis * If we have no more bytes, the current 7285049Sis * UTF-8 character is incomplete. 7295049Sis */ 7305049Sis if (u8l >= *utf8len) 7315049Sis return (EINVAL); 7325049Sis 7335049Sis lo = (uint32_t)u8s[u8l++]; 7345049Sis 7355049Sis if (first_b) { 7365049Sis if (lo < valid_min_2nd_byte[first_b] || 7375049Sis lo > valid_max_2nd_byte[first_b]) 7385049Sis return (EILSEQ); 7395049Sis first_b = 0; 7405049Sis } else if (lo < UCONV_U8_BYTE_MIN || 7415049Sis lo > UCONV_U8_BYTE_MAX) { 7425049Sis return (EILSEQ); 7435049Sis } 7445049Sis hi = (hi << UCONV_U8_BIT_SHIFT) | 7455049Sis (lo & UCONV_U8_BIT_MASK); 7465049Sis } 7475049Sis } 7485049Sis 7495049Sis if (hi >= UCONV_U16_START) { 7505049Sis lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + 7515049Sis UCONV_U16_LO_MIN; 7525049Sis hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + 7535049Sis UCONV_U16_HI_MIN; 7545049Sis 7555049Sis if ((u16l + 1) >= *utf16len) 7565049Sis return (E2BIG); 7575049Sis 7585049Sis if (outendian) { 7595049Sis u16s[u16l++] = (uint16_t)hi; 7605049Sis u16s[u16l++] = (uint16_t)lo; 7615049Sis } else { 7625049Sis u16s[u16l++] = BSWAP_16(((uint16_t)hi)); 7635049Sis u16s[u16l++] = BSWAP_16(((uint16_t)lo)); 7645049Sis } 7655049Sis } else { 7665049Sis if (u16l >= *utf16len) 7675049Sis return (E2BIG); 7685049Sis 7695049Sis u16s[u16l++] = (outendian) ? (uint16_t)hi : 7705049Sis BSWAP_16(((uint16_t)hi)); 7715049Sis } 7725049Sis } 7735049Sis 7745049Sis *utf16len = u16l; 7755049Sis *utf8len = u8l; 7765049Sis 7775049Sis return (0); 7785049Sis } 7795049Sis 7805049Sis int 7815049Sis uconv_u8tou32(const uchar_t *u8s, size_t *utf8len, 7825049Sis uint32_t *u32s, size_t *utf32len, int flag) 7835049Sis { 7845049Sis int inendian; 7855049Sis int outendian; 7865049Sis size_t u32l; 7875049Sis size_t u8l; 7885049Sis uint32_t hi; 7895049Sis uint32_t c; 7905049Sis int remaining_bytes; 7915049Sis int first_b; 7925049Sis boolean_t do_not_ignore_null; 7935049Sis 7945049Sis if (u8s == NULL || utf8len == NULL) 7955049Sis return (EILSEQ); 7965049Sis 7975049Sis if (u32s == NULL || utf32len == NULL) 7985049Sis return (E2BIG); 7995049Sis 8005049Sis if (check_endian(flag, &inendian, &outendian) != 0) 8015049Sis return (EBADF); 8025049Sis 8035049Sis u32l = u8l = 0; 8045049Sis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 8055049Sis 8065049Sis outendian &= UCONV_OUT_NAT_ENDIAN; 8075049Sis 8085049Sis if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 8095049Sis u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : 8105049Sis UCONV_BOM_SWAPPED_32; 8115049Sis 8125049Sis for (; u8l < *utf8len; ) { 8135049Sis if (u8s[u8l] == 0 && do_not_ignore_null) 8145049Sis break; 8155049Sis 8165049Sis hi = (uint32_t)u8s[u8l++]; 8175049Sis 8185049Sis if (hi > UCONV_ASCII_MAX) { 8195049Sis if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) 8205049Sis return (EILSEQ); 8215049Sis 8225049Sis first_b = hi; 823*6008Syy154373 hi = hi & u8_masks_tbl[remaining_bytes]; 8245049Sis 8255049Sis for (; remaining_bytes > 0; remaining_bytes--) { 8265049Sis if (u8l >= *utf8len) 8275049Sis return (EINVAL); 8285049Sis 8295049Sis c = (uint32_t)u8s[u8l++]; 8305049Sis 8315049Sis if (first_b) { 8325049Sis if (c < valid_min_2nd_byte[first_b] || 8335049Sis c > valid_max_2nd_byte[first_b]) 8345049Sis return (EILSEQ); 8355049Sis first_b = 0; 8365049Sis } else if (c < UCONV_U8_BYTE_MIN || 8375049Sis c > UCONV_U8_BYTE_MAX) { 8385049Sis return (EILSEQ); 8395049Sis } 8405049Sis hi = (hi << UCONV_U8_BIT_SHIFT) | 8415049Sis (c & UCONV_U8_BIT_MASK); 8425049Sis } 8435049Sis } 8445049Sis 8455049Sis if (u32l >= *utf32len) 8465049Sis return (E2BIG); 8475049Sis 8485049Sis u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi); 8495049Sis } 8505049Sis 8515049Sis *utf32len = u32l; 8525049Sis *utf8len = u8l; 8535049Sis 8545049Sis return (0); 8555049Sis } 856