xref: /onnv-gate/usr/src/common/unicode/u8_textprep.c (revision 7012:932cd76b5633)
15049Sis /*
25049Sis  * CDDL HEADER START
35049Sis  *
45049Sis  * The contents of this file are subject to the terms of the
55049Sis  * Common Development and Distribution License (the "License").
65049Sis  * You may not use this file except in compliance with the License.
75049Sis  *
85049Sis  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
95049Sis  * or http://www.opensolaris.org/os/licensing.
105049Sis  * See the License for the specific language governing permissions
115049Sis  * and limitations under the License.
125049Sis  *
135049Sis  * When distributing Covered Code, include this CDDL HEADER in each
145049Sis  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
155049Sis  * If applicable, add the following below this CDDL HEADER, with the
165049Sis  * fields enclosed by brackets "[]" replaced with your own identifying
175049Sis  * information: Portions Copyright [yyyy] [name of copyright owner]
185049Sis  *
195049Sis  * CDDL HEADER END
205049Sis  */
215049Sis /*
22*7012Sis  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
235049Sis  * Use is subject to license terms.
245049Sis  */
255049Sis 
265049Sis #pragma ident	"%Z%%M%	%I%	%E% SMI"
275049Sis 
285049Sis 
295049Sis /*
305049Sis  * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
315049Sis  *
325049Sis  * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
335049Sis  * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
345049Sis  * the section 3C man pages.
355049Sis  * Interface stability: Committed.
365049Sis  */
375049Sis 
385049Sis #include <sys/types.h>
395049Sis #ifdef	_KERNEL
405049Sis #include <sys/param.h>
415049Sis #include <sys/sysmacros.h>
425049Sis #include <sys/systm.h>
435049Sis #include <sys/debug.h>
445049Sis #include <sys/kmem.h>
455049Sis #include <sys/ddi.h>
465049Sis #include <sys/sunddi.h>
475049Sis #else
485049Sis #include <sys/u8_textprep.h>
495049Sis #include <strings.h>
505049Sis #endif	/* _KERNEL */
515049Sis #include <sys/byteorder.h>
525049Sis #include <sys/errno.h>
535049Sis #include <sys/u8_textprep_data.h>
545049Sis 
555049Sis 
/* The maximum possible number of bytes in a UTF-8 character. */
#define	U8_MB_CUR_MAX			(4)

/*
 * The maximum number of bytes needed for a UTF-8 character to cover
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* The maximum possible number of bytes in a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * The maximum number of characters in a combining/conjoining sequence and
 * the actual upper bound of a combining/conjoining sequence.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/* The combining class value for a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)
775049Sis 
/*
 * Hangul related macros.
 *
 * The first and the last of Hangul syllables, Hangul Jamo Leading consonants,
 * Vowels, and optional Trailing consonants in Unicode scalar values.
 *
 * Please note that U8_HANGUL_JAMO_T_FIRST is 0x11A7 below and not the
 * actual U+11A8. Because the trailing consonant is optional, the value is
 * pre-decremented by one; consequently U8_HANGUL_JAMO_T() uses a strict
 * "greater than" comparison against it.
 *
 * Each of the 19 modern leading consonants has 588 possible syllables since
 * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for
 * the no trailing consonant case, i.e., 21 x 28 = 588.
 *
 * U8_HANGUL_JAMO_1ST_BYTE can be used as a quick check on whether
 * a character may be a Hangul Jamo, but a match does not guarantee that it
 * is one; it only means that it is likely.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the scalar value 'b' as a three-byte UTF-8 sequence at s[i], s[j],
 * and s[k]. This expands to three separate statements (with their own
 * semicolons), so it must only be used inside a braced block and never as
 * the sole body of an unbraced if/else/loop.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	(s)[(i)] = (uchar_t)(0xE0U | (((uint32_t)(b) & 0xF000U) >> 12)); \
	(s)[(j)] = (uchar_t)(0x80U | (((uint32_t)(b) & 0x0FC0U) >> 6)); \
	(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));

/* Is 'u' a modern leading consonant (L)? */
#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

/* Is 'u' a modern vowel (V)? */
#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

/* Is 'u' a modern trailing consonant (T)? Note the strict lower bound. */
#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/* Is 'u' anywhere between the first L and the last T, inclusive? */
#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/* Is 'u' a precomposed Hangul syllable? */
#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

/* With state 's' being "just saw an L", can the vowel 'u' be composed? */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

/* With state 's' being "just saw an LV", can the trailing 'u' be composed? */
#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
1385049Sis 
/* The types of decomposition mappings. */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/* The indicator for 16-bit table. */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)

/*
 * The following are some convenience macros.
 *
 * U8_PUT_3BYTES_INTO_UTF32, U8_SIMPLE_SWAP, and U8_SWAP_COMB_MARKS expand
 * to complete statements that already end with a semicolon; use them only
 * inside a braced block.
 */

/* Assemble the scalar value of a three-byte UTF-8 character into 'u'. */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	(u) = (((uint32_t)(b1) & 0x0F) << 12) | \
		(((uint32_t)(b2) & 0x3F) << 6) | \
		((uint32_t)(b3) & 0x3F);

/* Exchange 'a' and 'b' using 't' as the temporary. */
#define	U8_SIMPLE_SWAP(a, b, t) \
	(t) = (a); \
	(a) = (b); \
	(b) = (t);

#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

#define	U8_ISASCII(c)			(((uchar_t)(c)) < 0x80U)

/*
 * Swap two combining marks, identified by their indices 'a' and 'b', in
 * place within the byte buffer.
 *
 * The macro assumes that the two characters to be swapped are adjacent to
 * each other and that 'a' comes before 'b'. If the assumptions are not met,
 * the macro will fail.
 *
 * It relies on the following names being in scope at the point of use:
 * k (loop index), u8s (the byte buffer), u8t (scratch space large enough
 * for one character), start[] (byte offset of each character in u8s),
 * disp[] (byte length of each character), comb_class[] (combining class of
 * each character), and tc (temporary used for both swaps).
 *
 * start[(a)] is left unchanged since the character that ends up first still
 * begins at the same offset; start[(b)] is recomputed from the new first
 * character's length.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	for (k = 0; k < disp[(a)]; k++) \
		u8t[k] = u8s[start[(a)] + k]; \
	for (k = 0; k < disp[(b)]; k++) \
		u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
	start[(b)] = start[(a)] + disp[(b)]; \
	for (k = 0; k < disp[(a)]; k++) \
		u8s[start[(b)] + k] = u8t[k]; \
	U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
	U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
1795049Sis 
/*
 * The possible states during normalization. The Hangul states record which
 * kinds of Jamo (L: leading consonant, V: vowel, T: trailing consonant)
 * have been seen most recently.
 */
typedef enum {
	U8_STATE_START = 0,
	U8_STATE_HANGUL_L = 1,
	U8_STATE_HANGUL_LV = 2,
	U8_STATE_HANGUL_LVT = 3,
	U8_STATE_HANGUL_V = 4,
	U8_STATE_HANGUL_T = 5,
	U8_STATE_COMBINING_MARK = 6
} u8_normalization_states_t;
1905049Sis 
1915049Sis /*
1925049Sis  * The three vectors at below are used to check bytes of a given UTF-8
1935049Sis  * character are valid and not containing any malformed byte values.
1945049Sis  *
1955049Sis  * We used to have a quite relaxed UTF-8 binary representation but then there
1965049Sis  * was some security related issues and so the Unicode Consortium defined
1975049Sis  * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
1985049Sis  * one more time at the Unicode 3.2. The following three tables are based on
1995049Sis  * that.
2005049Sis  */
2015049Sis 
2025049Sis #define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)
2035049Sis 
2045049Sis #define	I_				U8_ILLEGAL_CHAR
2055049Sis #define	O_				U8_OUT_OF_RANGE_CHAR
2065049Sis 
2075049Sis const int8_t u8_number_of_bytes[0x100] = {
2085049Sis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2095049Sis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2105049Sis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2115049Sis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2125049Sis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2135049Sis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2145049Sis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2155049Sis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2165049Sis 
2175049Sis /*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
2185049Sis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2195049Sis 
2205049Sis /*  	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
2215049Sis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2225049Sis 
2235049Sis /*  	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
2245049Sis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2255049Sis 
2265049Sis /*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
2275049Sis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2285049Sis 
2295049Sis /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
2305049Sis 	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
2315049Sis 
2325049Sis /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
2335049Sis 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
2345049Sis 
2355049Sis /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
2365049Sis 	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
2375049Sis 
2385049Sis /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
2395049Sis 	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
2405049Sis };
2415049Sis 
2425049Sis #undef	I_
2435049Sis #undef	O_
2445049Sis 
/*
 * Indexed by the first byte of a multi-byte character: the smallest value
 * allowed for the second byte. Entries for bytes that do not start a valid
 * multi-byte character are zero.
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
/*	00 - BF  */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
/*	F8    F9    FA    FB    FC    FD    FE    FF    */
	0,    0,    0,    0,    0,    0,    0,    0,
};
2865049Sis 
/*
 * Indexed by the first byte of a multi-byte character: the largest value
 * allowed for the second byte. Entries for bytes that do not start a valid
 * multi-byte character are zero.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
/*	00 - BF  */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
/*	F8    F9    FA    FB    FC    FD    FE    FF    */
	0,    0,    0,    0,    0,    0,    0,    0,
};
3285049Sis 
3295049Sis 
3305049Sis /*
3315049Sis  * The u8_validate() validates on the given UTF-8 character string and
3325049Sis  * calculate the byte length. It is quite similar to mblen(3C) except that
3335049Sis  * this will validate against the list of characters if required and
3345049Sis  * specific to UTF-8 and Unicode.
3355049Sis  */
3365049Sis int
u8_validate(char * u8str,size_t n,char ** list,int flag,int * errnum)337*7012Sis u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
3385049Sis {
3395049Sis 	uchar_t *ib;
3405049Sis 	uchar_t *ibtail;
3415049Sis 	uchar_t **p;
3425049Sis 	uchar_t *s1;
3435049Sis 	uchar_t *s2;
3445049Sis 	uchar_t f;
3455049Sis 	int sz;
3465049Sis 	size_t i;
3475049Sis 	int ret_val;
3485049Sis 	boolean_t second;
3495049Sis 	boolean_t no_need_to_validate_entire;
3505049Sis 	boolean_t check_additional;
3515049Sis 	boolean_t validate_ucs2_range_only;
3525049Sis 
3535049Sis 	if (! u8str)
3545049Sis 		return (0);
3555049Sis 
3565049Sis 	ib = (uchar_t *)u8str;
3575049Sis 	ibtail = ib + n;
3585049Sis 
3595049Sis 	ret_val = 0;
3605049Sis 
3615049Sis 	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
3625049Sis 	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
3635049Sis 	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
3645049Sis 
3655049Sis 	while (ib < ibtail) {
3665049Sis 		/*
3675049Sis 		 * The first byte of a UTF-8 character tells how many
3685049Sis 		 * bytes will follow for the character. If the first byte
3695049Sis 		 * is an illegal byte value or out of range value, we just
3705049Sis 		 * return -1 with an appropriate error number.
3715049Sis 		 */
3725049Sis 		sz = u8_number_of_bytes[*ib];
3735049Sis 		if (sz == U8_ILLEGAL_CHAR) {
374*7012Sis 			*errnum = EILSEQ;
3755049Sis 			return (-1);
3765049Sis 		}
3775049Sis 
3785049Sis 		if (sz == U8_OUT_OF_RANGE_CHAR ||
3795049Sis 		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
380*7012Sis 			*errnum = ERANGE;
3815049Sis 			return (-1);
3825049Sis 		}
3835049Sis 
3845049Sis 		/*
3855049Sis 		 * If we don't have enough bytes to check on, that's also
3865049Sis 		 * an error. As you can see, we give illegal byte sequence
3875049Sis 		 * checking higher priority then EINVAL cases.
3885049Sis 		 */
3895049Sis 		if ((ibtail - ib) < sz) {
390*7012Sis 			*errnum = EINVAL;
3915049Sis 			return (-1);
3925049Sis 		}
3935049Sis 
3945049Sis 		if (sz == 1) {
3955049Sis 			ib++;
3965049Sis 			ret_val++;
3975049Sis 		} else {
3985049Sis 			/*
3995049Sis 			 * Check on the multi-byte UTF-8 character. For more
4005049Sis 			 * details on this, see comment added for the used
4015049Sis 			 * data structures at the beginning of the file.
4025049Sis 			 */
4035049Sis 			f = *ib++;
4045049Sis 			ret_val++;
4055049Sis 			second = B_TRUE;
4065049Sis 			for (i = 1; i < sz; i++) {
4075049Sis 				if (second) {
4085049Sis 					if (*ib < u8_valid_min_2nd_byte[f] ||
4095049Sis 					    *ib > u8_valid_max_2nd_byte[f]) {
410*7012Sis 						*errnum = EILSEQ;
4115049Sis 						return (-1);
4125049Sis 					}
4135049Sis 					second = B_FALSE;
4145049Sis 				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
415*7012Sis 					*errnum = EILSEQ;
4165049Sis 					return (-1);
4175049Sis 				}
4185049Sis 				ib++;
4195049Sis 				ret_val++;
4205049Sis 			}
4215049Sis 		}
4225049Sis 
4235049Sis 		if (check_additional) {
4245049Sis 			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
4255049Sis 				s1 = ib - sz;
4265049Sis 				s2 = p[i];
4275049Sis 				while (s1 < ib) {
4285049Sis 					if (*s1 != *s2 || *s2 == '\0')
4295049Sis 						break;
4305049Sis 					s1++;
4315049Sis 					s2++;
4325049Sis 				}
4335049Sis 
4345049Sis 				if (s1 >= ib && *s2 == '\0') {
435*7012Sis 					*errnum = EBADF;
4365049Sis 					return (-1);
4375049Sis 				}
4385049Sis 			}
4395049Sis 		}
4405049Sis 
4415049Sis 		if (no_need_to_validate_entire)
4425049Sis 			break;
4435049Sis 	}
4445049Sis 
4455049Sis 	return (ret_val);
4465049Sis }
4475049Sis 
4485049Sis /*
4495049Sis  * The do_case_conv() looks at the mapping tables and returns found
4505049Sis  * bytes if any. If not found, the input bytes are returned. The function
4515049Sis  * always terminate the return bytes with a null character assuming that
4525049Sis  * there are plenty of room to do so.
4535049Sis  *
4545049Sis  * The case conversions are simple case conversions mapping a character to
4555049Sis  * another character as specified in the Unicode data. The byte size of
4565049Sis  * the mapped character could be different from that of the input character.
4575049Sis  *
4585049Sis  * The return value is the byte length of the returned character excluding
4595049Sis  * the terminating null byte.
4605049Sis  */
4615049Sis static size_t
do_case_conv(int uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t is_it_toupper)4625049Sis do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
4635049Sis {
4645049Sis 	size_t i;
4655049Sis 	uint16_t b1 = 0;
4665049Sis 	uint16_t b2 = 0;
4675049Sis 	uint16_t b3 = 0;
4685049Sis 	uint16_t b3_tbl;
4695049Sis 	uint16_t b3_base;
4705049Sis 	uint16_t b4 = 0;
4715049Sis 	size_t start_id;
4725049Sis 	size_t end_id;
4735049Sis 
4745049Sis 	/*
4755049Sis 	 * At this point, the only possible values for sz are 2, 3, and 4.
4765049Sis 	 * The u8s should point to a vector that is well beyond the size of
4775049Sis 	 * 5 bytes.
4785049Sis 	 */
4795049Sis 	if (sz == 2) {
4805049Sis 		b3 = u8s[0] = s[0];
4815049Sis 		b4 = u8s[1] = s[1];
4825049Sis 	} else if (sz == 3) {
4835049Sis 		b2 = u8s[0] = s[0];
4845049Sis 		b3 = u8s[1] = s[1];
4855049Sis 		b4 = u8s[2] = s[2];
4865049Sis 	} else if (sz == 4) {
4875049Sis 		b1 = u8s[0] = s[0];
4885049Sis 		b2 = u8s[1] = s[1];
4895049Sis 		b3 = u8s[2] = s[2];
4905049Sis 		b4 = u8s[3] = s[3];
4915049Sis 	} else {
4925049Sis 		/* This is not possible but just in case as a fallback. */
4935049Sis 		if (is_it_toupper)
4945049Sis 			*u8s = U8_ASCII_TOUPPER(*s);
4955049Sis 		else
4965049Sis 			*u8s = U8_ASCII_TOLOWER(*s);
4975049Sis 		u8s[1] = '\0';
4985049Sis 
4995049Sis 		return (1);
5005049Sis 	}
5015049Sis 	u8s[sz] = '\0';
5025049Sis 
5035049Sis 	/*
5045049Sis 	 * Let's find out if we have a corresponding character.
5055049Sis 	 */
5065049Sis 	b1 = u8_common_b1_tbl[uv][b1];
5075049Sis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
5085049Sis 		return ((size_t)sz);
5095049Sis 
5105049Sis 	b2 = u8_case_common_b2_tbl[uv][b1][b2];
5115049Sis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
5125049Sis 		return ((size_t)sz);
5135049Sis 
5145049Sis 	if (is_it_toupper) {
5155049Sis 		b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
5165049Sis 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5175049Sis 			return ((size_t)sz);
5185049Sis 
5195049Sis 		start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
5205049Sis 		end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
5215049Sis 
5225049Sis 		/* Either there is no match or an error at the table. */
5235049Sis 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5245049Sis 			return ((size_t)sz);
5255049Sis 
5265049Sis 		b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
5275049Sis 
5285049Sis 		for (i = 0; start_id < end_id; start_id++)
5295049Sis 			u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
5305049Sis 	} else {
5315049Sis 		b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
5325049Sis 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5335049Sis 			return ((size_t)sz);
5345049Sis 
5355049Sis 		start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
5365049Sis 		end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
5375049Sis 
5385049Sis 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5395049Sis 			return ((size_t)sz);
5405049Sis 
5415049Sis 		b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
5425049Sis 
5435049Sis 		for (i = 0; start_id < end_id; start_id++)
5445049Sis 			u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
5455049Sis 	}
5465049Sis 
5475049Sis 	/*
5485049Sis 	 * If i is still zero, that means there is no corresponding character.
5495049Sis 	 */
5505049Sis 	if (i == 0)
5515049Sis 		return ((size_t)sz);
5525049Sis 
5535049Sis 	u8s[i] = '\0';
5545049Sis 
5555049Sis 	return (i);
5565049Sis }
5575049Sis 
5585049Sis /*
5595049Sis  * The do_case_compare() function compares the two input strings, s1 and s2,
5605049Sis  * one character at a time doing case conversions if applicable and return
5615049Sis  * the comparison result as like strcmp().
5625049Sis  *
5635049Sis  * Since, in empirical sense, most of text data are 7-bit ASCII characters,
5645049Sis  * we treat the 7-bit ASCII characters as a special case trying to yield
5655049Sis  * faster processing time.
5665049Sis  */
5675049Sis static int
do_case_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,boolean_t is_it_toupper,int * errnum)5685049Sis do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
569*7012Sis 	size_t n2, boolean_t is_it_toupper, int *errnum)
5705049Sis {
5715049Sis 	int f;
5725049Sis 	int sz1;
5735049Sis 	int sz2;
5745049Sis 	size_t j;
5755049Sis 	size_t i1;
5765049Sis 	size_t i2;
5775049Sis 	uchar_t u8s1[U8_MB_CUR_MAX + 1];
5785049Sis 	uchar_t u8s2[U8_MB_CUR_MAX + 1];
5795049Sis 
5805049Sis 	i1 = i2 = 0;
5815049Sis 	while (i1 < n1 && i2 < n2) {
5825049Sis 		/*
5835049Sis 		 * Find out what would be the byte length for this UTF-8
5845049Sis 		 * character at string s1 and also find out if this is
5855049Sis 		 * an illegal start byte or not and if so, issue a proper
586*7012Sis 		 * error number and yet treat this byte as a character.
5875049Sis 		 */
5885049Sis 		sz1 = u8_number_of_bytes[*s1];
5895049Sis 		if (sz1 < 0) {
590*7012Sis 			*errnum = EILSEQ;
5915049Sis 			sz1 = 1;
5925049Sis 		}
5935049Sis 
5945049Sis 		/*
5955049Sis 		 * For 7-bit ASCII characters mainly, we do a quick case
5965049Sis 		 * conversion right at here.
5975049Sis 		 *
5985049Sis 		 * If we don't have enough bytes for this character, issue
5995049Sis 		 * an EINVAL error and use what are available.
6005049Sis 		 *
6015049Sis 		 * If we have enough bytes, find out if there is
6025049Sis 		 * a corresponding uppercase character and if so, copy over
6035049Sis 		 * the bytes for a comparison later. If there is no
6045049Sis 		 * corresponding uppercase character, then, use what we have
6055049Sis 		 * for the comparison.
6065049Sis 		 */
6075049Sis 		if (sz1 == 1) {
6085049Sis 			if (is_it_toupper)
6095049Sis 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
6105049Sis 			else
6115049Sis 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
6125049Sis 			s1++;
6135049Sis 			u8s1[1] = '\0';
6145049Sis 		} else if ((i1 + sz1) > n1) {
615*7012Sis 			*errnum = EINVAL;
6165049Sis 			for (j = 0; (i1 + j) < n1; )
6175049Sis 				u8s1[j++] = *s1++;
6185049Sis 			u8s1[j] = '\0';
6195049Sis 		} else {
6205049Sis 			(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
6215049Sis 			s1 += sz1;
6225049Sis 		}
6235049Sis 
6245049Sis 		/* Do the same for the string s2. */
6255049Sis 		sz2 = u8_number_of_bytes[*s2];
6265049Sis 		if (sz2 < 0) {
627*7012Sis 			*errnum = EILSEQ;
6285049Sis 			sz2 = 1;
6295049Sis 		}
6305049Sis 
6315049Sis 		if (sz2 == 1) {
6325049Sis 			if (is_it_toupper)
6335049Sis 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
6345049Sis 			else
6355049Sis 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
6365049Sis 			s2++;
6375049Sis 			u8s2[1] = '\0';
6385049Sis 		} else if ((i2 + sz2) > n2) {
639*7012Sis 			*errnum = EINVAL;
6405049Sis 			for (j = 0; (i2 + j) < n2; )
6415049Sis 				u8s2[j++] = *s2++;
6425049Sis 			u8s2[j] = '\0';
6435049Sis 		} else {
6445049Sis 			(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
6455049Sis 			s2 += sz2;
6465049Sis 		}
6475049Sis 
6485049Sis 		/* Now compare the two characters. */
6495049Sis 		if (sz1 == 1 && sz2 == 1) {
6505049Sis 			if (*u8s1 > *u8s2)
6515049Sis 				return (1);
6525049Sis 			if (*u8s1 < *u8s2)
6535049Sis 				return (-1);
6545049Sis 		} else {
6555049Sis 			f = strcmp((const char *)u8s1, (const char *)u8s2);
6565049Sis 			if (f != 0)
6575049Sis 				return (f);
6585049Sis 		}
6595049Sis 
6605049Sis 		/*
6615049Sis 		 * They were the same. Let's move on to the next
6625049Sis 		 * characters then.
6635049Sis 		 */
6645049Sis 		i1 += sz1;
6655049Sis 		i2 += sz2;
6665049Sis 	}
6675049Sis 
6685049Sis 	/*
6695049Sis 	 * We compared until the end of either or both strings.
6705049Sis 	 *
6715049Sis 	 * If we reached to or went over the ends for the both, that means
6725049Sis 	 * they are the same.
6735049Sis 	 *
6745049Sis 	 * If we reached only one of the two ends, that means the other string
6755049Sis 	 * has something which then the fact can be used to determine
6765049Sis 	 * the return value.
6775049Sis 	 */
6785049Sis 	if (i1 >= n1) {
6795049Sis 		if (i2 >= n2)
6805049Sis 			return (0);
6815049Sis 		return (-1);
6825049Sis 	}
6835049Sis 	return (1);
6845049Sis }
6855049Sis 
6865049Sis /*
6875049Sis  * The combining_class() function checks on the given bytes and find out
6885049Sis  * the corresponding Unicode combining class value. The return value 0 means
6895049Sis  * it is a Starter. Any illegal UTF-8 character will also be treated as
6905049Sis  * a Starter.
6915049Sis  */
6925049Sis static uchar_t
combining_class(size_t uv,uchar_t * s,size_t sz)6935049Sis combining_class(size_t uv, uchar_t *s, size_t sz)
6945049Sis {
6955049Sis 	uint16_t b1 = 0;
6965049Sis 	uint16_t b2 = 0;
6975049Sis 	uint16_t b3 = 0;
6985049Sis 	uint16_t b4 = 0;
6995049Sis 
7005049Sis 	if (sz == 1 || sz > 4)
7015049Sis 		return (0);
7025049Sis 
7035049Sis 	if (sz == 2) {
7045049Sis 		b3 = s[0];
7055049Sis 		b4 = s[1];
7065049Sis 	} else if (sz == 3) {
7075049Sis 		b2 = s[0];
7085049Sis 		b3 = s[1];
7095049Sis 		b4 = s[2];
7105049Sis 	} else if (sz == 4) {
7115049Sis 		b1 = s[0];
7125049Sis 		b2 = s[1];
7135049Sis 		b3 = s[2];
7145049Sis 		b4 = s[3];
7155049Sis 	}
7165049Sis 
7175049Sis 	b1 = u8_common_b1_tbl[uv][b1];
7185049Sis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
7195049Sis 		return (0);
7205049Sis 
7215049Sis 	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
7225049Sis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
7235049Sis 		return (0);
7245049Sis 
7255049Sis 	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
7265049Sis 	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
7275049Sis 		return (0);
7285049Sis 
7295049Sis 	return (u8_combining_class_b4_tbl[uv][b3][b4]);
7305049Sis }
7315049Sis 
7325049Sis /*
7335049Sis  * The do_decomp() function finds out a matching decomposition if any
7345049Sis  * and return. If there is no match, the input bytes are copied and returned.
7355049Sis  * The function also checks if there is a Hangul, decomposes it if necessary
7365049Sis  * and returns.
7375049Sis  *
7385049Sis  * To save time, a single byte 7-bit ASCII character should be handled by
7395049Sis  * the caller.
7405049Sis  *
7415049Sis  * The function returns the number of bytes returned sans always terminating
7425049Sis  * the null byte. It will also return a state that will tell if there was
7435049Sis  * a Hangul character decomposed which then will be used by the caller.
7445049Sis  */
7455049Sis static size_t
do_decomp(size_t uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t canonical_decomposition,u8_normalization_states_t * state)7465049Sis do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
7475049Sis 	boolean_t canonical_decomposition, u8_normalization_states_t *state)
7485049Sis {
7495049Sis 	uint16_t b1 = 0;
7505049Sis 	uint16_t b2 = 0;
7515049Sis 	uint16_t b3 = 0;
7525049Sis 	uint16_t b3_tbl;
7535049Sis 	uint16_t b3_base;
7545049Sis 	uint16_t b4 = 0;
7555049Sis 	size_t start_id;
7565049Sis 	size_t end_id;
7575049Sis 	size_t i;
7585049Sis 	uint32_t u1;
7595049Sis 
7605049Sis 	if (sz == 2) {
7615049Sis 		b3 = u8s[0] = s[0];
7625049Sis 		b4 = u8s[1] = s[1];
7635049Sis 		u8s[2] = '\0';
7645049Sis 	} else if (sz == 3) {
7655049Sis 		/* Convert it to a Unicode scalar value. */
7665049Sis 		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
7675049Sis 
7685049Sis 		/*
7695049Sis 		 * If this is a Hangul syllable, we decompose it into
7705049Sis 		 * a leading consonant, a vowel, and an optional trailing
7715049Sis 		 * consonant and then return.
7725049Sis 		 */
7735049Sis 		if (U8_HANGUL_SYLLABLE(u1)) {
7745049Sis 			u1 -= U8_HANGUL_SYL_FIRST;
7755049Sis 
7765049Sis 			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
7775049Sis 			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
7785049Sis 			    / U8_HANGUL_T_COUNT;
7795049Sis 			b3 = u1 % U8_HANGUL_T_COUNT;
7805049Sis 
7815049Sis 			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
7825049Sis 			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
7835049Sis 			if (b3) {
7845049Sis 				b3 += U8_HANGUL_JAMO_T_FIRST;
7855049Sis 				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
7865049Sis 
7875049Sis 				u8s[9] = '\0';
7885049Sis 				*state = U8_STATE_HANGUL_LVT;
7895049Sis 				return (9);
7905049Sis 			}
7915049Sis 
7925049Sis 			u8s[6] = '\0';
7935049Sis 			*state = U8_STATE_HANGUL_LV;
7945049Sis 			return (6);
7955049Sis 		}
7965049Sis 
7975049Sis 		b2 = u8s[0] = s[0];
7985049Sis 		b3 = u8s[1] = s[1];
7995049Sis 		b4 = u8s[2] = s[2];
8005049Sis 		u8s[3] = '\0';
8015049Sis 
8025049Sis 		/*
8035049Sis 		 * If this is a Hangul Jamo, we know there is nothing
8045049Sis 		 * further that we can decompose.
8055049Sis 		 */
8065049Sis 		if (U8_HANGUL_JAMO_L(u1)) {
8075049Sis 			*state = U8_STATE_HANGUL_L;
8085049Sis 			return (3);
8095049Sis 		}
8105049Sis 
8115049Sis 		if (U8_HANGUL_JAMO_V(u1)) {
8125049Sis 			if (*state == U8_STATE_HANGUL_L)
8135049Sis 				*state = U8_STATE_HANGUL_LV;
8145049Sis 			else
8155049Sis 				*state = U8_STATE_HANGUL_V;
8165049Sis 			return (3);
8175049Sis 		}
8185049Sis 
8195049Sis 		if (U8_HANGUL_JAMO_T(u1)) {
8205049Sis 			if (*state == U8_STATE_HANGUL_LV)
8215049Sis 				*state = U8_STATE_HANGUL_LVT;
8225049Sis 			else
8235049Sis 				*state = U8_STATE_HANGUL_T;
8245049Sis 			return (3);
8255049Sis 		}
8265049Sis 	} else if (sz == 4) {
8275049Sis 		b1 = u8s[0] = s[0];
8285049Sis 		b2 = u8s[1] = s[1];
8295049Sis 		b3 = u8s[2] = s[2];
8305049Sis 		b4 = u8s[3] = s[3];
8315049Sis 		u8s[4] = '\0';
8325049Sis 	} else {
8335049Sis 		/*
8345049Sis 		 * This is a fallback and should not happen if the function
8355049Sis 		 * was called properly.
8365049Sis 		 */
8375049Sis 		u8s[0] = s[0];
8385049Sis 		u8s[1] = '\0';
8395049Sis 		*state = U8_STATE_START;
8405049Sis 		return (1);
8415049Sis 	}
8425049Sis 
8435049Sis 	/*
8445049Sis 	 * At this point, this rountine does not know what it would get.
8455049Sis 	 * The caller should sort it out if the state isn't a Hangul one.
8465049Sis 	 */
8475049Sis 	*state = U8_STATE_START;
8485049Sis 
8495049Sis 	/* Try to find matching decomposition mapping byte sequence. */
8505049Sis 	b1 = u8_common_b1_tbl[uv][b1];
8515049Sis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
8525049Sis 		return ((size_t)sz);
8535049Sis 
8545049Sis 	b2 = u8_decomp_b2_tbl[uv][b1][b2];
8555049Sis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
8565049Sis 		return ((size_t)sz);
8575049Sis 
8585049Sis 	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
8595049Sis 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
8605049Sis 		return ((size_t)sz);
8615049Sis 
8625049Sis 	/*
8635049Sis 	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
8645049Sis 	 * which is 0x8000, this means we couldn't fit the mappings into
8655049Sis 	 * the cardinality of a unsigned byte.
8665049Sis 	 */
8675049Sis 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
8685049Sis 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
8695049Sis 		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
8705049Sis 		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
8715049Sis 	} else {
8725049Sis 		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
8735049Sis 		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
8745049Sis 	}
8755049Sis 
8765049Sis 	/* This also means there wasn't any matching decomposition. */
8775049Sis 	if (start_id >= end_id)
8785049Sis 		return ((size_t)sz);
8795049Sis 
8805049Sis 	/*
8815049Sis 	 * The final table for decomposition mappings has three types of
8825049Sis 	 * byte sequences depending on whether a mapping is for compatibility
8835049Sis 	 * decomposition, canonical decomposition, or both like the following:
8845049Sis 	 *
8855049Sis 	 * (1) Compatibility decomposition mappings:
8865049Sis 	 *
8875049Sis 	 *	+---+---+-...-+---+
8885049Sis 	 *	| B0| B1| ... | Bm|
8895049Sis 	 *	+---+---+-...-+---+
8905049Sis 	 *
8915049Sis 	 *	The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH).
8925049Sis 	 *
8935049Sis 	 * (2) Canonical decomposition mappings:
8945049Sis 	 *
8955049Sis 	 *	+---+---+---+-...-+---+
8965049Sis 	 *	| T | b0| b1| ... | bn|
8975049Sis 	 *	+---+---+---+-...-+---+
8985049Sis 	 *
8995049Sis 	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
9005049Sis 	 *
9015049Sis 	 * (3) Both mappings:
9025049Sis 	 *
9035049Sis 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
9045049Sis 	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
9055049Sis 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
9065049Sis 	 *
9075049Sis 	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
9085049Sis 	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
9095049Sis 	 *	compatibility mapping bytes.
9105049Sis 	 *
9115049Sis 	 * Note that compatibility decomposition means doing recursive
9125049Sis 	 * decompositions using both compatibility decomposition mappings and
9135049Sis 	 * canonical decomposition mappings. On the other hand, canonical
9145049Sis 	 * decomposition means doing recursive decompositions using only
9155049Sis 	 * canonical decomposition mappings. Since the table we have has gone
9165049Sis 	 * through the recursions already, we do not need to do so during
9175049Sis 	 * runtime, i.e., the table has been completely flattened out
9185049Sis 	 * already.
9195049Sis 	 */
9205049Sis 
9215049Sis 	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
9225049Sis 
9235049Sis 	/* Get the type, T, of the byte sequence. */
9245049Sis 	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
9255049Sis 
9265049Sis 	/*
9275049Sis 	 * If necessary, adjust start_id, end_id, or both. Note that if
9285049Sis 	 * this is compatibility decomposition mapping, there is no
9295049Sis 	 * adjustment.
9305049Sis 	 */
9315049Sis 	if (canonical_decomposition) {
9325049Sis 		/* Is the mapping only for compatibility decomposition? */
9335049Sis 		if (b1 < U8_DECOMP_BOTH)
9345049Sis 			return ((size_t)sz);
9355049Sis 
9365049Sis 		start_id++;
9375049Sis 
9385049Sis 		if (b1 == U8_DECOMP_BOTH) {
9395049Sis 			end_id = start_id +
9405049Sis 			    u8_decomp_final_tbl[uv][b3_base + start_id];
9415049Sis 			start_id++;
9425049Sis 		}
9435049Sis 	} else {
9445049Sis 		/*
9455049Sis 		 * Unless this is a compatibility decomposition mapping,
9465049Sis 		 * we adjust the start_id.
9475049Sis 		 */
9485049Sis 		if (b1 == U8_DECOMP_BOTH) {
9495049Sis 			start_id++;
9505049Sis 			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
9515049Sis 		} else if (b1 == U8_DECOMP_CANONICAL) {
9525049Sis 			start_id++;
9535049Sis 		}
9545049Sis 	}
9555049Sis 
9565049Sis 	for (i = 0; start_id < end_id; start_id++)
9575049Sis 		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
9585049Sis 	u8s[i] = '\0';
9595049Sis 
9605049Sis 	return (i);
9615049Sis }
9625049Sis 
9635049Sis /*
9645049Sis  * The find_composition_start() function uses the character bytes given and
9655049Sis  * find out the matching composition mappings if any and return the address
9665049Sis  * to the composition mappings as explained in the do_composition().
9675049Sis  */
9685049Sis static uchar_t *
find_composition_start(size_t uv,uchar_t * s,size_t sz)9695049Sis find_composition_start(size_t uv, uchar_t *s, size_t sz)
9705049Sis {
9715049Sis 	uint16_t b1 = 0;
9725049Sis 	uint16_t b2 = 0;
9735049Sis 	uint16_t b3 = 0;
9745049Sis 	uint16_t b3_tbl;
9755049Sis 	uint16_t b3_base;
9765049Sis 	uint16_t b4 = 0;
9775049Sis 	size_t start_id;
9785049Sis 	size_t end_id;
9795049Sis 
9805049Sis 	if (sz == 1) {
9815049Sis 		b4 = s[0];
9825049Sis 	} else if (sz == 2) {
9835049Sis 		b3 = s[0];
9845049Sis 		b4 = s[1];
9855049Sis 	} else if (sz == 3) {
9865049Sis 		b2 = s[0];
9875049Sis 		b3 = s[1];
9885049Sis 		b4 = s[2];
9895049Sis 	} else if (sz == 4) {
9905049Sis 		b1 = s[0];
9915049Sis 		b2 = s[1];
9925049Sis 		b3 = s[2];
9935049Sis 		b4 = s[3];
9945049Sis 	} else {
9955049Sis 		/*
9965049Sis 		 * This is a fallback and should not happen if the function
9975049Sis 		 * was called properly.
9985049Sis 		 */
9995049Sis 		return (NULL);
10005049Sis 	}
10015049Sis 
10025049Sis 	b1 = u8_composition_b1_tbl[uv][b1];
10035049Sis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
10045049Sis 		return (NULL);
10055049Sis 
10065049Sis 	b2 = u8_composition_b2_tbl[uv][b1][b2];
10075049Sis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
10085049Sis 		return (NULL);
10095049Sis 
10105049Sis 	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
10115049Sis 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
10125049Sis 		return (NULL);
10135049Sis 
10145049Sis 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
10155049Sis 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
10165049Sis 		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
10175049Sis 		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
10185049Sis 	} else {
10195049Sis 		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
10205049Sis 		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
10215049Sis 	}
10225049Sis 
10235049Sis 	if (start_id >= end_id)
10245049Sis 		return (NULL);
10255049Sis 
10265049Sis 	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
10275049Sis 
10285049Sis 	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
10295049Sis }
10305049Sis 
10315049Sis /*
10325049Sis  * The blocked() function checks on the combining class values of previous
10335049Sis  * characters in this sequence and return whether it is blocked or not.
10345049Sis  */
10355049Sis static boolean_t
blocked(uchar_t * comb_class,size_t last)10365049Sis blocked(uchar_t *comb_class, size_t last)
10375049Sis {
10385049Sis 	uchar_t my_comb_class;
10395049Sis 	size_t i;
10405049Sis 
10415049Sis 	my_comb_class = comb_class[last];
10425049Sis 	for (i = 1; i < last; i++)
10435049Sis 		if (comb_class[i] >= my_comb_class ||
10445049Sis 		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
10455049Sis 			return (B_TRUE);
10465049Sis 
10475049Sis 	return (B_FALSE);
10485049Sis }
10495049Sis 
10505049Sis /*
10515049Sis  * The do_composition() reads the character string pointed by 's' and
10525049Sis  * do necessary canonical composition and then copy over the result back to
10535049Sis  * the 's'.
10545049Sis  *
10555049Sis  * The input argument 's' cannot contain more than 32 characters.
10565049Sis  */
10575049Sis static size_t
do_composition(size_t uv,uchar_t * s,uchar_t * comb_class,uchar_t * start,uchar_t * disp,size_t last,uchar_t ** os,uchar_t * oslast)10585049Sis do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
10595049Sis 	uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
10605049Sis {
10615049Sis 	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
10625049Sis 	uchar_t tc[U8_MB_CUR_MAX];
10635049Sis 	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
10645049Sis 	size_t saved_marks_count;
10655049Sis 	uchar_t *p;
10665049Sis 	uchar_t *saved_p;
10675049Sis 	uchar_t *q;
10685049Sis 	size_t i;
10695049Sis 	size_t saved_i;
10705049Sis 	size_t j;
10715049Sis 	size_t k;
10725049Sis 	size_t l;
10735049Sis 	size_t C;
10745049Sis 	size_t saved_l;
10755049Sis 	size_t size;
10765049Sis 	uint32_t u1;
10775049Sis 	uint32_t u2;
10785049Sis 	boolean_t match_not_found = B_TRUE;
10795049Sis 
10805049Sis 	/*
10815049Sis 	 * This should never happen unless the callers are doing some strange
10825049Sis 	 * and unexpected things.
10835049Sis 	 *
10845049Sis 	 * The "last" is the index pointing to the last character not last + 1.
10855049Sis 	 */
10865049Sis 	if (last >= U8_MAX_CHARS_A_SEQ)
10875049Sis 		last = U8_UPPER_LIMIT_IN_A_SEQ;
10885049Sis 
10895049Sis 	for (i = l = 0; i <= last; i++) {
10905049Sis 		/*
10915049Sis 		 * The last or any non-Starters at the beginning, we don't
10925049Sis 		 * have any chance to do composition and so we just copy them
10935049Sis 		 * to the temporary buffer.
10945049Sis 		 */
10955049Sis 		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
10965049Sis SAVE_THE_CHAR:
10975049Sis 			p = s + start[i];
10985049Sis 			size = disp[i];
10995049Sis 			for (k = 0; k < size; k++)
11005049Sis 				t[l++] = *p++;
11015049Sis 			continue;
11025049Sis 		}
11035049Sis 
11045049Sis 		/*
11055049Sis 		 * If this could be a start of Hangul Jamos, then, we try to
11065049Sis 		 * conjoin them.
11075049Sis 		 */
11085049Sis 		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
11095049Sis 			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
11105049Sis 			    s[start[i] + 1], s[start[i] + 2]);
11115049Sis 			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
11125049Sis 			    s[start[i] + 4], s[start[i] + 5]);
11135049Sis 
11145049Sis 			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
11155049Sis 				u1 -= U8_HANGUL_JAMO_L_FIRST;
11165049Sis 				u2 -= U8_HANGUL_JAMO_V_FIRST;
11175049Sis 				u1 = U8_HANGUL_SYL_FIRST +
11185049Sis 				    (u1 * U8_HANGUL_V_COUNT + u2) *
11195049Sis 				    U8_HANGUL_T_COUNT;
11205049Sis 
11215049Sis 				i += 2;
11225049Sis 				if (i <= last) {
11235049Sis 					U8_PUT_3BYTES_INTO_UTF32(u2,
11245049Sis 					    s[start[i]], s[start[i] + 1],
11255049Sis 					    s[start[i] + 2]);
11265049Sis 
11275049Sis 					if (U8_HANGUL_JAMO_T(u2)) {
11285049Sis 						u1 += u2 -
11295049Sis 						    U8_HANGUL_JAMO_T_FIRST;
11305049Sis 						i++;
11315049Sis 					}
11325049Sis 				}
11335049Sis 
11345049Sis 				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
11355049Sis 				i--;
11365049Sis 				l += 3;
11375049Sis 				continue;
11385049Sis 			}
11395049Sis 		}
11405049Sis 
11415049Sis 		/*
11425049Sis 		 * Let's then find out if this Starter has composition
11435049Sis 		 * mapping.
11445049Sis 		 */
11455049Sis 		p = find_composition_start(uv, s + start[i], disp[i]);
11465049Sis 		if (p == NULL)
11475049Sis 			goto SAVE_THE_CHAR;
11485049Sis 
11495049Sis 		/*
11505049Sis 		 * We have a Starter with composition mapping and the next
11515049Sis 		 * character is a non-Starter. Let's try to find out if
11525049Sis 		 * we can do composition.
11535049Sis 		 */
11545049Sis 
11555049Sis 		saved_p = p;
11565049Sis 		saved_i = i;
11575049Sis 		saved_l = l;
11585049Sis 		saved_marks_count = 0;
11595049Sis 
11605049Sis TRY_THE_NEXT_MARK:
11615049Sis 		q = s + start[++i];
11625049Sis 		size = disp[i];
11635049Sis 
11645049Sis 		/*
11655049Sis 		 * The next for() loop compares the non-Starter pointed by
11665049Sis 		 * 'q' with the possible (joinable) characters pointed by 'p'.
11675049Sis 		 *
11685049Sis 		 * The composition final table entry pointed by the 'p'
11695049Sis 		 * looks like the following:
11705049Sis 		 *
11715049Sis 		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
11725049Sis 		 * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
11735049Sis 		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
11745049Sis 		 *
11755049Sis 		 * where C is the count byte indicating the number of
11765049Sis 		 * mapping pairs where each pair would be look like
11775049Sis 		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
11785049Sis 		 * character of a canonical decomposition and the B0-Bm are
11795049Sis 		 * the bytes of a matching composite character. The F is
11805049Sis 		 * a filler byte after each character as the separator.
11815049Sis 		 */
11825049Sis 
11835049Sis 		match_not_found = B_TRUE;
11845049Sis 
11855049Sis 		for (C = *p++; C > 0; C--) {
11865049Sis 			for (k = 0; k < size; p++, k++)
11875049Sis 				if (*p != q[k])
11885049Sis 					break;
11895049Sis 
11905049Sis 			/* Have we found it? */
11915049Sis 			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
11925049Sis 				match_not_found = B_FALSE;
11935049Sis 
11945049Sis 				l = saved_l;
11955049Sis 
11965049Sis 				while (*++p != U8_TBL_ELEMENT_FILLER)
11975049Sis 					t[l++] = *p;
11985049Sis 
11995049Sis 				break;
12005049Sis 			}
12015049Sis 
12025049Sis 			/* We didn't find; skip to the next pair. */
12035049Sis 			if (*p != U8_TBL_ELEMENT_FILLER)
12045049Sis 				while (*++p != U8_TBL_ELEMENT_FILLER)
12055049Sis 					;
12065049Sis 			while (*++p != U8_TBL_ELEMENT_FILLER)
12075049Sis 				;
12085049Sis 			p++;
12095049Sis 		}
12105049Sis 
12115049Sis 		/*
12125049Sis 		 * If there was no match, we will need to save the combining
12135049Sis 		 * mark for later appending. After that, if the next one
12145049Sis 		 * is a non-Starter and not blocked, then, we try once
12155049Sis 		 * again to do composition with the next non-Starter.
12165049Sis 		 *
12175049Sis 		 * If there was no match and this was a Starter, then,
12185049Sis 		 * this is a new start.
12195049Sis 		 *
12205049Sis 		 * If there was a match and a composition done and we have
12215049Sis 		 * more to check on, then, we retrieve a new composition final
12225049Sis 		 * table entry for the composite and then try to do the
12235049Sis 		 * composition again.
12245049Sis 		 */
12255049Sis 
12265049Sis 		if (match_not_found) {
12275049Sis 			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
12285049Sis 				i--;
12295049Sis 				goto SAVE_THE_CHAR;
12305049Sis 			}
12315049Sis 
12325049Sis 			saved_marks[saved_marks_count++] = i;
12335049Sis 		}
12345049Sis 
12355049Sis 		if (saved_l == l) {
12365049Sis 			while (i < last) {
12375049Sis 				if (blocked(comb_class, i + 1))
12385049Sis 					saved_marks[saved_marks_count++] = ++i;
12395049Sis 				else
12405049Sis 					break;
12415049Sis 			}
12425049Sis 			if (i < last) {
12435049Sis 				p = saved_p;
12445049Sis 				goto TRY_THE_NEXT_MARK;
12455049Sis 			}
12465049Sis 		} else if (i < last) {
12475049Sis 			p = find_composition_start(uv, t + saved_l,
12485049Sis 			    l - saved_l);
12495049Sis 			if (p != NULL) {
12505049Sis 				saved_p = p;
12515049Sis 				goto TRY_THE_NEXT_MARK;
12525049Sis 			}
12535049Sis 		}
12545049Sis 
12555049Sis 		/*
12565049Sis 		 * There is no more composition possible.
12575049Sis 		 *
12585049Sis 		 * If there was no composition what so ever then we copy
12595049Sis 		 * over the original Starter and then append any non-Starters
12605049Sis 		 * remaining at the target string sequentially after that.
12615049Sis 		 */
12625049Sis 
12635049Sis 		if (saved_l == l) {
12645049Sis 			p = s + start[saved_i];
12655049Sis 			size = disp[saved_i];
12665049Sis 			for (j = 0; j < size; j++)
12675049Sis 				t[l++] = *p++;
12685049Sis 		}
12695049Sis 
12705049Sis 		for (k = 0; k < saved_marks_count; k++) {
12715049Sis 			p = s + start[saved_marks[k]];
12725049Sis 			size = disp[saved_marks[k]];
12735049Sis 			for (j = 0; j < size; j++)
12745049Sis 				t[l++] = *p++;
12755049Sis 		}
12765049Sis 	}
12775049Sis 
12785049Sis 	/*
12795049Sis 	 * If the last character is a Starter and if we have a character
12805049Sis 	 * (possibly another Starter) that can be turned into a composite,
12815049Sis 	 * we do so and we do so until there is no more of composition
12825049Sis 	 * possible.
12835049Sis 	 */
12845049Sis 	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
12855049Sis 		p = *os;
12865049Sis 		saved_l = l - disp[last];
12875049Sis 
12885049Sis 		while (p < oslast) {
12895049Sis 			size = u8_number_of_bytes[*p];
12905049Sis 			if (size <= 1 || (p + size) > oslast)
12915049Sis 				break;
12925049Sis 
12935049Sis 			saved_p = p;
12945049Sis 
12955049Sis 			for (i = 0; i < size; i++)
12965049Sis 				tc[i] = *p++;
12975049Sis 
12985049Sis 			q = find_composition_start(uv, t + saved_l,
12995049Sis 			    l - saved_l);
13005049Sis 			if (q == NULL) {
13015049Sis 				p = saved_p;
13025049Sis 				break;
13035049Sis 			}
13045049Sis 
13055049Sis 			match_not_found = B_TRUE;
13065049Sis 
13075049Sis 			for (C = *q++; C > 0; C--) {
13085049Sis 				for (k = 0; k < size; q++, k++)
13095049Sis 					if (*q != tc[k])
13105049Sis 						break;
13115049Sis 
13125049Sis 				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
13135049Sis 					match_not_found = B_FALSE;
13145049Sis 
13155049Sis 					l = saved_l;
13165049Sis 
13175049Sis 					while (*++q != U8_TBL_ELEMENT_FILLER) {
13185049Sis 						/*
13195049Sis 						 * This is practically
13205049Sis 						 * impossible but we don't
13215049Sis 						 * want to take any chances.
13225049Sis 						 */
13235049Sis 						if (l >=
13245049Sis 						    U8_STREAM_SAFE_TEXT_MAX) {
13255049Sis 							p = saved_p;
13265049Sis 							goto SAFE_RETURN;
13275049Sis 						}
13285049Sis 						t[l++] = *q;
13295049Sis 					}
13305049Sis 
13315049Sis 					break;
13325049Sis 				}
13335049Sis 
13345049Sis 				if (*q != U8_TBL_ELEMENT_FILLER)
13355049Sis 					while (*++q != U8_TBL_ELEMENT_FILLER)
13365049Sis 						;
13375049Sis 				while (*++q != U8_TBL_ELEMENT_FILLER)
13385049Sis 					;
13395049Sis 				q++;
13405049Sis 			}
13415049Sis 
13425049Sis 			if (match_not_found) {
13435049Sis 				p = saved_p;
13445049Sis 				break;
13455049Sis 			}
13465049Sis 		}
13475049Sis SAFE_RETURN:
13485049Sis 		*os = p;
13495049Sis 	}
13505049Sis 
13515049Sis 	/*
13525049Sis 	 * Now we copy over the temporary string to the target string.
13535049Sis 	 * Since composition always reduces the number of characters or
13545049Sis 	 * the number of characters stay, we don't need to worry about
13555049Sis 	 * the buffer overflow here.
13565049Sis 	 */
13575049Sis 	for (i = 0; i < l; i++)
13585049Sis 		s[i] = t[i];
13595049Sis 	s[l] = '\0';
13605049Sis 
13615049Sis 	return (l);
13625049Sis }
13635049Sis 
/*
 * The collect_a_seq() function checks on the given string s, collects
 * a sequence of characters at u8s, and returns the sequence. While it
 * collects a sequence, it also applies case conversion, canonical or
 * compatibility decomposition, canonical composition, or some or all of
 * them and in that order.
 *
 * The collected sequence cannot be bigger than 32 characters: if it
 * would have more than 31 characters, the sequence is terminated
 * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
 * a Stream-Safe Text. The collected sequence is always terminated with
 * a null byte and the return value is the byte length of the sequence,
 * which can be 0. The return value does not include the terminating
 * null byte.
 */
13795049Sis static size_t
collect_a_seq(size_t uv,uchar_t * u8s,uchar_t ** source,uchar_t * slast,boolean_t is_it_toupper,boolean_t is_it_tolower,boolean_t canonical_decomposition,boolean_t compatibility_decomposition,boolean_t canonical_composition,int * errnum,u8_normalization_states_t * state)13805049Sis collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
13815049Sis 	boolean_t is_it_toupper,
13825049Sis 	boolean_t is_it_tolower,
13835049Sis 	boolean_t canonical_decomposition,
13845049Sis 	boolean_t compatibility_decomposition,
13855049Sis 	boolean_t canonical_composition,
1386*7012Sis 	int *errnum, u8_normalization_states_t *state)
13875049Sis {
13885049Sis 	uchar_t *s;
13895049Sis 	int sz;
13905049Sis 	int saved_sz;
13915049Sis 	size_t i;
13925049Sis 	size_t j;
13935049Sis 	size_t k;
13945049Sis 	size_t l;
13955049Sis 	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
13965049Sis 	uchar_t disp[U8_MAX_CHARS_A_SEQ];
13975049Sis 	uchar_t start[U8_MAX_CHARS_A_SEQ];
13985049Sis 	uchar_t u8t[U8_MB_CUR_MAX];
13995049Sis 	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
14005049Sis 	uchar_t tc;
14015049Sis 	size_t last;
14025049Sis 	size_t saved_last;
14035049Sis 	uint32_t u1;
14045049Sis 
14055049Sis 	/*
14065049Sis 	 * Save the source string pointer which we will return a changed
14075049Sis 	 * pointer if we do processing.
14085049Sis 	 */
14095049Sis 	s = *source;
14105049Sis 
14115049Sis 	/*
14125049Sis 	 * The following is a fallback for just in case callers are not
14135049Sis 	 * checking the string boundaries before the calling.
14145049Sis 	 */
14155049Sis 	if (s >= slast) {
14165049Sis 		u8s[0] = '\0';
14175049Sis 
14185049Sis 		return (0);
14195049Sis 	}
14205049Sis 
14215049Sis 	/*
14225049Sis 	 * As the first thing, let's collect a character and do case
14235049Sis 	 * conversion if necessary.
14245049Sis 	 */
14255049Sis 
14265049Sis 	sz = u8_number_of_bytes[*s];
14275049Sis 
14285049Sis 	if (sz < 0) {
1429*7012Sis 		*errnum = EILSEQ;
14305049Sis 
14315049Sis 		u8s[0] = *s++;
14325049Sis 		u8s[1] = '\0';
14335049Sis 
14345049Sis 		*source = s;
14355049Sis 
14365049Sis 		return (1);
14375049Sis 	}
14385049Sis 
14395049Sis 	if (sz == 1) {
14405049Sis 		if (is_it_toupper)
14415049Sis 			u8s[0] = U8_ASCII_TOUPPER(*s);
14425049Sis 		else if (is_it_tolower)
14435049Sis 			u8s[0] = U8_ASCII_TOLOWER(*s);
14445049Sis 		else
14455049Sis 			u8s[0] = *s;
14465049Sis 		s++;
14475049Sis 		u8s[1] = '\0';
14485049Sis 	} else if ((s + sz) > slast) {
1449*7012Sis 		*errnum = EINVAL;
14505049Sis 
14515049Sis 		for (i = 0; s < slast; )
14525049Sis 			u8s[i++] = *s++;
14535049Sis 		u8s[i] = '\0';
14545049Sis 
14555049Sis 		*source = s;
14565049Sis 
14575049Sis 		return (i);
14585049Sis 	} else {
14595049Sis 		if (is_it_toupper || is_it_tolower) {
14605049Sis 			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
14615049Sis 			s += sz;
14625049Sis 			sz = i;
14635049Sis 		} else {
14645049Sis 			for (i = 0; i < sz; )
14655049Sis 				u8s[i++] = *s++;
14665049Sis 			u8s[i] = '\0';
14675049Sis 		}
14685049Sis 	}
14695049Sis 
14705049Sis 	/*
14715049Sis 	 * And then canonical/compatibility decomposition followed by
14725049Sis 	 * an optional canonical composition. Please be noted that
14735049Sis 	 * canonical composition is done only when a decomposition is
14745049Sis 	 * done.
14755049Sis 	 */
14765049Sis 	if (canonical_decomposition || compatibility_decomposition) {
14775049Sis 		if (sz == 1) {
14785049Sis 			*state = U8_STATE_START;
14795049Sis 
14805049Sis 			saved_sz = 1;
14815049Sis 
14825049Sis 			comb_class[0] = 0;
14835049Sis 			start[0] = 0;
14845049Sis 			disp[0] = 1;
14855049Sis 
14865049Sis 			last = 1;
14875049Sis 		} else {
14885049Sis 			saved_sz = do_decomp(uv, u8s, u8s, sz,
14895049Sis 			    canonical_decomposition, state);
14905049Sis 
14915049Sis 			last = 0;
14925049Sis 
14935049Sis 			for (i = 0; i < saved_sz; ) {
14945049Sis 				sz = u8_number_of_bytes[u8s[i]];
14955049Sis 
14965049Sis 				comb_class[last] = combining_class(uv,
14975049Sis 				    u8s + i, sz);
14985049Sis 				start[last] = i;
14995049Sis 				disp[last] = sz;
15005049Sis 
15015049Sis 				last++;
15025049Sis 				i += sz;
15035049Sis 			}
15045049Sis 
15055049Sis 			/*
15065049Sis 			 * Decomposition yields various Hangul related
15075049Sis 			 * states but not on combining marks. We need to
15085049Sis 			 * find out at here by checking on the last
15095049Sis 			 * character.
15105049Sis 			 */
15115049Sis 			if (*state == U8_STATE_START) {
15125049Sis 				if (comb_class[last - 1])
15135049Sis 					*state = U8_STATE_COMBINING_MARK;
15145049Sis 			}
15155049Sis 		}
15165049Sis 
15175049Sis 		saved_last = last;
15185049Sis 
15195049Sis 		while (s < slast) {
15205049Sis 			sz = u8_number_of_bytes[*s];
15215049Sis 
15225049Sis 			/*
15235049Sis 			 * If this is an illegal character, an incomplete
15245049Sis 			 * character, or an 7-bit ASCII Starter character,
15255049Sis 			 * then we have collected a sequence; break and let
15265049Sis 			 * the next call deal with the two cases.
15275049Sis 			 *
15285049Sis 			 * Note that this is okay only if you are using this
15295049Sis 			 * function with a fixed length string, not on
15305049Sis 			 * a buffer with multiple calls of one chunk at a time.
15315049Sis 			 */
15325049Sis 			if (sz <= 1) {
15335049Sis 				break;
15345049Sis 			} else if ((s + sz) > slast) {
15355049Sis 				break;
15365049Sis 			} else {
15375049Sis 				/*
15385049Sis 				 * If the previous character was a Hangul Jamo
15395049Sis 				 * and this character is a Hangul Jamo that
15405049Sis 				 * can be conjoined, we collect the Jamo.
15415049Sis 				 */
15425049Sis 				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
15435049Sis 					U8_PUT_3BYTES_INTO_UTF32(u1,
15445049Sis 					    *s, *(s + 1), *(s + 2));
15455049Sis 
15465049Sis 					if (U8_HANGUL_COMPOSABLE_L_V(*state,
15475049Sis 					    u1)) {
15485049Sis 						i = 0;
15495049Sis 						*state = U8_STATE_HANGUL_LV;
15505049Sis 						goto COLLECT_A_HANGUL;
15515049Sis 					}
15525049Sis 
15535049Sis 					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
15545049Sis 					    u1)) {
15555049Sis 						i = 0;
15565049Sis 						*state = U8_STATE_HANGUL_LVT;
15575049Sis 						goto COLLECT_A_HANGUL;
15585049Sis 					}
15595049Sis 				}
15605049Sis 
15615049Sis 				/*
15625049Sis 				 * Regardless of whatever it was, if this is
15635049Sis 				 * a Starter, we don't collect the character
15645049Sis 				 * since that's a new start and we will deal
15655049Sis 				 * with it at the next time.
15665049Sis 				 */
15675049Sis 				i = combining_class(uv, s, sz);
15685049Sis 				if (i == U8_COMBINING_CLASS_STARTER)
15695049Sis 					break;
15705049Sis 
15715049Sis 				/*
15725049Sis 				 * We know the current character is a combining
15735049Sis 				 * mark. If the previous character wasn't
15745049Sis 				 * a Starter (not Hangul) or a combining mark,
15755049Sis 				 * then, we don't collect this combining mark.
15765049Sis 				 */
15775049Sis 				if (*state != U8_STATE_START &&
15785049Sis 				    *state != U8_STATE_COMBINING_MARK)
15795049Sis 					break;
15805049Sis 
15815049Sis 				*state = U8_STATE_COMBINING_MARK;
15825049Sis COLLECT_A_HANGUL:
15835049Sis 				/*
15845049Sis 				 * If we collected a Starter and combining
15855049Sis 				 * marks up to 30, i.e., total 31 characters,
15865049Sis 				 * then, we terminate this degenerately long
15875049Sis 				 * combining sequence with a U+034F COMBINING
15885049Sis 				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
15895049Sis 				 * UTF-8 and turn this into a Stream-Safe
15905049Sis 				 * Text. This will be extremely rare but
15915049Sis 				 * possible.
15925049Sis 				 *
15935049Sis 				 * The following will also guarantee that
15945049Sis 				 * we are not writing more than 32 characters
15955049Sis 				 * plus a NULL at u8s[].
15965049Sis 				 */
15975049Sis 				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
15985049Sis TURN_STREAM_SAFE:
15995049Sis 					*state = U8_STATE_START;
16005049Sis 					comb_class[last] = 0;
16015049Sis 					start[last] = saved_sz;
16025049Sis 					disp[last] = 2;
16035049Sis 					last++;
16045049Sis 
16055049Sis 					u8s[saved_sz++] = 0xCD;
16065049Sis 					u8s[saved_sz++] = 0x8F;
16075049Sis 
16085049Sis 					break;
16095049Sis 				}
16105049Sis 
16115049Sis 				/*
16125049Sis 				 * Some combining marks also do decompose into
16135049Sis 				 * another combining mark or marks.
16145049Sis 				 */
16155049Sis 				if (*state == U8_STATE_COMBINING_MARK) {
16165049Sis 					k = last;
16175049Sis 					l = sz;
16185049Sis 					i = do_decomp(uv, uts, s, sz,
16195049Sis 					    canonical_decomposition, state);
16205049Sis 					for (j = 0; j < i; ) {
16215049Sis 						sz = u8_number_of_bytes[uts[j]];
16225049Sis 
16235049Sis 						comb_class[last] =
16245049Sis 						    combining_class(uv,
16255049Sis 						    uts + j, sz);
16265049Sis 						start[last] = saved_sz + j;
16275049Sis 						disp[last] = sz;
16285049Sis 
16295049Sis 						last++;
16305049Sis 						if (last >=
16315049Sis 						    U8_UPPER_LIMIT_IN_A_SEQ) {
16325049Sis 							last = k;
16335049Sis 							goto TURN_STREAM_SAFE;
16345049Sis 						}
16355049Sis 						j += sz;
16365049Sis 					}
16375049Sis 
16385049Sis 					*state = U8_STATE_COMBINING_MARK;
16395049Sis 					sz = i;
16405049Sis 					s += l;
16415049Sis 
16425049Sis 					for (i = 0; i < sz; i++)
16435049Sis 						u8s[saved_sz++] = uts[i];
16445049Sis 				} else {
16455049Sis 					comb_class[last] = i;
16465049Sis 					start[last] = saved_sz;
16475049Sis 					disp[last] = sz;
16485049Sis 					last++;
16495049Sis 
16505049Sis 					for (i = 0; i < sz; i++)
16515049Sis 						u8s[saved_sz++] = *s++;
16525049Sis 				}
16535049Sis 
16545049Sis 				/*
16555049Sis 				 * If this is U+0345 COMBINING GREEK
16565049Sis 				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
16575049Sis 				 * iota subscript, and need to be converted to
16585049Sis 				 * uppercase letter, convert it to U+0399 GREEK
16595049Sis 				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
16605049Sis 				 * i.e., convert to capital adscript form as
16615049Sis 				 * specified in the Unicode standard.
16625049Sis 				 *
16635049Sis 				 * This is the only special case of (ambiguous)
16645049Sis 				 * case conversion at combining marks and
16655049Sis 				 * probably the standard will never have
16665049Sis 				 * anything similar like this in future.
16675049Sis 				 */
16685049Sis 				if (is_it_toupper && sz >= 2 &&
16695049Sis 				    u8s[saved_sz - 2] == 0xCD &&
16705049Sis 				    u8s[saved_sz - 1] == 0x85) {
16715049Sis 					u8s[saved_sz - 2] = 0xCE;
16725049Sis 					u8s[saved_sz - 1] = 0x99;
16735049Sis 				}
16745049Sis 			}
16755049Sis 		}
16765049Sis 
16775049Sis 		/*
16785049Sis 		 * Let's try to ensure a canonical ordering for the collected
16795049Sis 		 * combining marks. We do this only if we have collected
16805049Sis 		 * at least one more non-Starter. (The decomposition mapping
16815049Sis 		 * data tables have fully (and recursively) expanded and
16825049Sis 		 * canonically ordered decompositions.)
16835049Sis 		 *
16845049Sis 		 * The U8_SWAP_COMB_MARKS() convenience macro has some
16855049Sis 		 * assumptions and we are meeting the assumptions.
16865049Sis 		 */
16875049Sis 		last--;
16885049Sis 		if (last >= saved_last) {
16895049Sis 			for (i = 0; i < last; i++)
16905049Sis 				for (j = last; j > i; j--)
16915049Sis 					if (comb_class[j] &&
16925049Sis 					    comb_class[j - 1] > comb_class[j]) {
16935049Sis 						U8_SWAP_COMB_MARKS(j - 1, j);
16945049Sis 					}
16955049Sis 		}
16965049Sis 
16975049Sis 		*source = s;
16985049Sis 
16995049Sis 		if (! canonical_composition) {
17005049Sis 			u8s[saved_sz] = '\0';
17015049Sis 			return (saved_sz);
17025049Sis 		}
17035049Sis 
17045049Sis 		/*
17055049Sis 		 * Now do the canonical composition. Note that we do this
17065049Sis 		 * only after a canonical or compatibility decomposition to
17075049Sis 		 * finish up NFC or NFKC.
17085049Sis 		 */
17095049Sis 		sz = do_composition(uv, u8s, comb_class, start, disp, last,
17105049Sis 		    &s, slast);
17115049Sis 	}
17125049Sis 
17135049Sis 	*source = s;
17145049Sis 
17155049Sis 	return ((size_t)sz);
17165049Sis }
17175049Sis 
17185049Sis /*
17195049Sis  * The do_norm_compare() function does string comparion based on Unicode
17205049Sis  * simple case mappings and Unicode Normalization definitions.
17215049Sis  *
17225049Sis  * It does so by collecting a sequence of character at a time and comparing
17235049Sis  * the collected sequences from the strings.
17245049Sis  *
17255049Sis  * The meanings on the return values are the same as the usual strcmp().
17265049Sis  */
17275049Sis static int
do_norm_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,int flag,int * errnum)17285049Sis do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
1729*7012Sis 	int flag, int *errnum)
17305049Sis {
17315049Sis 	int result;
17325049Sis 	size_t sz1;
17335049Sis 	size_t sz2;
17345049Sis 	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
17355049Sis 	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
17365049Sis 	uchar_t *s1last;
17375049Sis 	uchar_t *s2last;
17385049Sis 	boolean_t is_it_toupper;
17395049Sis 	boolean_t is_it_tolower;
17405049Sis 	boolean_t canonical_decomposition;
17415049Sis 	boolean_t compatibility_decomposition;
17425049Sis 	boolean_t canonical_composition;
17435049Sis 	u8_normalization_states_t state;
17445049Sis 
17455049Sis 	s1last = s1 + n1;
17465049Sis 	s2last = s2 + n2;
17475049Sis 
17485049Sis 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
17495049Sis 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
17505049Sis 	canonical_decomposition = flag & U8_CANON_DECOMP;
17515049Sis 	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
17525049Sis 	canonical_composition = flag & U8_CANON_COMP;
17535049Sis 
17545049Sis 	while (s1 < s1last && s2 < s2last) {
17555049Sis 		/*
17565049Sis 		 * If the current character is a 7-bit ASCII and the last
17575049Sis 		 * character, or, if the current character and the next
17585049Sis 		 * character are both some 7-bit ASCII characters then
17595049Sis 		 * we treat the current character as a sequence.
17605049Sis 		 *
17615049Sis 		 * In any other cases, we need to call collect_a_seq().
17625049Sis 		 */
17635049Sis 
17645049Sis 		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
17655049Sis 		    ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
17665049Sis 			if (is_it_toupper)
17675049Sis 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
17685049Sis 			else if (is_it_tolower)
17695049Sis 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
17705049Sis 			else
17715049Sis 				u8s1[0] = *s1;
17725049Sis 			u8s1[1] = '\0';
17735049Sis 			sz1 = 1;
17745049Sis 			s1++;
17755049Sis 		} else {
17765049Sis 			state = U8_STATE_START;
17775049Sis 			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
17785049Sis 			    is_it_toupper, is_it_tolower,
17795049Sis 			    canonical_decomposition,
17805049Sis 			    compatibility_decomposition,
1781*7012Sis 			    canonical_composition, errnum, &state);
17825049Sis 		}
17835049Sis 
17845049Sis 		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
17855049Sis 		    ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
17865049Sis 			if (is_it_toupper)
17875049Sis 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
17885049Sis 			else if (is_it_tolower)
17895049Sis 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
17905049Sis 			else
17915049Sis 				u8s2[0] = *s2;
17925049Sis 			u8s2[1] = '\0';
17935049Sis 			sz2 = 1;
17945049Sis 			s2++;
17955049Sis 		} else {
17965049Sis 			state = U8_STATE_START;
17975049Sis 			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
17985049Sis 			    is_it_toupper, is_it_tolower,
17995049Sis 			    canonical_decomposition,
18005049Sis 			    compatibility_decomposition,
1801*7012Sis 			    canonical_composition, errnum, &state);
18025049Sis 		}
18035049Sis 
18045049Sis 		/*
18055049Sis 		 * Now compare the two characters. If they are the same,
18065049Sis 		 * we move on to the next character sequences.
18075049Sis 		 */
18085049Sis 		if (sz1 == 1 && sz2 == 1) {
18095049Sis 			if (*u8s1 > *u8s2)
18105049Sis 				return (1);
18115049Sis 			if (*u8s1 < *u8s2)
18125049Sis 				return (-1);
18135049Sis 		} else {
18145049Sis 			result = strcmp((const char *)u8s1, (const char *)u8s2);
18155049Sis 			if (result != 0)
18165049Sis 				return (result);
18175049Sis 		}
18185049Sis 	}
18195049Sis 
18205049Sis 	/*
18215049Sis 	 * We compared until the end of either or both strings.
18225049Sis 	 *
18235049Sis 	 * If we reached to or went over the ends for the both, that means
18245049Sis 	 * they are the same.
18255049Sis 	 *
18265049Sis 	 * If we reached only one end, that means the other string has
18275049Sis 	 * something which then can be used to determine the return value.
18285049Sis 	 */
18295049Sis 	if (s1 >= s1last) {
18305049Sis 		if (s2 >= s2last)
18315049Sis 			return (0);
18325049Sis 		return (-1);
18335049Sis 	}
18345049Sis 	return (1);
18355049Sis }
18365049Sis 
18375049Sis /*
18385049Sis  * The u8_strcmp() function compares two UTF-8 strings quite similar to
18395049Sis  * the strcmp(). For the comparison, however, Unicode Normalization specific
18405049Sis  * equivalency and Unicode simple case conversion mappings based equivalency
18415049Sis  * can be requested and checked against.
18425049Sis  */
18435049Sis int
u8_strcmp(const char * s1,const char * s2,size_t n,int flag,size_t uv,int * errnum)18445049Sis u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
1845*7012Sis 		int *errnum)
18465049Sis {
18475049Sis 	int f;
18485049Sis 	size_t n1;
18495049Sis 	size_t n2;
18505049Sis 
1851*7012Sis 	*errnum = 0;
18525049Sis 
18535049Sis 	/*
18545049Sis 	 * Check on the requested Unicode version, case conversion, and
18555049Sis 	 * normalization flag values.
18565049Sis 	 */
18575049Sis 
18585049Sis 	if (uv > U8_UNICODE_LATEST) {
1859*7012Sis 		*errnum = ERANGE;
18605049Sis 		uv = U8_UNICODE_LATEST;
18615049Sis 	}
18625049Sis 
18635049Sis 	if (flag == 0) {
18645049Sis 		flag = U8_STRCMP_CS;
18655049Sis 	} else {
18665049Sis 		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
18675049Sis 		    U8_STRCMP_CI_LOWER);
18685049Sis 		if (f == 0) {
18695049Sis 			flag |= U8_STRCMP_CS;
18705049Sis 		} else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
18715049Sis 		    f != U8_STRCMP_CI_LOWER) {
1872*7012Sis 			*errnum = EBADF;
18735049Sis 			flag = U8_STRCMP_CS;
18745049Sis 		}
18755049Sis 
18765049Sis 		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
18775049Sis 		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
18785049Sis 		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
1879*7012Sis 			*errnum = EBADF;
18805049Sis 			flag = U8_STRCMP_CS;
18815049Sis 		}
18825049Sis 	}
18835049Sis 
18845049Sis 	if (flag == U8_STRCMP_CS) {
18855049Sis 		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
18865049Sis 	}
18875049Sis 
18885049Sis 	n1 = strlen(s1);
18895049Sis 	n2 = strlen(s2);
18905049Sis 	if (n != 0) {
18915049Sis 		if (n < n1)
18925049Sis 			n1 = n;
18935049Sis 		if (n < n2)
18945049Sis 			n2 = n;
18955049Sis 	}
18965049Sis 
18975049Sis 	/*
18985049Sis 	 * Simple case conversion can be done much faster and so we do
18995049Sis 	 * them separately here.
19005049Sis 	 */
19015049Sis 	if (flag == U8_STRCMP_CI_UPPER) {
19025049Sis 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1903*7012Sis 		    n1, n2, B_TRUE, errnum));
19045049Sis 	} else if (flag == U8_STRCMP_CI_LOWER) {
19055049Sis 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1906*7012Sis 		    n1, n2, B_FALSE, errnum));
19075049Sis 	}
19085049Sis 
19095049Sis 	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
1910*7012Sis 	    flag, errnum));
19115049Sis }
19125049Sis 
19135049Sis size_t
u8_textprep_str(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,size_t unicode_version,int * errnum)19145049Sis u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
1915*7012Sis 	int flag, size_t unicode_version, int *errnum)
19165049Sis {
19175049Sis 	int f;
19185049Sis 	int sz;
19195049Sis 	uchar_t *ib;
19205049Sis 	uchar_t *ibtail;
19215049Sis 	uchar_t *ob;
19225049Sis 	uchar_t *obtail;
19235049Sis 	boolean_t do_not_ignore_null;
19245049Sis 	boolean_t do_not_ignore_invalid;
19255049Sis 	boolean_t is_it_toupper;
19265049Sis 	boolean_t is_it_tolower;
19275049Sis 	boolean_t canonical_decomposition;
19285049Sis 	boolean_t compatibility_decomposition;
19295049Sis 	boolean_t canonical_composition;
19305049Sis 	size_t ret_val;
19315049Sis 	size_t i;
19325049Sis 	size_t j;
19335049Sis 	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
19345049Sis 	u8_normalization_states_t state;
19355049Sis 
19365049Sis 	if (unicode_version > U8_UNICODE_LATEST) {
1937*7012Sis 		*errnum = ERANGE;
19385049Sis 		return ((size_t)-1);
19395049Sis 	}
19405049Sis 
19415049Sis 	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
19425049Sis 	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
1943*7012Sis 		*errnum = EBADF;
19445049Sis 		return ((size_t)-1);
19455049Sis 	}
19465049Sis 
19475049Sis 	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
19485049Sis 	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
19495049Sis 	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
1950*7012Sis 		*errnum = EBADF;
19515049Sis 		return ((size_t)-1);
19525049Sis 	}
19535049Sis 
19545049Sis 	if (inarray == NULL || *inlen == 0)
19555049Sis 		return (0);
19565049Sis 
19575049Sis 	if (outarray == NULL) {
1958*7012Sis 		*errnum = E2BIG;
19595049Sis 		return ((size_t)-1);
19605049Sis 	}
19615049Sis 
19625049Sis 	ib = (uchar_t *)inarray;
19635049Sis 	ob = (uchar_t *)outarray;
19645049Sis 	ibtail = ib + *inlen;
19655049Sis 	obtail = ob + *outlen;
19665049Sis 
19675049Sis 	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
19685049Sis 	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
19695049Sis 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
19705049Sis 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
19715049Sis 
19725049Sis 	ret_val = 0;
19735049Sis 
19745049Sis 	/*
19755049Sis 	 * If we don't have a normalization flag set, we do the simple case
19765049Sis 	 * conversion based text preparation separately below. Text
19775049Sis 	 * preparation involving Normalization will be done in the false task
19785049Sis 	 * block, again, separately since it will take much more time and
19795049Sis 	 * resource than doing simple case conversions.
19805049Sis 	 */
19815049Sis 	if (f == 0) {
19825049Sis 		while (ib < ibtail) {
19835049Sis 			if (*ib == '\0' && do_not_ignore_null)
19845049Sis 				break;
19855049Sis 
19865049Sis 			sz = u8_number_of_bytes[*ib];
19875049Sis 
19885049Sis 			if (sz < 0) {
19895049Sis 				if (do_not_ignore_invalid) {
1990*7012Sis 					*errnum = EILSEQ;
19915049Sis 					ret_val = (size_t)-1;
19925049Sis 					break;
19935049Sis 				}
19945049Sis 
19955049Sis 				sz = 1;
19965049Sis 				ret_val++;
19975049Sis 			}
19985049Sis 
19995049Sis 			if (sz == 1) {
20005049Sis 				if (ob >= obtail) {
2001*7012Sis 					*errnum = E2BIG;
20025049Sis 					ret_val = (size_t)-1;
20035049Sis 					break;
20045049Sis 				}
20055049Sis 
20065049Sis 				if (is_it_toupper)
20075049Sis 					*ob = U8_ASCII_TOUPPER(*ib);
20085049Sis 				else if (is_it_tolower)
20095049Sis 					*ob = U8_ASCII_TOLOWER(*ib);
20105049Sis 				else
20115049Sis 					*ob = *ib;
20125049Sis 				ib++;
20135049Sis 				ob++;
20145049Sis 			} else if ((ib + sz) > ibtail) {
20155049Sis 				if (do_not_ignore_invalid) {
2016*7012Sis 					*errnum = EINVAL;
20175049Sis 					ret_val = (size_t)-1;
20185049Sis 					break;
20195049Sis 				}
20205049Sis 
20215049Sis 				if ((obtail - ob) < (ibtail - ib)) {
2022*7012Sis 					*errnum = E2BIG;
20235049Sis 					ret_val = (size_t)-1;
20245049Sis 					break;
20255049Sis 				}
20265049Sis 
20275049Sis 				/*
20285049Sis 				 * We treat the remaining incomplete character
20295049Sis 				 * bytes as a character.
20305049Sis 				 */
20315049Sis 				ret_val++;
20325049Sis 
20335049Sis 				while (ib < ibtail)
20345049Sis 					*ob++ = *ib++;
20355049Sis 			} else {
20365049Sis 				if (is_it_toupper || is_it_tolower) {
20375049Sis 					i = do_case_conv(unicode_version, u8s,
20385049Sis 					    ib, sz, is_it_toupper);
20395049Sis 
20405049Sis 					if ((obtail - ob) < i) {
2041*7012Sis 						*errnum = E2BIG;
20425049Sis 						ret_val = (size_t)-1;
20435049Sis 						break;
20445049Sis 					}
20455049Sis 
20465049Sis 					ib += sz;
20475049Sis 
20485049Sis 					for (sz = 0; sz < i; sz++)
20495049Sis 						*ob++ = u8s[sz];
20505049Sis 				} else {
20515049Sis 					if ((obtail - ob) < sz) {
2052*7012Sis 						*errnum = E2BIG;
20535049Sis 						ret_val = (size_t)-1;
20545049Sis 						break;
20555049Sis 					}
20565049Sis 
20575049Sis 					for (i = 0; i < sz; i++)
20585049Sis 						*ob++ = *ib++;
20595049Sis 				}
20605049Sis 			}
20615049Sis 		}
20625049Sis 	} else {
20635049Sis 		canonical_decomposition = flag & U8_CANON_DECOMP;
20645049Sis 		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
20655049Sis 		canonical_composition = flag & U8_CANON_COMP;
20665049Sis 
20675049Sis 		while (ib < ibtail) {
20685049Sis 			if (*ib == '\0' && do_not_ignore_null)
20695049Sis 				break;
20705049Sis 
20715049Sis 			/*
20725049Sis 			 * If the current character is a 7-bit ASCII
20735049Sis 			 * character and it is the last character, or,
20745049Sis 			 * if the current character is a 7-bit ASCII
20755049Sis 			 * character and the next character is also a 7-bit
20765049Sis 			 * ASCII character, then, we copy over this
20775049Sis 			 * character without going through collect_a_seq().
20785049Sis 			 *
20795049Sis 			 * In any other cases, we need to look further with
20805049Sis 			 * the collect_a_seq() function.
20815049Sis 			 */
20825049Sis 			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
20835049Sis 			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
20845049Sis 				if (ob >= obtail) {
2085*7012Sis 					*errnum = E2BIG;
20865049Sis 					ret_val = (size_t)-1;
20875049Sis 					break;
20885049Sis 				}
20895049Sis 
20905049Sis 				if (is_it_toupper)
20915049Sis 					*ob = U8_ASCII_TOUPPER(*ib);
20925049Sis 				else if (is_it_tolower)
20935049Sis 					*ob = U8_ASCII_TOLOWER(*ib);
20945049Sis 				else
20955049Sis 					*ob = *ib;
20965049Sis 				ib++;
20975049Sis 				ob++;
20985049Sis 			} else {
2099*7012Sis 				*errnum = 0;
21005049Sis 				state = U8_STATE_START;
21015049Sis 
21025049Sis 				j = collect_a_seq(unicode_version, u8s,
21035049Sis 				    &ib, ibtail,
21045049Sis 				    is_it_toupper,
21055049Sis 				    is_it_tolower,
21065049Sis 				    canonical_decomposition,
21075049Sis 				    compatibility_decomposition,
21085049Sis 				    canonical_composition,
2109*7012Sis 				    errnum, &state);
21105049Sis 
2111*7012Sis 				if (*errnum && do_not_ignore_invalid) {
21125049Sis 					ret_val = (size_t)-1;
21135049Sis 					break;
21145049Sis 				}
21155049Sis 
21165049Sis 				if ((obtail - ob) < j) {
2117*7012Sis 					*errnum = E2BIG;
21185049Sis 					ret_val = (size_t)-1;
21195049Sis 					break;
21205049Sis 				}
21215049Sis 
21225049Sis 				for (i = 0; i < j; i++)
21235049Sis 					*ob++ = u8s[i];
21245049Sis 			}
21255049Sis 		}
21265049Sis 	}
21275049Sis 
21285049Sis 	*inlen = ibtail - ib;
21295049Sis 	*outlen = obtail - ob;
21305049Sis 
21315049Sis 	return (ret_val);
21325049Sis }
2133