/*
 * Origin: illumos-gate, usr/src/common/unicode/u8_textprep.c
 * (revision f137b22e734e85642da3e56e8b94da3f5f027c73)
 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2022 MNX Cloud, Inc.
 */

/*
 * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
 *
 * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
 * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
 * the section 3C man pages.
 * Interface stability: Committed.
 */

#include <sys/types.h>
#ifdef	_KERNEL
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#else
#include <sys/u8_textprep.h>
#include <string.h>
#include <strings.h>
#endif	/* _KERNEL */
#include <sys/byteorder.h>
#include <sys/errno.h>
#include <sys/u8_textprep_data.h>

/* The maximum possible number of bytes in a UTF-8 character. */
#define	U8_MB_CUR_MAX			(4)

/*
 * The maximum number of bytes needed for a UTF-8 character to cover
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* The maximum possible number of bytes in a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * The maximum number of characters in a combining/conjoining sequence and
 * the actual upper bound limit of a combining/conjoining sequence.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/* The combining class value for a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)

/*
 * Hangul related macros.
 *
 * The first and the last of the Hangul syllables, Hangul Jamo Leading
 * consonants, Vowels, and optional Trailing consonants as Unicode scalar
 * values.
 *
 * Please note that U8_HANGUL_JAMO_T_FIRST is 0x11A7 below and not the
 * actual U+11A8. The trailing consonant is optional, so the value is
 * pre-decremented by one; this is also why U8_HANGUL_JAMO_T() tests with
 * a strict '>' against it while the L and V tests use '>='.
 *
 * Each of the 19 modern leading consonants has a total of 588 possible
 * syllables since Hangul has 21 modern vowels and 27 modern trailing
 * consonants plus 1 for the no trailing consonant case,
 * i.e., 21 x 28 = 588.
 *
 * Please bear in mind that U8_HANGUL_JAMO_1ST_BYTE can be used to check
 * whether a character may be a Hangul Jamo, but a match does not guarantee
 * that it is one; it only means that it most likely is.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the scalar value (b) as a three byte UTF-8 character at
 * (s)[(i)], (s)[(j)], and (s)[(k)].
 *
 * This expands to three separate statements and supplies its own trailing
 * semicolon; do not use it as the unbraced body of an if/else/loop.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	(s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
	(s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
	(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));

#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

/* A vowel (u) can be composed onto a state (s) that holds a lone L. */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

/* A trailing consonant (u) can be composed onto a state (s) holding LV. */
#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))

/* The types of decomposition mappings. */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/* The indicator for 16-bit table. */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)

/*
 * The following are some convenience macros.
 *
 * U8_PUT_3BYTES_INTO_UTF32, U8_SIMPLE_SWAP, and U8_SWAP_COMB_MARKS expand
 * to full statements that carry their own trailing semicolon; do not use
 * them as the unbraced body of an if/else/loop.
 */

/* Assemble the scalar value of a three byte UTF-8 character into (u). */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	(u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \
		((uint32_t)(b3) & 0x3F);

/* Exchange (a) and (b) using (t) as scratch storage. */
#define	U8_SIMPLE_SWAP(a, b, t) \
	(t) = (a); \
	(a) = (b); \
	(b) = (t);

#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

#define	U8_ISASCII(c)			(((uchar_t)(c)) < 0x80U)

/*
 * The following macro assumes that the two characters that are to be
 * swapped are adjacent to each other and 'a' comes before 'b'.
 *
 * If the assumptions are not met, then, the macro will fail.
 *
 * The macro is not self-contained: it uses the names k, u8s, u8t, start,
 * disp, comb_class, and tc from the scope in which it is expanded. The
 * bytes of 'a' are saved in u8t, the bytes of 'b' are moved down to where
 * 'a' started, start[b] is recomputed, the saved bytes are written back
 * right after them, and finally the comb_class and disp entries of the
 * two are exchanged.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	for (k = 0; k < disp[(a)]; k++) \
		u8t[k] = u8s[start[(a)] + k]; \
	for (k = 0; k < disp[(b)]; k++) \
		u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
	start[(b)] = start[(a)] + disp[(b)]; \
	for (k = 0; k < disp[(a)]; k++) \
		u8s[start[(b)] + k] = u8t[k]; \
	U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
	U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);

/*
 * The possible states during normalization. The HANGUL_* states record
 * which Jamo pieces (L, V, T) have been seen so far; they are what the
 * U8_HANGUL_COMPOSABLE_*() macros compare against.
 */
typedef enum {
	U8_STATE_START = 0,
	U8_STATE_HANGUL_L = 1,
	U8_STATE_HANGUL_LV = 2,
	U8_STATE_HANGUL_LVT = 3,
	U8_STATE_HANGUL_V = 4,
	U8_STATE_HANGUL_T = 5,
	U8_STATE_COMBINING_MARK = 6
} u8_normalization_states_t;

1934703203dSis /*
1944703203dSis  * The three vectors at below are used to check bytes of a given UTF-8
1954703203dSis  * character are valid and not containing any malformed byte values.
1964703203dSis  *
1974703203dSis  * We used to have a quite relaxed UTF-8 binary representation but then there
1984703203dSis  * was some security related issues and so the Unicode Consortium defined
1994703203dSis  * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
2004703203dSis  * one more time at the Unicode 3.2. The following three tables are based on
2014703203dSis  * that.
2024703203dSis  */
2034703203dSis 
2044703203dSis #define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)
2054703203dSis 
2064703203dSis #define	I_				U8_ILLEGAL_CHAR
2074703203dSis #define	O_				U8_OUT_OF_RANGE_CHAR
2084703203dSis 
2094703203dSis const int8_t u8_number_of_bytes[0x100] = {
2104703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2114703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2124703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2134703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2144703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2154703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2164703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2174703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2184703203dSis 
2194703203dSis /*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
2204703203dSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2214703203dSis 
2224703203dSis /*	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
2234703203dSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2244703203dSis 
2254703203dSis /*	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
2264703203dSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2274703203dSis 
2284703203dSis /*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
2294703203dSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2304703203dSis 
2314703203dSis /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
2324703203dSis 	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
2334703203dSis 
2344703203dSis /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
2354703203dSis 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
2364703203dSis 
2374703203dSis /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
2384703203dSis 	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
2394703203dSis 
2404703203dSis /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
2414703203dSis 	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
2424703203dSis };
2434703203dSis 
2444703203dSis #undef	I_
2454703203dSis #undef	O_
2464703203dSis 
/*
 * Indexed by the first byte of a multi-byte character: the smallest value
 * the second byte may have. Entries for bytes that do not start a
 * multi-byte character are 0 and are not meant to be consulted.
 *
 * The entries that differ from the generic 0x80 are E0 (0xa0) and
 * F0 (0x90).
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
/*	00 - 0F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	10 - 1F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	20 - 2F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	30 - 3F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	40 - 4F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	50 - 5F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	60 - 6F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	70 - 7F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	80 - 8F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	90 - 9F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	A0 - AF  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	B0 - BF  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
/*	F8    F9    FA    FB    FC    FD    FE    FF    */
	0,    0,    0,    0,    0,    0,    0,    0,
};

/*
 * Indexed by the first byte of a multi-byte character: the largest value
 * the second byte may have. Entries for bytes that do not start a
 * multi-byte character are 0 and are not meant to be consulted.
 *
 * The entries that differ from the generic 0xbf are ED (0x9f) and
 * F4 (0x8f).
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
/*	00 - 0F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	10 - 1F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	20 - 2F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	30 - 3F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	40 - 4F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	50 - 5F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	60 - 6F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	70 - 7F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	80 - 8F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	90 - 9F  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	A0 - AF  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	B0 - BF  */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
/*	F8    F9    FA    FB    FC    FD    FE    FF    */
	0,    0,    0,    0,    0,    0,    0,    0,
};

3324703203dSis /*
3334703203dSis  * The u8_validate() validates on the given UTF-8 character string and
3344703203dSis  * calculate the byte length. It is quite similar to mblen(3C) except that
3354703203dSis  * this will validate against the list of characters if required and
3364703203dSis  * specific to UTF-8 and Unicode.
3374703203dSis  */
3384703203dSis int
u8_validate(char * u8str,size_t n,char ** list,int flag,int * errnum)33985bb5f1dSis u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
3404703203dSis {
3414703203dSis 	uchar_t *ib;
3424703203dSis 	uchar_t *ibtail;
3434703203dSis 	uchar_t **p;
3444703203dSis 	uchar_t *s1;
3454703203dSis 	uchar_t *s2;
3464703203dSis 	uchar_t f;
3474703203dSis 	int sz;
3484703203dSis 	size_t i;
3494703203dSis 	int ret_val;
3504703203dSis 	boolean_t second;
3514703203dSis 	boolean_t no_need_to_validate_entire;
3524703203dSis 	boolean_t check_additional;
3534703203dSis 	boolean_t validate_ucs2_range_only;
3544703203dSis 
3554703203dSis 	if (! u8str)
3564703203dSis 		return (0);
3574703203dSis 
3584703203dSis 	ib = (uchar_t *)u8str;
3594703203dSis 	ibtail = ib + n;
3604703203dSis 
3614703203dSis 	ret_val = 0;
3624703203dSis 
3634703203dSis 	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
3644703203dSis 	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
3654703203dSis 	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
3664703203dSis 
3674703203dSis 	while (ib < ibtail) {
3684703203dSis 		/*
3694703203dSis 		 * The first byte of a UTF-8 character tells how many
3704703203dSis 		 * bytes will follow for the character. If the first byte
3714703203dSis 		 * is an illegal byte value or out of range value, we just
3724703203dSis 		 * return -1 with an appropriate error number.
3734703203dSis 		 */
3744703203dSis 		sz = u8_number_of_bytes[*ib];
3754703203dSis 		if (sz == U8_ILLEGAL_CHAR) {
37685bb5f1dSis 			*errnum = EILSEQ;
3774703203dSis 			return (-1);
3784703203dSis 		}
3794703203dSis 
3804703203dSis 		if (sz == U8_OUT_OF_RANGE_CHAR ||
3814703203dSis 		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
38285bb5f1dSis 			*errnum = ERANGE;
3834703203dSis 			return (-1);
3844703203dSis 		}
3854703203dSis 
3864703203dSis 		/*
3874703203dSis 		 * If we don't have enough bytes to check on, that's also
3884703203dSis 		 * an error. As you can see, we give illegal byte sequence
3894703203dSis 		 * checking higher priority then EINVAL cases.
3904703203dSis 		 */
3914703203dSis 		if ((ibtail - ib) < sz) {
39285bb5f1dSis 			*errnum = EINVAL;
3934703203dSis 			return (-1);
3944703203dSis 		}
3954703203dSis 
3964703203dSis 		if (sz == 1) {
3974703203dSis 			ib++;
3984703203dSis 			ret_val++;
3994703203dSis 		} else {
4004703203dSis 			/*
4014703203dSis 			 * Check on the multi-byte UTF-8 character. For more
4024703203dSis 			 * details on this, see comment added for the used
4034703203dSis 			 * data structures at the beginning of the file.
4044703203dSis 			 */
4054703203dSis 			f = *ib++;
4064703203dSis 			ret_val++;
4074703203dSis 			second = B_TRUE;
4084703203dSis 			for (i = 1; i < sz; i++) {
4094703203dSis 				if (second) {
4104703203dSis 					if (*ib < u8_valid_min_2nd_byte[f] ||
4114703203dSis 					    *ib > u8_valid_max_2nd_byte[f]) {
41285bb5f1dSis 						*errnum = EILSEQ;
4134703203dSis 						return (-1);
4144703203dSis 					}
4154703203dSis 					second = B_FALSE;
4164703203dSis 				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
41785bb5f1dSis 					*errnum = EILSEQ;
4184703203dSis 					return (-1);
4194703203dSis 				}
4204703203dSis 				ib++;
4214703203dSis 				ret_val++;
4224703203dSis 			}
4234703203dSis 		}
4244703203dSis 
4254703203dSis 		if (check_additional) {
4264703203dSis 			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
4274703203dSis 				s1 = ib - sz;
4284703203dSis 				s2 = p[i];
4294703203dSis 				while (s1 < ib) {
4304703203dSis 					if (*s1 != *s2 || *s2 == '\0')
4314703203dSis 						break;
4324703203dSis 					s1++;
4334703203dSis 					s2++;
4344703203dSis 				}
4354703203dSis 
4364703203dSis 				if (s1 >= ib && *s2 == '\0') {
43785bb5f1dSis 					*errnum = EBADF;
4384703203dSis 					return (-1);
4394703203dSis 				}
4404703203dSis 			}
4414703203dSis 		}
4424703203dSis 
4434703203dSis 		if (no_need_to_validate_entire)
4444703203dSis 			break;
4454703203dSis 	}
4464703203dSis 
4474703203dSis 	return (ret_val);
4484703203dSis }
4494703203dSis 
4504703203dSis /*
4514703203dSis  * The do_case_conv() looks at the mapping tables and returns found
4524703203dSis  * bytes if any. If not found, the input bytes are returned. The function
4534703203dSis  * always terminate the return bytes with a null character assuming that
4544703203dSis  * there are plenty of room to do so.
4554703203dSis  *
4564703203dSis  * The case conversions are simple case conversions mapping a character to
4574703203dSis  * another character as specified in the Unicode data. The byte size of
4584703203dSis  * the mapped character could be different from that of the input character.
4594703203dSis  *
4604703203dSis  * The return value is the byte length of the returned character excluding
4614703203dSis  * the terminating null byte.
4624703203dSis  */
4634703203dSis static size_t
do_case_conv(int uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t is_it_toupper)4644703203dSis do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
4654703203dSis {
4664703203dSis 	size_t i;
4674703203dSis 	uint16_t b1 = 0;
4684703203dSis 	uint16_t b2 = 0;
4694703203dSis 	uint16_t b3 = 0;
4704703203dSis 	uint16_t b3_tbl;
4714703203dSis 	uint16_t b3_base;
4724703203dSis 	uint16_t b4 = 0;
4734703203dSis 	size_t start_id;
4744703203dSis 	size_t end_id;
4754703203dSis 
4764703203dSis 	/*
4774703203dSis 	 * At this point, the only possible values for sz are 2, 3, and 4.
4784703203dSis 	 * The u8s should point to a vector that is well beyond the size of
4794703203dSis 	 * 5 bytes.
4804703203dSis 	 */
4814703203dSis 	if (sz == 2) {
4824703203dSis 		b3 = u8s[0] = s[0];
4834703203dSis 		b4 = u8s[1] = s[1];
4844703203dSis 	} else if (sz == 3) {
4854703203dSis 		b2 = u8s[0] = s[0];
4864703203dSis 		b3 = u8s[1] = s[1];
4874703203dSis 		b4 = u8s[2] = s[2];
4884703203dSis 	} else if (sz == 4) {
4894703203dSis 		b1 = u8s[0] = s[0];
4904703203dSis 		b2 = u8s[1] = s[1];
4914703203dSis 		b3 = u8s[2] = s[2];
4924703203dSis 		b4 = u8s[3] = s[3];
4934703203dSis 	} else {
4944703203dSis 		/* This is not possible but just in case as a fallback. */
4954703203dSis 		if (is_it_toupper)
4964703203dSis 			*u8s = U8_ASCII_TOUPPER(*s);
4974703203dSis 		else
4984703203dSis 			*u8s = U8_ASCII_TOLOWER(*s);
4994703203dSis 		u8s[1] = '\0';
5004703203dSis 
5014703203dSis 		return (1);
5024703203dSis 	}
5034703203dSis 	u8s[sz] = '\0';
5044703203dSis 
5054703203dSis 	/*
5064703203dSis 	 * Let's find out if we have a corresponding character.
5074703203dSis 	 */
5084703203dSis 	b1 = u8_common_b1_tbl[uv][b1];
5094703203dSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
5104703203dSis 		return ((size_t)sz);
5114703203dSis 
5124703203dSis 	b2 = u8_case_common_b2_tbl[uv][b1][b2];
5134703203dSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
5144703203dSis 		return ((size_t)sz);
5154703203dSis 
5164703203dSis 	if (is_it_toupper) {
5174703203dSis 		b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
5184703203dSis 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5194703203dSis 			return ((size_t)sz);
5204703203dSis 
5214703203dSis 		start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
5224703203dSis 		end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
5234703203dSis 
5244703203dSis 		/* Either there is no match or an error at the table. */
5254703203dSis 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5264703203dSis 			return ((size_t)sz);
5274703203dSis 
5284703203dSis 		b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
5294703203dSis 
5304703203dSis 		for (i = 0; start_id < end_id; start_id++)
5314703203dSis 			u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
5324703203dSis 	} else {
5334703203dSis 		b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
5344703203dSis 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5354703203dSis 			return ((size_t)sz);
5364703203dSis 
5374703203dSis 		start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
5384703203dSis 		end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
5394703203dSis 
5404703203dSis 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5414703203dSis 			return ((size_t)sz);
5424703203dSis 
5434703203dSis 		b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
5444703203dSis 
5454703203dSis 		for (i = 0; start_id < end_id; start_id++)
5464703203dSis 			u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
5474703203dSis 	}
5484703203dSis 
5494703203dSis 	/*
5504703203dSis 	 * If i is still zero, that means there is no corresponding character.
5514703203dSis 	 */
5524703203dSis 	if (i == 0)
5534703203dSis 		return ((size_t)sz);
5544703203dSis 
5554703203dSis 	u8s[i] = '\0';
5564703203dSis 
5574703203dSis 	return (i);
5584703203dSis }
5594703203dSis 
5604703203dSis /*
5614703203dSis  * The do_case_compare() function compares the two input strings, s1 and s2,
5624703203dSis  * one character at a time doing case conversions if applicable and return
5634703203dSis  * the comparison result as like strcmp().
5644703203dSis  *
5654703203dSis  * Since, in empirical sense, most of text data are 7-bit ASCII characters,
5664703203dSis  * we treat the 7-bit ASCII characters as a special case trying to yield
5674703203dSis  * faster processing time.
5684703203dSis  */
5694703203dSis static int
do_case_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,boolean_t is_it_toupper,int * errnum)5704703203dSis do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
57185bb5f1dSis     size_t n2, boolean_t is_it_toupper, int *errnum)
5724703203dSis {
5734703203dSis 	int f;
5744703203dSis 	int sz1;
5754703203dSis 	int sz2;
5764703203dSis 	size_t j;
5774703203dSis 	size_t i1;
5784703203dSis 	size_t i2;
5794703203dSis 	uchar_t u8s1[U8_MB_CUR_MAX + 1];
5804703203dSis 	uchar_t u8s2[U8_MB_CUR_MAX + 1];
5814703203dSis 
5824703203dSis 	i1 = i2 = 0;
5834703203dSis 	while (i1 < n1 && i2 < n2) {
5844703203dSis 		/*
5854703203dSis 		 * Find out what would be the byte length for this UTF-8
5864703203dSis 		 * character at string s1 and also find out if this is
5874703203dSis 		 * an illegal start byte or not and if so, issue a proper
58885bb5f1dSis 		 * error number and yet treat this byte as a character.
5894703203dSis 		 */
5904703203dSis 		sz1 = u8_number_of_bytes[*s1];
5914703203dSis 		if (sz1 < 0) {
59285bb5f1dSis 			*errnum = EILSEQ;
5934703203dSis 			sz1 = 1;
5944703203dSis 		}
5954703203dSis 
5964703203dSis 		/*
5974703203dSis 		 * For 7-bit ASCII characters mainly, we do a quick case
5984703203dSis 		 * conversion right at here.
5994703203dSis 		 *
6004703203dSis 		 * If we don't have enough bytes for this character, issue
6014703203dSis 		 * an EINVAL error and use what are available.
6024703203dSis 		 *
6034703203dSis 		 * If we have enough bytes, find out if there is
6044703203dSis 		 * a corresponding uppercase character and if so, copy over
6054703203dSis 		 * the bytes for a comparison later. If there is no
6064703203dSis 		 * corresponding uppercase character, then, use what we have
6074703203dSis 		 * for the comparison.
6084703203dSis 		 */
6094703203dSis 		if (sz1 == 1) {
6104703203dSis 			if (is_it_toupper)
6114703203dSis 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
6124703203dSis 			else
6134703203dSis 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
6144703203dSis 			s1++;
6154703203dSis 			u8s1[1] = '\0';
6164703203dSis 		} else if ((i1 + sz1) > n1) {
61785bb5f1dSis 			*errnum = EINVAL;
6184703203dSis 			for (j = 0; (i1 + j) < n1; )
6194703203dSis 				u8s1[j++] = *s1++;
6204703203dSis 			u8s1[j] = '\0';
6214703203dSis 		} else {
6224703203dSis 			(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
6234703203dSis 			s1 += sz1;
6244703203dSis 		}
6254703203dSis 
6264703203dSis 		/* Do the same for the string s2. */
6274703203dSis 		sz2 = u8_number_of_bytes[*s2];
6284703203dSis 		if (sz2 < 0) {
62985bb5f1dSis 			*errnum = EILSEQ;
6304703203dSis 			sz2 = 1;
6314703203dSis 		}
6324703203dSis 
6334703203dSis 		if (sz2 == 1) {
6344703203dSis 			if (is_it_toupper)
6354703203dSis 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
6364703203dSis 			else
6374703203dSis 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
6384703203dSis 			s2++;
6394703203dSis 			u8s2[1] = '\0';
6404703203dSis 		} else if ((i2 + sz2) > n2) {
64185bb5f1dSis 			*errnum = EINVAL;
6424703203dSis 			for (j = 0; (i2 + j) < n2; )
6434703203dSis 				u8s2[j++] = *s2++;
6444703203dSis 			u8s2[j] = '\0';
6454703203dSis 		} else {
6464703203dSis 			(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
6474703203dSis 			s2 += sz2;
6484703203dSis 		}
6494703203dSis 
6504703203dSis 		/* Now compare the two characters. */
6514703203dSis 		if (sz1 == 1 && sz2 == 1) {
6524703203dSis 			if (*u8s1 > *u8s2)
6534703203dSis 				return (1);
6544703203dSis 			if (*u8s1 < *u8s2)
6554703203dSis 				return (-1);
6564703203dSis 		} else {
6574703203dSis 			f = strcmp((const char *)u8s1, (const char *)u8s2);
6584703203dSis 			if (f != 0)
6594703203dSis 				return (f);
6604703203dSis 		}
6614703203dSis 
6624703203dSis 		/*
6634703203dSis 		 * They were the same. Let's move on to the next
6644703203dSis 		 * characters then.
6654703203dSis 		 */
6664703203dSis 		i1 += sz1;
6674703203dSis 		i2 += sz2;
6684703203dSis 	}
6694703203dSis 
6704703203dSis 	/*
6714703203dSis 	 * We compared until the end of either or both strings.
6724703203dSis 	 *
6734703203dSis 	 * If we reached to or went over the ends for the both, that means
6744703203dSis 	 * they are the same.
6754703203dSis 	 *
6764703203dSis 	 * If we reached only one of the two ends, that means the other string
6774703203dSis 	 * has something which then the fact can be used to determine
6784703203dSis 	 * the return value.
6794703203dSis 	 */
6804703203dSis 	if (i1 >= n1) {
6814703203dSis 		if (i2 >= n2)
6824703203dSis 			return (0);
6834703203dSis 		return (-1);
6844703203dSis 	}
6854703203dSis 	return (1);
6864703203dSis }
6874703203dSis 
6884703203dSis /*
6894703203dSis  * The combining_class() function checks on the given bytes and find out
6904703203dSis  * the corresponding Unicode combining class value. The return value 0 means
6914703203dSis  * it is a Starter. Any illegal UTF-8 character will also be treated as
6924703203dSis  * a Starter.
6934703203dSis  */
6944703203dSis static uchar_t
combining_class(size_t uv,uchar_t * s,size_t sz)6954703203dSis combining_class(size_t uv, uchar_t *s, size_t sz)
6964703203dSis {
6974703203dSis 	uint16_t b1 = 0;
6984703203dSis 	uint16_t b2 = 0;
6994703203dSis 	uint16_t b3 = 0;
7004703203dSis 	uint16_t b4 = 0;
7014703203dSis 
7024703203dSis 	if (sz == 1 || sz > 4)
7034703203dSis 		return (0);
7044703203dSis 
7054703203dSis 	if (sz == 2) {
7064703203dSis 		b3 = s[0];
7074703203dSis 		b4 = s[1];
7084703203dSis 	} else if (sz == 3) {
7094703203dSis 		b2 = s[0];
7104703203dSis 		b3 = s[1];
7114703203dSis 		b4 = s[2];
7124703203dSis 	} else if (sz == 4) {
7134703203dSis 		b1 = s[0];
7144703203dSis 		b2 = s[1];
7154703203dSis 		b3 = s[2];
7164703203dSis 		b4 = s[3];
7174703203dSis 	}
7184703203dSis 
7194703203dSis 	b1 = u8_common_b1_tbl[uv][b1];
7204703203dSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
7214703203dSis 		return (0);
7224703203dSis 
7234703203dSis 	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
7244703203dSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
7254703203dSis 		return (0);
7264703203dSis 
7274703203dSis 	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
7284703203dSis 	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
7294703203dSis 		return (0);
7304703203dSis 
7314703203dSis 	return (u8_combining_class_b4_tbl[uv][b3][b4]);
7324703203dSis }
7334703203dSis 
7344703203dSis /*
7354703203dSis  * The do_decomp() function finds out a matching decomposition if any
7364703203dSis  * and return. If there is no match, the input bytes are copied and returned.
7374703203dSis  * The function also checks if there is a Hangul, decomposes it if necessary
7384703203dSis  * and returns.
7394703203dSis  *
7404703203dSis  * To save time, a single byte 7-bit ASCII character should be handled by
7414703203dSis  * the caller.
7424703203dSis  *
7434703203dSis  * The function returns the number of bytes returned sans always terminating
7444703203dSis  * the null byte. It will also return a state that will tell if there was
7454703203dSis  * a Hangul character decomposed which then will be used by the caller.
7464703203dSis  */
7474703203dSis static size_t
do_decomp(size_t uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t canonical_decomposition,u8_normalization_states_t * state)7484703203dSis do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
7494703203dSis     boolean_t canonical_decomposition, u8_normalization_states_t *state)
7504703203dSis {
7514703203dSis 	uint16_t b1 = 0;
7524703203dSis 	uint16_t b2 = 0;
7534703203dSis 	uint16_t b3 = 0;
7544703203dSis 	uint16_t b3_tbl;
7554703203dSis 	uint16_t b3_base;
7564703203dSis 	uint16_t b4 = 0;
7574703203dSis 	size_t start_id;
7584703203dSis 	size_t end_id;
7594703203dSis 	size_t i;
7604703203dSis 	uint32_t u1;
7614703203dSis 
7624703203dSis 	if (sz == 2) {
7634703203dSis 		b3 = u8s[0] = s[0];
7644703203dSis 		b4 = u8s[1] = s[1];
7654703203dSis 		u8s[2] = '\0';
7664703203dSis 	} else if (sz == 3) {
7674703203dSis 		/* Convert it to a Unicode scalar value. */
7684703203dSis 		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
7694703203dSis 
7704703203dSis 		/*
7714703203dSis 		 * If this is a Hangul syllable, we decompose it into
7724703203dSis 		 * a leading consonant, a vowel, and an optional trailing
7734703203dSis 		 * consonant and then return.
7744703203dSis 		 */
7754703203dSis 		if (U8_HANGUL_SYLLABLE(u1)) {
7764703203dSis 			u1 -= U8_HANGUL_SYL_FIRST;
7774703203dSis 
7784703203dSis 			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
7794703203dSis 			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
7804703203dSis 			    / U8_HANGUL_T_COUNT;
7814703203dSis 			b3 = u1 % U8_HANGUL_T_COUNT;
7824703203dSis 
7834703203dSis 			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
7844703203dSis 			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
7854703203dSis 			if (b3) {
7864703203dSis 				b3 += U8_HANGUL_JAMO_T_FIRST;
7874703203dSis 				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
7884703203dSis 
7894703203dSis 				u8s[9] = '\0';
7904703203dSis 				*state = U8_STATE_HANGUL_LVT;
7914703203dSis 				return (9);
7924703203dSis 			}
7934703203dSis 
7944703203dSis 			u8s[6] = '\0';
7954703203dSis 			*state = U8_STATE_HANGUL_LV;
7964703203dSis 			return (6);
7974703203dSis 		}
7984703203dSis 
7994703203dSis 		b2 = u8s[0] = s[0];
8004703203dSis 		b3 = u8s[1] = s[1];
8014703203dSis 		b4 = u8s[2] = s[2];
8024703203dSis 		u8s[3] = '\0';
8034703203dSis 
8044703203dSis 		/*
8054703203dSis 		 * If this is a Hangul Jamo, we know there is nothing
8064703203dSis 		 * further that we can decompose.
8074703203dSis 		 */
8084703203dSis 		if (U8_HANGUL_JAMO_L(u1)) {
8094703203dSis 			*state = U8_STATE_HANGUL_L;
8104703203dSis 			return (3);
8114703203dSis 		}
8124703203dSis 
8134703203dSis 		if (U8_HANGUL_JAMO_V(u1)) {
8144703203dSis 			if (*state == U8_STATE_HANGUL_L)
8154703203dSis 				*state = U8_STATE_HANGUL_LV;
8164703203dSis 			else
8174703203dSis 				*state = U8_STATE_HANGUL_V;
8184703203dSis 			return (3);
8194703203dSis 		}
8204703203dSis 
8214703203dSis 		if (U8_HANGUL_JAMO_T(u1)) {
8224703203dSis 			if (*state == U8_STATE_HANGUL_LV)
8234703203dSis 				*state = U8_STATE_HANGUL_LVT;
8244703203dSis 			else
8254703203dSis 				*state = U8_STATE_HANGUL_T;
8264703203dSis 			return (3);
8274703203dSis 		}
8284703203dSis 	} else if (sz == 4) {
8294703203dSis 		b1 = u8s[0] = s[0];
8304703203dSis 		b2 = u8s[1] = s[1];
8314703203dSis 		b3 = u8s[2] = s[2];
8324703203dSis 		b4 = u8s[3] = s[3];
8334703203dSis 		u8s[4] = '\0';
8344703203dSis 	} else {
8354703203dSis 		/*
8364703203dSis 		 * This is a fallback and should not happen if the function
8374703203dSis 		 * was called properly.
8384703203dSis 		 */
8394703203dSis 		u8s[0] = s[0];
8404703203dSis 		u8s[1] = '\0';
8414703203dSis 		*state = U8_STATE_START;
8424703203dSis 		return (1);
8434703203dSis 	}
8444703203dSis 
8454703203dSis 	/*
8464703203dSis 	 * At this point, this rountine does not know what it would get.
8474703203dSis 	 * The caller should sort it out if the state isn't a Hangul one.
8484703203dSis 	 */
8494703203dSis 	*state = U8_STATE_START;
8504703203dSis 
8514703203dSis 	/* Try to find matching decomposition mapping byte sequence. */
8524703203dSis 	b1 = u8_common_b1_tbl[uv][b1];
8534703203dSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
8544703203dSis 		return ((size_t)sz);
8554703203dSis 
8564703203dSis 	b2 = u8_decomp_b2_tbl[uv][b1][b2];
8574703203dSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
8584703203dSis 		return ((size_t)sz);
8594703203dSis 
8604703203dSis 	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
8614703203dSis 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
8624703203dSis 		return ((size_t)sz);
8634703203dSis 
8644703203dSis 	/*
8654703203dSis 	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
8664703203dSis 	 * which is 0x8000, this means we couldn't fit the mappings into
8674703203dSis 	 * the cardinality of a unsigned byte.
8684703203dSis 	 */
8694703203dSis 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
8704703203dSis 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
8714703203dSis 		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
8724703203dSis 		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
8734703203dSis 	} else {
8744703203dSis 		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
8754703203dSis 		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
8764703203dSis 	}
8774703203dSis 
8784703203dSis 	/* This also means there wasn't any matching decomposition. */
8794703203dSis 	if (start_id >= end_id)
8804703203dSis 		return ((size_t)sz);
8814703203dSis 
8824703203dSis 	/*
8834703203dSis 	 * The final table for decomposition mappings has three types of
8844703203dSis 	 * byte sequences depending on whether a mapping is for compatibility
8854703203dSis 	 * decomposition, canonical decomposition, or both like the following:
8864703203dSis 	 *
8874703203dSis 	 * (1) Compatibility decomposition mappings:
8884703203dSis 	 *
8894703203dSis 	 *	+---+---+-...-+---+
8904703203dSis 	 *	| B0| B1| ... | Bm|
8914703203dSis 	 *	+---+---+-...-+---+
8924703203dSis 	 *
8934703203dSis 	 *	The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH).
8944703203dSis 	 *
8954703203dSis 	 * (2) Canonical decomposition mappings:
8964703203dSis 	 *
8974703203dSis 	 *	+---+---+---+-...-+---+
8984703203dSis 	 *	| T | b0| b1| ... | bn|
8994703203dSis 	 *	+---+---+---+-...-+---+
9004703203dSis 	 *
9014703203dSis 	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
9024703203dSis 	 *
9034703203dSis 	 * (3) Both mappings:
9044703203dSis 	 *
9054703203dSis 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
9064703203dSis 	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
9074703203dSis 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
9084703203dSis 	 *
9094703203dSis 	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
9104703203dSis 	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
9114703203dSis 	 *	compatibility mapping bytes.
9124703203dSis 	 *
9134703203dSis 	 * Note that compatibility decomposition means doing recursive
9144703203dSis 	 * decompositions using both compatibility decomposition mappings and
9154703203dSis 	 * canonical decomposition mappings. On the other hand, canonical
9164703203dSis 	 * decomposition means doing recursive decompositions using only
9174703203dSis 	 * canonical decomposition mappings. Since the table we have has gone
9184703203dSis 	 * through the recursions already, we do not need to do so during
9194703203dSis 	 * runtime, i.e., the table has been completely flattened out
9204703203dSis 	 * already.
9214703203dSis 	 */
9224703203dSis 
9234703203dSis 	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
9244703203dSis 
9254703203dSis 	/* Get the type, T, of the byte sequence. */
9264703203dSis 	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
9274703203dSis 
9284703203dSis 	/*
9294703203dSis 	 * If necessary, adjust start_id, end_id, or both. Note that if
9304703203dSis 	 * this is compatibility decomposition mapping, there is no
9314703203dSis 	 * adjustment.
9324703203dSis 	 */
9334703203dSis 	if (canonical_decomposition) {
9344703203dSis 		/* Is the mapping only for compatibility decomposition? */
9354703203dSis 		if (b1 < U8_DECOMP_BOTH)
9364703203dSis 			return ((size_t)sz);
9374703203dSis 
9384703203dSis 		start_id++;
9394703203dSis 
9404703203dSis 		if (b1 == U8_DECOMP_BOTH) {
9414703203dSis 			end_id = start_id +
9424703203dSis 			    u8_decomp_final_tbl[uv][b3_base + start_id];
9434703203dSis 			start_id++;
9444703203dSis 		}
9454703203dSis 	} else {
9464703203dSis 		/*
9474703203dSis 		 * Unless this is a compatibility decomposition mapping,
9484703203dSis 		 * we adjust the start_id.
9494703203dSis 		 */
9504703203dSis 		if (b1 == U8_DECOMP_BOTH) {
9514703203dSis 			start_id++;
9524703203dSis 			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
9534703203dSis 		} else if (b1 == U8_DECOMP_CANONICAL) {
9544703203dSis 			start_id++;
9554703203dSis 		}
9564703203dSis 	}
9574703203dSis 
9584703203dSis 	for (i = 0; start_id < end_id; start_id++)
9594703203dSis 		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
9604703203dSis 	u8s[i] = '\0';
9614703203dSis 
9624703203dSis 	return (i);
9634703203dSis }
9644703203dSis 
9654703203dSis /*
9664703203dSis  * The find_composition_start() function uses the character bytes given and
9674703203dSis  * find out the matching composition mappings if any and return the address
9684703203dSis  * to the composition mappings as explained in the do_composition().
9694703203dSis  */
9704703203dSis static uchar_t *
find_composition_start(size_t uv,uchar_t * s,size_t sz)9714703203dSis find_composition_start(size_t uv, uchar_t *s, size_t sz)
9724703203dSis {
9734703203dSis 	uint16_t b1 = 0;
9744703203dSis 	uint16_t b2 = 0;
9754703203dSis 	uint16_t b3 = 0;
9764703203dSis 	uint16_t b3_tbl;
9774703203dSis 	uint16_t b3_base;
9784703203dSis 	uint16_t b4 = 0;
9794703203dSis 	size_t start_id;
9804703203dSis 	size_t end_id;
9814703203dSis 
9824703203dSis 	if (sz == 1) {
9834703203dSis 		b4 = s[0];
9844703203dSis 	} else if (sz == 2) {
9854703203dSis 		b3 = s[0];
9864703203dSis 		b4 = s[1];
9874703203dSis 	} else if (sz == 3) {
9884703203dSis 		b2 = s[0];
9894703203dSis 		b3 = s[1];
9904703203dSis 		b4 = s[2];
9914703203dSis 	} else if (sz == 4) {
9924703203dSis 		b1 = s[0];
9934703203dSis 		b2 = s[1];
9944703203dSis 		b3 = s[2];
9954703203dSis 		b4 = s[3];
9964703203dSis 	} else {
9974703203dSis 		/*
9984703203dSis 		 * This is a fallback and should not happen if the function
9994703203dSis 		 * was called properly.
10004703203dSis 		 */
10014703203dSis 		return (NULL);
10024703203dSis 	}
10034703203dSis 
10044703203dSis 	b1 = u8_composition_b1_tbl[uv][b1];
10054703203dSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
10064703203dSis 		return (NULL);
10074703203dSis 
10084703203dSis 	b2 = u8_composition_b2_tbl[uv][b1][b2];
10094703203dSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
10104703203dSis 		return (NULL);
10114703203dSis 
10124703203dSis 	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
10134703203dSis 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
10144703203dSis 		return (NULL);
10154703203dSis 
10164703203dSis 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
10174703203dSis 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
10184703203dSis 		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
10194703203dSis 		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
10204703203dSis 	} else {
10214703203dSis 		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
10224703203dSis 		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
10234703203dSis 	}
10244703203dSis 
10254703203dSis 	if (start_id >= end_id)
10264703203dSis 		return (NULL);
10274703203dSis 
10284703203dSis 	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
10294703203dSis 
10304703203dSis 	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
10314703203dSis }
10324703203dSis 
10334703203dSis /*
10344703203dSis  * The blocked() function checks on the combining class values of previous
10354703203dSis  * characters in this sequence and return whether it is blocked or not.
10364703203dSis  */
10374703203dSis static boolean_t
blocked(uchar_t * comb_class,size_t last)10384703203dSis blocked(uchar_t *comb_class, size_t last)
10394703203dSis {
10404703203dSis 	uchar_t my_comb_class;
10414703203dSis 	size_t i;
10424703203dSis 
10434703203dSis 	my_comb_class = comb_class[last];
10444703203dSis 	for (i = 1; i < last; i++)
10454703203dSis 		if (comb_class[i] >= my_comb_class ||
10464703203dSis 		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
10474703203dSis 			return (B_TRUE);
10484703203dSis 
10494703203dSis 	return (B_FALSE);
10504703203dSis }
10514703203dSis 
10524703203dSis /*
10534703203dSis  * The do_composition() reads the character string pointed by 's' and
10544703203dSis  * do necessary canonical composition and then copy over the result back to
10554703203dSis  * the 's'.
10564703203dSis  *
10574703203dSis  * The input argument 's' cannot contain more than 32 characters.
10584703203dSis  */
10594703203dSis static size_t
do_composition(size_t uv,uchar_t * s,uchar_t * comb_class,uchar_t * start,uchar_t * disp,size_t last,uchar_t ** os,uchar_t * oslast)10604703203dSis do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
10614703203dSis     uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
10624703203dSis {
10634703203dSis 	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
10644703203dSis 	uchar_t tc[U8_MB_CUR_MAX];
10654703203dSis 	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
10664703203dSis 	size_t saved_marks_count;
10674703203dSis 	uchar_t *p;
10684703203dSis 	uchar_t *saved_p;
10694703203dSis 	uchar_t *q;
10704703203dSis 	size_t i;
10714703203dSis 	size_t saved_i;
10724703203dSis 	size_t j;
10734703203dSis 	size_t k;
10744703203dSis 	size_t l;
10754703203dSis 	size_t C;
10764703203dSis 	size_t saved_l;
10774703203dSis 	size_t size;
10784703203dSis 	uint32_t u1;
10794703203dSis 	uint32_t u2;
10804703203dSis 	boolean_t match_not_found = B_TRUE;
10814703203dSis 
10824703203dSis 	/*
10834703203dSis 	 * This should never happen unless the callers are doing some strange
10844703203dSis 	 * and unexpected things.
10854703203dSis 	 *
10864703203dSis 	 * The "last" is the index pointing to the last character not last + 1.
10874703203dSis 	 */
10884703203dSis 	if (last >= U8_MAX_CHARS_A_SEQ)
10894703203dSis 		last = U8_UPPER_LIMIT_IN_A_SEQ;
10904703203dSis 
10914703203dSis 	for (i = l = 0; i <= last; i++) {
10924703203dSis 		/*
10934703203dSis 		 * The last or any non-Starters at the beginning, we don't
10944703203dSis 		 * have any chance to do composition and so we just copy them
10954703203dSis 		 * to the temporary buffer.
10964703203dSis 		 */
10974703203dSis 		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
10984703203dSis SAVE_THE_CHAR:
10994703203dSis 			p = s + start[i];
11004703203dSis 			size = disp[i];
11014703203dSis 			for (k = 0; k < size; k++)
11024703203dSis 				t[l++] = *p++;
11034703203dSis 			continue;
11044703203dSis 		}
11054703203dSis 
11064703203dSis 		/*
11074703203dSis 		 * If this could be a start of Hangul Jamos, then, we try to
11084703203dSis 		 * conjoin them.
11094703203dSis 		 */
11104703203dSis 		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
11114703203dSis 			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
11124703203dSis 			    s[start[i] + 1], s[start[i] + 2]);
11134703203dSis 			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
11144703203dSis 			    s[start[i] + 4], s[start[i] + 5]);
11154703203dSis 
11164703203dSis 			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
11174703203dSis 				u1 -= U8_HANGUL_JAMO_L_FIRST;
11184703203dSis 				u2 -= U8_HANGUL_JAMO_V_FIRST;
11194703203dSis 				u1 = U8_HANGUL_SYL_FIRST +
11204703203dSis 				    (u1 * U8_HANGUL_V_COUNT + u2) *
11214703203dSis 				    U8_HANGUL_T_COUNT;
11224703203dSis 
11234703203dSis 				i += 2;
11244703203dSis 				if (i <= last) {
11254703203dSis 					U8_PUT_3BYTES_INTO_UTF32(u2,
11264703203dSis 					    s[start[i]], s[start[i] + 1],
11274703203dSis 					    s[start[i] + 2]);
11284703203dSis 
11294703203dSis 					if (U8_HANGUL_JAMO_T(u2)) {
11304703203dSis 						u1 += u2 -
11314703203dSis 						    U8_HANGUL_JAMO_T_FIRST;
11324703203dSis 						i++;
11334703203dSis 					}
11344703203dSis 				}
11354703203dSis 
11364703203dSis 				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
11374703203dSis 				i--;
11384703203dSis 				l += 3;
11394703203dSis 				continue;
11404703203dSis 			}
11414703203dSis 		}
11424703203dSis 
11434703203dSis 		/*
11444703203dSis 		 * Let's then find out if this Starter has composition
11454703203dSis 		 * mapping.
11464703203dSis 		 */
11474703203dSis 		p = find_composition_start(uv, s + start[i], disp[i]);
11484703203dSis 		if (p == NULL)
11494703203dSis 			goto SAVE_THE_CHAR;
11504703203dSis 
11514703203dSis 		/*
11524703203dSis 		 * We have a Starter with composition mapping and the next
11534703203dSis 		 * character is a non-Starter. Let's try to find out if
11544703203dSis 		 * we can do composition.
11554703203dSis 		 */
11564703203dSis 
11574703203dSis 		saved_p = p;
11584703203dSis 		saved_i = i;
11594703203dSis 		saved_l = l;
11604703203dSis 		saved_marks_count = 0;
11614703203dSis 
11624703203dSis TRY_THE_NEXT_MARK:
11634703203dSis 		q = s + start[++i];
11644703203dSis 		size = disp[i];
11654703203dSis 
11664703203dSis 		/*
11674703203dSis 		 * The next for() loop compares the non-Starter pointed by
11684703203dSis 		 * 'q' with the possible (joinable) characters pointed by 'p'.
11694703203dSis 		 *
11704703203dSis 		 * The composition final table entry pointed by the 'p'
11714703203dSis 		 * looks like the following:
11724703203dSis 		 *
11734703203dSis 		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
11744703203dSis 		 * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
11754703203dSis 		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
11764703203dSis 		 *
11774703203dSis 		 * where C is the count byte indicating the number of
11784703203dSis 		 * mapping pairs where each pair would be look like
11794703203dSis 		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
11804703203dSis 		 * character of a canonical decomposition and the B0-Bm are
11814703203dSis 		 * the bytes of a matching composite character. The F is
11824703203dSis 		 * a filler byte after each character as the separator.
11834703203dSis 		 */
11844703203dSis 
11854703203dSis 		match_not_found = B_TRUE;
11864703203dSis 
11874703203dSis 		for (C = *p++; C > 0; C--) {
11884703203dSis 			for (k = 0; k < size; p++, k++)
11894703203dSis 				if (*p != q[k])
11904703203dSis 					break;
11914703203dSis 
11924703203dSis 			/* Have we found it? */
11934703203dSis 			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
11944703203dSis 				match_not_found = B_FALSE;
11954703203dSis 
11964703203dSis 				l = saved_l;
11974703203dSis 
11984703203dSis 				while (*++p != U8_TBL_ELEMENT_FILLER)
11994703203dSis 					t[l++] = *p;
12004703203dSis 
12014703203dSis 				break;
12024703203dSis 			}
12034703203dSis 
12044703203dSis 			/* We didn't find; skip to the next pair. */
12054703203dSis 			if (*p != U8_TBL_ELEMENT_FILLER)
12064703203dSis 				while (*++p != U8_TBL_ELEMENT_FILLER)
12074703203dSis 					;
12084703203dSis 			while (*++p != U8_TBL_ELEMENT_FILLER)
12094703203dSis 				;
12104703203dSis 			p++;
12114703203dSis 		}
12124703203dSis 
12134703203dSis 		/*
12144703203dSis 		 * If there was no match, we will need to save the combining
12154703203dSis 		 * mark for later appending. After that, if the next one
12164703203dSis 		 * is a non-Starter and not blocked, then, we try once
12174703203dSis 		 * again to do composition with the next non-Starter.
12184703203dSis 		 *
12194703203dSis 		 * If there was no match and this was a Starter, then,
12204703203dSis 		 * this is a new start.
12214703203dSis 		 *
12224703203dSis 		 * If there was a match and a composition done and we have
12234703203dSis 		 * more to check on, then, we retrieve a new composition final
12244703203dSis 		 * table entry for the composite and then try to do the
12254703203dSis 		 * composition again.
12264703203dSis 		 */
12274703203dSis 
12284703203dSis 		if (match_not_found) {
12294703203dSis 			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
12304703203dSis 				i--;
12314703203dSis 				goto SAVE_THE_CHAR;
12324703203dSis 			}
12334703203dSis 
12344703203dSis 			saved_marks[saved_marks_count++] = i;
12354703203dSis 		}
12364703203dSis 
12374703203dSis 		if (saved_l == l) {
12384703203dSis 			while (i < last) {
12394703203dSis 				if (blocked(comb_class, i + 1))
12404703203dSis 					saved_marks[saved_marks_count++] = ++i;
12414703203dSis 				else
12424703203dSis 					break;
12434703203dSis 			}
12444703203dSis 			if (i < last) {
12454703203dSis 				p = saved_p;
12464703203dSis 				goto TRY_THE_NEXT_MARK;
12474703203dSis 			}
12484703203dSis 		} else if (i < last) {
12494703203dSis 			p = find_composition_start(uv, t + saved_l,
12504703203dSis 			    l - saved_l);
12514703203dSis 			if (p != NULL) {
12524703203dSis 				saved_p = p;
12534703203dSis 				goto TRY_THE_NEXT_MARK;
12544703203dSis 			}
12554703203dSis 		}
12564703203dSis 
12574703203dSis 		/*
12584703203dSis 		 * There is no more composition possible.
12594703203dSis 		 *
12604703203dSis 		 * If there was no composition what so ever then we copy
12614703203dSis 		 * over the original Starter and then append any non-Starters
12624703203dSis 		 * remaining at the target string sequentially after that.
12634703203dSis 		 */
12644703203dSis 
12654703203dSis 		if (saved_l == l) {
12664703203dSis 			p = s + start[saved_i];
12674703203dSis 			size = disp[saved_i];
12684703203dSis 			for (j = 0; j < size; j++)
12694703203dSis 				t[l++] = *p++;
12704703203dSis 		}
12714703203dSis 
12724703203dSis 		for (k = 0; k < saved_marks_count; k++) {
12734703203dSis 			p = s + start[saved_marks[k]];
12744703203dSis 			size = disp[saved_marks[k]];
12754703203dSis 			for (j = 0; j < size; j++)
12764703203dSis 				t[l++] = *p++;
12774703203dSis 		}
12784703203dSis 	}
12794703203dSis 
12804703203dSis 	/*
12814703203dSis 	 * If the last character is a Starter and if we have a character
12824703203dSis 	 * (possibly another Starter) that can be turned into a composite,
12834703203dSis 	 * we do so and we do so until there is no more of composition
12844703203dSis 	 * possible.
12854703203dSis 	 */
12864703203dSis 	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
12874703203dSis 		p = *os;
12884703203dSis 		saved_l = l - disp[last];
12894703203dSis 
12904703203dSis 		while (p < oslast) {
1291*f137b22eSDan McDonald 			int8_t number_of_bytes = u8_number_of_bytes[*p];
1292*f137b22eSDan McDonald 
1293*f137b22eSDan McDonald 			if (number_of_bytes <= 1)
1294*f137b22eSDan McDonald 				break;
1295*f137b22eSDan McDonald 			size = number_of_bytes;
1296*f137b22eSDan McDonald 			if ((p + size) > oslast)
12974703203dSis 				break;
12984703203dSis 
12994703203dSis 			saved_p = p;
13004703203dSis 
13014703203dSis 			for (i = 0; i < size; i++)
13024703203dSis 				tc[i] = *p++;
13034703203dSis 
13044703203dSis 			q = find_composition_start(uv, t + saved_l,
13054703203dSis 			    l - saved_l);
13064703203dSis 			if (q == NULL) {
13074703203dSis 				p = saved_p;
13084703203dSis 				break;
13094703203dSis 			}
13104703203dSis 
13114703203dSis 			match_not_found = B_TRUE;
13124703203dSis 
13134703203dSis 			for (C = *q++; C > 0; C--) {
13144703203dSis 				for (k = 0; k < size; q++, k++)
13154703203dSis 					if (*q != tc[k])
13164703203dSis 						break;
13174703203dSis 
13184703203dSis 				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
13194703203dSis 					match_not_found = B_FALSE;
13204703203dSis 
13214703203dSis 					l = saved_l;
13224703203dSis 
13234703203dSis 					while (*++q != U8_TBL_ELEMENT_FILLER) {
13244703203dSis 						/*
13254703203dSis 						 * This is practically
13264703203dSis 						 * impossible but we don't
13274703203dSis 						 * want to take any chances.
13284703203dSis 						 */
13294703203dSis 						if (l >=
13304703203dSis 						    U8_STREAM_SAFE_TEXT_MAX) {
13314703203dSis 							p = saved_p;
13324703203dSis 							goto SAFE_RETURN;
13334703203dSis 						}
13344703203dSis 						t[l++] = *q;
13354703203dSis 					}
13364703203dSis 
13374703203dSis 					break;
13384703203dSis 				}
13394703203dSis 
13404703203dSis 				if (*q != U8_TBL_ELEMENT_FILLER)
13414703203dSis 					while (*++q != U8_TBL_ELEMENT_FILLER)
13424703203dSis 						;
13434703203dSis 				while (*++q != U8_TBL_ELEMENT_FILLER)
13444703203dSis 					;
13454703203dSis 				q++;
13464703203dSis 			}
13474703203dSis 
13484703203dSis 			if (match_not_found) {
13494703203dSis 				p = saved_p;
13504703203dSis 				break;
13514703203dSis 			}
13524703203dSis 		}
13534703203dSis SAFE_RETURN:
13544703203dSis 		*os = p;
13554703203dSis 	}
13564703203dSis 
13574703203dSis 	/*
13584703203dSis 	 * Now we copy over the temporary string to the target string.
13594703203dSis 	 * Since composition always reduces the number of characters or
13604703203dSis 	 * the number of characters stay, we don't need to worry about
13614703203dSis 	 * the buffer overflow here.
13624703203dSis 	 */
13634703203dSis 	for (i = 0; i < l; i++)
13644703203dSis 		s[i] = t[i];
13654703203dSis 	s[l] = '\0';
13664703203dSis 
13674703203dSis 	return (l);
13684703203dSis }
13694703203dSis 
/*
 * The collect_a_seq() function checks on the given string s, collects
 * a sequence of characters at u8s, and returns the sequence. While it
 * collects a sequence, it also applies case conversion, canonical or
 * compatibility decomposition, canonical composition, or some or all of
 * them, in that order.
 *
 * The collected sequence cannot be bigger than 32 characters: if it has
 * more than 31 characters, the sequence is terminated with a U+034F
 * COMBINING GRAPHEME JOINER (CGJ) character and so turned into
 * a Stream-Safe Text. The collected sequence is always terminated with
 * a null byte. The return value is the byte length of the sequence, which
 * can be 0, and it does not include the terminating null byte.
 */
13854703203dSis static size_t
collect_a_seq(size_t uv,uchar_t * u8s,uchar_t ** source,uchar_t * slast,boolean_t is_it_toupper,boolean_t is_it_tolower,boolean_t canonical_decomposition,boolean_t compatibility_decomposition,boolean_t canonical_composition,int * errnum,u8_normalization_states_t * state)13864703203dSis collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
13874703203dSis     boolean_t is_it_toupper,
13884703203dSis     boolean_t is_it_tolower,
13894703203dSis     boolean_t canonical_decomposition,
13904703203dSis     boolean_t compatibility_decomposition,
13914703203dSis     boolean_t canonical_composition,
139285bb5f1dSis     int *errnum, u8_normalization_states_t *state)
13934703203dSis {
13944703203dSis 	uchar_t *s;
13954703203dSis 	int sz;
13964703203dSis 	int saved_sz;
13974703203dSis 	size_t i;
13984703203dSis 	size_t j;
13994703203dSis 	size_t k;
14004703203dSis 	size_t l;
14014703203dSis 	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
14024703203dSis 	uchar_t disp[U8_MAX_CHARS_A_SEQ];
14034703203dSis 	uchar_t start[U8_MAX_CHARS_A_SEQ];
14044703203dSis 	uchar_t u8t[U8_MB_CUR_MAX];
14054703203dSis 	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
14064703203dSis 	uchar_t tc;
14074703203dSis 	size_t last;
14084703203dSis 	size_t saved_last;
14094703203dSis 	uint32_t u1;
14104703203dSis 
14114703203dSis 	/*
14124703203dSis 	 * Save the source string pointer which we will return a changed
14134703203dSis 	 * pointer if we do processing.
14144703203dSis 	 */
14154703203dSis 	s = *source;
14164703203dSis 
14174703203dSis 	/*
14184703203dSis 	 * The following is a fallback for just in case callers are not
14194703203dSis 	 * checking the string boundaries before the calling.
14204703203dSis 	 */
14214703203dSis 	if (s >= slast) {
14224703203dSis 		u8s[0] = '\0';
14234703203dSis 
14244703203dSis 		return (0);
14254703203dSis 	}
14264703203dSis 
14274703203dSis 	/*
14284703203dSis 	 * As the first thing, let's collect a character and do case
14294703203dSis 	 * conversion if necessary.
14304703203dSis 	 */
14314703203dSis 
14324703203dSis 	sz = u8_number_of_bytes[*s];
14334703203dSis 
14344703203dSis 	if (sz < 0) {
143585bb5f1dSis 		*errnum = EILSEQ;
14364703203dSis 
14374703203dSis 		u8s[0] = *s++;
14384703203dSis 		u8s[1] = '\0';
14394703203dSis 
14404703203dSis 		*source = s;
14414703203dSis 
14424703203dSis 		return (1);
14434703203dSis 	}
14444703203dSis 
14454703203dSis 	if (sz == 1) {
14464703203dSis 		if (is_it_toupper)
14474703203dSis 			u8s[0] = U8_ASCII_TOUPPER(*s);
14484703203dSis 		else if (is_it_tolower)
14494703203dSis 			u8s[0] = U8_ASCII_TOLOWER(*s);
14504703203dSis 		else
14514703203dSis 			u8s[0] = *s;
14524703203dSis 		s++;
14534703203dSis 		u8s[1] = '\0';
14544703203dSis 	} else if ((s + sz) > slast) {
145585bb5f1dSis 		*errnum = EINVAL;
14564703203dSis 
14574703203dSis 		for (i = 0; s < slast; )
14584703203dSis 			u8s[i++] = *s++;
14594703203dSis 		u8s[i] = '\0';
14604703203dSis 
14614703203dSis 		*source = s;
14624703203dSis 
14634703203dSis 		return (i);
14644703203dSis 	} else {
14654703203dSis 		if (is_it_toupper || is_it_tolower) {
14664703203dSis 			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
14674703203dSis 			s += sz;
14684703203dSis 			sz = i;
14694703203dSis 		} else {
14704703203dSis 			for (i = 0; i < sz; )
14714703203dSis 				u8s[i++] = *s++;
14724703203dSis 			u8s[i] = '\0';
14734703203dSis 		}
14744703203dSis 	}
14754703203dSis 
14764703203dSis 	/*
14774703203dSis 	 * And then canonical/compatibility decomposition followed by
14784703203dSis 	 * an optional canonical composition. Please be noted that
14794703203dSis 	 * canonical composition is done only when a decomposition is
14804703203dSis 	 * done.
14814703203dSis 	 */
14824703203dSis 	if (canonical_decomposition || compatibility_decomposition) {
14834703203dSis 		if (sz == 1) {
14844703203dSis 			*state = U8_STATE_START;
14854703203dSis 
14864703203dSis 			saved_sz = 1;
14874703203dSis 
14884703203dSis 			comb_class[0] = 0;
14894703203dSis 			start[0] = 0;
14904703203dSis 			disp[0] = 1;
14914703203dSis 
14924703203dSis 			last = 1;
14934703203dSis 		} else {
14944703203dSis 			saved_sz = do_decomp(uv, u8s, u8s, sz,
14954703203dSis 			    canonical_decomposition, state);
14964703203dSis 
14974703203dSis 			last = 0;
14984703203dSis 
14994703203dSis 			for (i = 0; i < saved_sz; ) {
15004703203dSis 				sz = u8_number_of_bytes[u8s[i]];
15014703203dSis 
15024703203dSis 				comb_class[last] = combining_class(uv,
15034703203dSis 				    u8s + i, sz);
15044703203dSis 				start[last] = i;
15054703203dSis 				disp[last] = sz;
15064703203dSis 
15074703203dSis 				last++;
15084703203dSis 				i += sz;
15094703203dSis 			}
15104703203dSis 
15114703203dSis 			/*
15124703203dSis 			 * Decomposition yields various Hangul related
15134703203dSis 			 * states but not on combining marks. We need to
15144703203dSis 			 * find that out here by checking the last
15154703203dSis 			 * character.
15164703203dSis 			 */
15174703203dSis 			if (*state == U8_STATE_START) {
15184703203dSis 				if (comb_class[last - 1])
15194703203dSis 					*state = U8_STATE_COMBINING_MARK;
15204703203dSis 			}
15214703203dSis 		}
15224703203dSis 
15234703203dSis 		saved_last = last;
15244703203dSis 
15254703203dSis 		while (s < slast) {
15264703203dSis 			sz = u8_number_of_bytes[*s];
15274703203dSis 
15284703203dSis 			/*
15294703203dSis 			 * If this is an illegal character, an incomplete
15304703203dSis 			 * character, or a 7-bit ASCII Starter character,
15314703203dSis 			 * then we have collected a sequence; break and let
15324703203dSis 			 * the next call deal with the two cases.
15334703203dSis 			 *
15344703203dSis 			 * Note that this is okay only if you are using this
15354703203dSis 			 * function with a fixed length string, not on
15364703203dSis 			 * a buffer with multiple calls of one chunk at a time.
15374703203dSis 			 */
15384703203dSis 			if (sz <= 1) {
15394703203dSis 				break;
15404703203dSis 			} else if ((s + sz) > slast) {
15414703203dSis 				break;
15424703203dSis 			} else {
15434703203dSis 				/*
15444703203dSis 				 * If the previous character was a Hangul Jamo
15454703203dSis 				 * and this character is a Hangul Jamo that
15464703203dSis 				 * can be conjoined, we collect the Jamo.
15474703203dSis 				 */
15484703203dSis 				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
15494703203dSis 					U8_PUT_3BYTES_INTO_UTF32(u1,
15504703203dSis 					    *s, *(s + 1), *(s + 2));
15514703203dSis 
15524703203dSis 					if (U8_HANGUL_COMPOSABLE_L_V(*state,
15534703203dSis 					    u1)) {
15544703203dSis 						i = 0;
15554703203dSis 						*state = U8_STATE_HANGUL_LV;
15564703203dSis 						goto COLLECT_A_HANGUL;
15574703203dSis 					}
15584703203dSis 
15594703203dSis 					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
15604703203dSis 					    u1)) {
15614703203dSis 						i = 0;
15624703203dSis 						*state = U8_STATE_HANGUL_LVT;
15634703203dSis 						goto COLLECT_A_HANGUL;
15644703203dSis 					}
15654703203dSis 				}
15664703203dSis 
15674703203dSis 				/*
15684703203dSis 				 * Regardless of whatever it was, if this is
15694703203dSis 				 * a Starter, we don't collect the character
15704703203dSis 				 * since that's a new start and we will deal
15714703203dSis 				 * with it at the next time.
15724703203dSis 				 */
15734703203dSis 				i = combining_class(uv, s, sz);
15744703203dSis 				if (i == U8_COMBINING_CLASS_STARTER)
15754703203dSis 					break;
15764703203dSis 
15774703203dSis 				/*
15784703203dSis 				 * We know the current character is a combining
15794703203dSis 				 * mark. If the previous character wasn't
15804703203dSis 				 * a Starter (not Hangul) or a combining mark,
15814703203dSis 				 * then, we don't collect this combining mark.
15824703203dSis 				 */
15834703203dSis 				if (*state != U8_STATE_START &&
15844703203dSis 				    *state != U8_STATE_COMBINING_MARK)
15854703203dSis 					break;
15864703203dSis 
15874703203dSis 				*state = U8_STATE_COMBINING_MARK;
15884703203dSis COLLECT_A_HANGUL:
15894703203dSis 				/*
15904703203dSis 				 * If we collected a Starter and combining
15914703203dSis 				 * marks up to 30, i.e., total 31 characters,
15924703203dSis 				 * then, we terminate this degenerately long
15934703203dSis 				 * combining sequence with a U+034F COMBINING
15944703203dSis 				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
15954703203dSis 				 * UTF-8 and turn this into a Stream-Safe
15964703203dSis 				 * Text. This will be extremely rare but
15974703203dSis 				 * possible.
15984703203dSis 				 *
15994703203dSis 				 * The following will also guarantee that
16004703203dSis 				 * we are not writing more than 32 characters
16014703203dSis 				 * plus a NULL at u8s[].
16024703203dSis 				 */
16034703203dSis 				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
16044703203dSis TURN_STREAM_SAFE:
16054703203dSis 					*state = U8_STATE_START;
16064703203dSis 					comb_class[last] = 0;
16074703203dSis 					start[last] = saved_sz;
16084703203dSis 					disp[last] = 2;
16094703203dSis 					last++;
16104703203dSis 
16114703203dSis 					u8s[saved_sz++] = 0xCD;
16124703203dSis 					u8s[saved_sz++] = 0x8F;
16134703203dSis 
16144703203dSis 					break;
16154703203dSis 				}
16164703203dSis 
16174703203dSis 				/*
16184703203dSis 				 * Some combining marks also do decompose into
16194703203dSis 				 * another combining mark or marks.
16204703203dSis 				 */
16214703203dSis 				if (*state == U8_STATE_COMBINING_MARK) {
16224703203dSis 					k = last;
16234703203dSis 					l = sz;
16244703203dSis 					i = do_decomp(uv, uts, s, sz,
16254703203dSis 					    canonical_decomposition, state);
16264703203dSis 					for (j = 0; j < i; ) {
16274703203dSis 						sz = u8_number_of_bytes[uts[j]];
16284703203dSis 
16294703203dSis 						comb_class[last] =
16304703203dSis 						    combining_class(uv,
16314703203dSis 						    uts + j, sz);
16324703203dSis 						start[last] = saved_sz + j;
16334703203dSis 						disp[last] = sz;
16344703203dSis 
16354703203dSis 						last++;
16364703203dSis 						if (last >=
16374703203dSis 						    U8_UPPER_LIMIT_IN_A_SEQ) {
16384703203dSis 							last = k;
16394703203dSis 							goto TURN_STREAM_SAFE;
16404703203dSis 						}
16414703203dSis 						j += sz;
16424703203dSis 					}
16434703203dSis 
16444703203dSis 					*state = U8_STATE_COMBINING_MARK;
16454703203dSis 					sz = i;
16464703203dSis 					s += l;
16474703203dSis 
16484703203dSis 					for (i = 0; i < sz; i++)
16494703203dSis 						u8s[saved_sz++] = uts[i];
16504703203dSis 				} else {
16514703203dSis 					comb_class[last] = i;
16524703203dSis 					start[last] = saved_sz;
16534703203dSis 					disp[last] = sz;
16544703203dSis 					last++;
16554703203dSis 
16564703203dSis 					for (i = 0; i < sz; i++)
16574703203dSis 						u8s[saved_sz++] = *s++;
16584703203dSis 				}
16594703203dSis 
16604703203dSis 				/*
16614703203dSis 				 * If this is U+0345 COMBINING GREEK
16624703203dSis 				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
16634703203dSis 				 * iota subscript, and need to be converted to
16644703203dSis 				 * uppercase letter, convert it to U+0399 GREEK
16654703203dSis 				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
16664703203dSis 				 * i.e., convert to capital adscript form as
16674703203dSis 				 * specified in the Unicode standard.
16684703203dSis 				 *
16694703203dSis 				 * This is the only special case of (ambiguous)
16704703203dSis 				 * case conversion at combining marks and
16714703203dSis 				 * probably the standard will never have
16724703203dSis 				 * anything similar like this in future.
16734703203dSis 				 */
16744703203dSis 				if (is_it_toupper && sz >= 2 &&
16754703203dSis 				    u8s[saved_sz - 2] == 0xCD &&
16764703203dSis 				    u8s[saved_sz - 1] == 0x85) {
16774703203dSis 					u8s[saved_sz - 2] = 0xCE;
16784703203dSis 					u8s[saved_sz - 1] = 0x99;
16794703203dSis 				}
16804703203dSis 			}
16814703203dSis 		}
16824703203dSis 
16834703203dSis 		/*
16844703203dSis 		 * Let's try to ensure a canonical ordering for the collected
16854703203dSis 		 * combining marks. We do this only if we have collected
16864703203dSis 		 * at least one more non-Starter. (The decomposition mapping
16874703203dSis 		 * data tables have fully (and recursively) expanded and
16884703203dSis 		 * canonically ordered decompositions.)
16894703203dSis 		 *
16904703203dSis 		 * The U8_SWAP_COMB_MARKS() convenience macro has some
16914703203dSis 		 * assumptions and we are meeting the assumptions.
16924703203dSis 		 */
16934703203dSis 		last--;
16944703203dSis 		if (last >= saved_last) {
16954703203dSis 			for (i = 0; i < last; i++)
16964703203dSis 				for (j = last; j > i; j--)
16974703203dSis 					if (comb_class[j] &&
16984703203dSis 					    comb_class[j - 1] > comb_class[j]) {
16994703203dSis 						U8_SWAP_COMB_MARKS(j - 1, j);
17004703203dSis 					}
17014703203dSis 		}
17024703203dSis 
17034703203dSis 		*source = s;
17044703203dSis 
17054703203dSis 		if (! canonical_composition) {
17064703203dSis 			u8s[saved_sz] = '\0';
17074703203dSis 			return (saved_sz);
17084703203dSis 		}
17094703203dSis 
17104703203dSis 		/*
17114703203dSis 		 * Now do the canonical composition. Note that we do this
17124703203dSis 		 * only after a canonical or compatibility decomposition to
17134703203dSis 		 * finish up NFC or NFKC.
17144703203dSis 		 */
17154703203dSis 		sz = do_composition(uv, u8s, comb_class, start, disp, last,
17164703203dSis 		    &s, slast);
17174703203dSis 	}
17184703203dSis 
17194703203dSis 	*source = s;
17204703203dSis 
17214703203dSis 	return ((size_t)sz);
17224703203dSis }
17234703203dSis 
17244703203dSis /*
17254703203dSis  * The do_norm_compare() function does string comparion based on Unicode
17264703203dSis  * simple case mappings and Unicode Normalization definitions.
17274703203dSis  *
17284703203dSis  * It does so by collecting a sequence of character at a time and comparing
17294703203dSis  * the collected sequences from the strings.
17304703203dSis  *
17314703203dSis  * The meanings on the return values are the same as the usual strcmp().
17324703203dSis  */
17334703203dSis static int
do_norm_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,int flag,int * errnum)17344703203dSis do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
173585bb5f1dSis     int flag, int *errnum)
17364703203dSis {
17374703203dSis 	int result;
17384703203dSis 	size_t sz1;
17394703203dSis 	size_t sz2;
17404703203dSis 	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
17414703203dSis 	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
17424703203dSis 	uchar_t *s1last;
17434703203dSis 	uchar_t *s2last;
17444703203dSis 	boolean_t is_it_toupper;
17454703203dSis 	boolean_t is_it_tolower;
17464703203dSis 	boolean_t canonical_decomposition;
17474703203dSis 	boolean_t compatibility_decomposition;
17484703203dSis 	boolean_t canonical_composition;
17494703203dSis 	u8_normalization_states_t state;
17504703203dSis 
17514703203dSis 	s1last = s1 + n1;
17524703203dSis 	s2last = s2 + n2;
17534703203dSis 
17544703203dSis 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
17554703203dSis 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
17564703203dSis 	canonical_decomposition = flag & U8_CANON_DECOMP;
17574703203dSis 	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
17584703203dSis 	canonical_composition = flag & U8_CANON_COMP;
17594703203dSis 
17604703203dSis 	while (s1 < s1last && s2 < s2last) {
17614703203dSis 		/*
17624703203dSis 		 * If the current character is a 7-bit ASCII and the last
17634703203dSis 		 * character, or, if the current character and the next
17644703203dSis 		 * character are both some 7-bit ASCII characters then
17654703203dSis 		 * we treat the current character as a sequence.
17664703203dSis 		 *
17674703203dSis 		 * In any other cases, we need to call collect_a_seq().
17684703203dSis 		 */
17694703203dSis 
17704703203dSis 		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
17714703203dSis 		    ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
17724703203dSis 			if (is_it_toupper)
17734703203dSis 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
17744703203dSis 			else if (is_it_tolower)
17754703203dSis 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
17764703203dSis 			else
17774703203dSis 				u8s1[0] = *s1;
17784703203dSis 			u8s1[1] = '\0';
17794703203dSis 			sz1 = 1;
17804703203dSis 			s1++;
17814703203dSis 		} else {
17824703203dSis 			state = U8_STATE_START;
17834703203dSis 			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
17844703203dSis 			    is_it_toupper, is_it_tolower,
17854703203dSis 			    canonical_decomposition,
17864703203dSis 			    compatibility_decomposition,
178785bb5f1dSis 			    canonical_composition, errnum, &state);
17884703203dSis 		}
17894703203dSis 
17904703203dSis 		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
17914703203dSis 		    ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
17924703203dSis 			if (is_it_toupper)
17934703203dSis 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
17944703203dSis 			else if (is_it_tolower)
17954703203dSis 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
17964703203dSis 			else
17974703203dSis 				u8s2[0] = *s2;
17984703203dSis 			u8s2[1] = '\0';
17994703203dSis 			sz2 = 1;
18004703203dSis 			s2++;
18014703203dSis 		} else {
18024703203dSis 			state = U8_STATE_START;
18034703203dSis 			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
18044703203dSis 			    is_it_toupper, is_it_tolower,
18054703203dSis 			    canonical_decomposition,
18064703203dSis 			    compatibility_decomposition,
180785bb5f1dSis 			    canonical_composition, errnum, &state);
18084703203dSis 		}
18094703203dSis 
18104703203dSis 		/*
18114703203dSis 		 * Now compare the two characters. If they are the same,
18124703203dSis 		 * we move on to the next character sequences.
18134703203dSis 		 */
18144703203dSis 		if (sz1 == 1 && sz2 == 1) {
18154703203dSis 			if (*u8s1 > *u8s2)
18164703203dSis 				return (1);
18174703203dSis 			if (*u8s1 < *u8s2)
18184703203dSis 				return (-1);
18194703203dSis 		} else {
18204703203dSis 			result = strcmp((const char *)u8s1, (const char *)u8s2);
18214703203dSis 			if (result != 0)
18224703203dSis 				return (result);
18234703203dSis 		}
18244703203dSis 	}
18254703203dSis 
18264703203dSis 	/*
18274703203dSis 	 * We compared until the end of either or both strings.
18284703203dSis 	 *
18294703203dSis 	 * If we reached to or went over the ends for the both, that means
18304703203dSis 	 * they are the same.
18314703203dSis 	 *
18324703203dSis 	 * If we reached only one end, that means the other string has
18334703203dSis 	 * something which then can be used to determine the return value.
18344703203dSis 	 */
18354703203dSis 	if (s1 >= s1last) {
18364703203dSis 		if (s2 >= s2last)
18374703203dSis 			return (0);
18384703203dSis 		return (-1);
18394703203dSis 	}
18404703203dSis 	return (1);
18414703203dSis }
18424703203dSis 
18434703203dSis /*
18444703203dSis  * The u8_strcmp() function compares two UTF-8 strings quite similar to
18454703203dSis  * the strcmp(). For the comparison, however, Unicode Normalization specific
18464703203dSis  * equivalency and Unicode simple case conversion mappings based equivalency
18474703203dSis  * can be requested and checked against.
18484703203dSis  */
18494703203dSis int
u8_strcmp(const char * s1,const char * s2,size_t n,int flag,size_t uv,int * errnum)18504703203dSis u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
185185bb5f1dSis     int *errnum)
18524703203dSis {
18534703203dSis 	int f;
18544703203dSis 	size_t n1;
18554703203dSis 	size_t n2;
18564703203dSis 
185785bb5f1dSis 	*errnum = 0;
18584703203dSis 
18594703203dSis 	/*
18604703203dSis 	 * Check on the requested Unicode version, case conversion, and
18614703203dSis 	 * normalization flag values.
18624703203dSis 	 */
18634703203dSis 
18644703203dSis 	if (uv > U8_UNICODE_LATEST) {
186585bb5f1dSis 		*errnum = ERANGE;
18664703203dSis 		uv = U8_UNICODE_LATEST;
18674703203dSis 	}
18684703203dSis 
18694703203dSis 	if (flag == 0) {
18704703203dSis 		flag = U8_STRCMP_CS;
18714703203dSis 	} else {
18724703203dSis 		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
18734703203dSis 		    U8_STRCMP_CI_LOWER);
18744703203dSis 		if (f == 0) {
18754703203dSis 			flag |= U8_STRCMP_CS;
18764703203dSis 		} else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
18774703203dSis 		    f != U8_STRCMP_CI_LOWER) {
187885bb5f1dSis 			*errnum = EBADF;
18794703203dSis 			flag = U8_STRCMP_CS;
18804703203dSis 		}
18814703203dSis 
18824703203dSis 		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
18834703203dSis 		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
18844703203dSis 		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
188585bb5f1dSis 			*errnum = EBADF;
18864703203dSis 			flag = U8_STRCMP_CS;
18874703203dSis 		}
18884703203dSis 	}
18894703203dSis 
18904703203dSis 	if (flag == U8_STRCMP_CS) {
18914703203dSis 		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
18924703203dSis 	}
18934703203dSis 
18944703203dSis 	n1 = strlen(s1);
18954703203dSis 	n2 = strlen(s2);
18964703203dSis 	if (n != 0) {
18974703203dSis 		if (n < n1)
18984703203dSis 			n1 = n;
18994703203dSis 		if (n < n2)
19004703203dSis 			n2 = n;
19014703203dSis 	}
19024703203dSis 
19034703203dSis 	/*
19044703203dSis 	 * Simple case conversion can be done much faster and so we do
19054703203dSis 	 * them separately here.
19064703203dSis 	 */
19074703203dSis 	if (flag == U8_STRCMP_CI_UPPER) {
19084703203dSis 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
190985bb5f1dSis 		    n1, n2, B_TRUE, errnum));
19104703203dSis 	} else if (flag == U8_STRCMP_CI_LOWER) {
19114703203dSis 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
191285bb5f1dSis 		    n1, n2, B_FALSE, errnum));
19134703203dSis 	}
19144703203dSis 
19154703203dSis 	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
191685bb5f1dSis 	    flag, errnum));
19174703203dSis }
19184703203dSis 
19194703203dSis size_t
u8_textprep_str(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,size_t unicode_version,int * errnum)19204703203dSis u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
192185bb5f1dSis     int flag, size_t unicode_version, int *errnum)
19224703203dSis {
19234703203dSis 	int f;
19244703203dSis 	int sz;
19254703203dSis 	uchar_t *ib;
19264703203dSis 	uchar_t *ibtail;
19274703203dSis 	uchar_t *ob;
19284703203dSis 	uchar_t *obtail;
19294703203dSis 	boolean_t do_not_ignore_null;
19304703203dSis 	boolean_t do_not_ignore_invalid;
19314703203dSis 	boolean_t is_it_toupper;
19324703203dSis 	boolean_t is_it_tolower;
19334703203dSis 	boolean_t canonical_decomposition;
19344703203dSis 	boolean_t compatibility_decomposition;
19354703203dSis 	boolean_t canonical_composition;
19364703203dSis 	size_t ret_val;
19374703203dSis 	size_t i;
19384703203dSis 	size_t j;
19394703203dSis 	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
19404703203dSis 	u8_normalization_states_t state;
19414703203dSis 
19424703203dSis 	if (unicode_version > U8_UNICODE_LATEST) {
194385bb5f1dSis 		*errnum = ERANGE;
19444703203dSis 		return ((size_t)-1);
19454703203dSis 	}
19464703203dSis 
19474703203dSis 	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
19484703203dSis 	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
194985bb5f1dSis 		*errnum = EBADF;
19504703203dSis 		return ((size_t)-1);
19514703203dSis 	}
19524703203dSis 
19534703203dSis 	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
19544703203dSis 	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
19554703203dSis 	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
195685bb5f1dSis 		*errnum = EBADF;
19574703203dSis 		return ((size_t)-1);
19584703203dSis 	}
19594703203dSis 
19604703203dSis 	if (inarray == NULL || *inlen == 0)
19614703203dSis 		return (0);
19624703203dSis 
19634703203dSis 	if (outarray == NULL) {
196485bb5f1dSis 		*errnum = E2BIG;
19654703203dSis 		return ((size_t)-1);
19664703203dSis 	}
19674703203dSis 
19684703203dSis 	ib = (uchar_t *)inarray;
19694703203dSis 	ob = (uchar_t *)outarray;
19704703203dSis 	ibtail = ib + *inlen;
19714703203dSis 	obtail = ob + *outlen;
19724703203dSis 
19734703203dSis 	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
19744703203dSis 	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
19754703203dSis 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
19764703203dSis 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
19774703203dSis 
19784703203dSis 	ret_val = 0;
19794703203dSis 
19804703203dSis 	/*
19814703203dSis 	 * If we don't have a normalization flag set, we do the simple case
19824703203dSis 	 * conversion based text preparation separately below. Text
19834703203dSis 	 * preparation involving Normalization will be done in the false task
19844703203dSis 	 * block, again, separately since it will take much more time and
19854703203dSis 	 * resource than doing simple case conversions.
19864703203dSis 	 */
19874703203dSis 	if (f == 0) {
19884703203dSis 		while (ib < ibtail) {
19894703203dSis 			if (*ib == '\0' && do_not_ignore_null)
19904703203dSis 				break;
19914703203dSis 
19924703203dSis 			sz = u8_number_of_bytes[*ib];
19934703203dSis 
19944703203dSis 			if (sz < 0) {
19954703203dSis 				if (do_not_ignore_invalid) {
199685bb5f1dSis 					*errnum = EILSEQ;
19974703203dSis 					ret_val = (size_t)-1;
19984703203dSis 					break;
19994703203dSis 				}
20004703203dSis 
20014703203dSis 				sz = 1;
20024703203dSis 				ret_val++;
20034703203dSis 			}
20044703203dSis 
20054703203dSis 			if (sz == 1) {
20064703203dSis 				if (ob >= obtail) {
200785bb5f1dSis 					*errnum = E2BIG;
20084703203dSis 					ret_val = (size_t)-1;
20094703203dSis 					break;
20104703203dSis 				}
20114703203dSis 
20124703203dSis 				if (is_it_toupper)
20134703203dSis 					*ob = U8_ASCII_TOUPPER(*ib);
20144703203dSis 				else if (is_it_tolower)
20154703203dSis 					*ob = U8_ASCII_TOLOWER(*ib);
20164703203dSis 				else
20174703203dSis 					*ob = *ib;
20184703203dSis 				ib++;
20194703203dSis 				ob++;
20204703203dSis 			} else if ((ib + sz) > ibtail) {
20214703203dSis 				if (do_not_ignore_invalid) {
202285bb5f1dSis 					*errnum = EINVAL;
20234703203dSis 					ret_val = (size_t)-1;
20244703203dSis 					break;
20254703203dSis 				}
20264703203dSis 
20274703203dSis 				if ((obtail - ob) < (ibtail - ib)) {
202885bb5f1dSis 					*errnum = E2BIG;
20294703203dSis 					ret_val = (size_t)-1;
20304703203dSis 					break;
20314703203dSis 				}
20324703203dSis 
20334703203dSis 				/*
20344703203dSis 				 * We treat the remaining incomplete character
20354703203dSis 				 * bytes as a character.
20364703203dSis 				 */
20374703203dSis 				ret_val++;
20384703203dSis 
20394703203dSis 				while (ib < ibtail)
20404703203dSis 					*ob++ = *ib++;
20414703203dSis 			} else {
20424703203dSis 				if (is_it_toupper || is_it_tolower) {
20434703203dSis 					i = do_case_conv(unicode_version, u8s,
20444703203dSis 					    ib, sz, is_it_toupper);
20454703203dSis 
20464703203dSis 					if ((obtail - ob) < i) {
204785bb5f1dSis 						*errnum = E2BIG;
20484703203dSis 						ret_val = (size_t)-1;
20494703203dSis 						break;
20504703203dSis 					}
20514703203dSis 
20524703203dSis 					ib += sz;
20534703203dSis 
20544703203dSis 					for (sz = 0; sz < i; sz++)
20554703203dSis 						*ob++ = u8s[sz];
20564703203dSis 				} else {
20574703203dSis 					if ((obtail - ob) < sz) {
205885bb5f1dSis 						*errnum = E2BIG;
20594703203dSis 						ret_val = (size_t)-1;
20604703203dSis 						break;
20614703203dSis 					}
20624703203dSis 
20634703203dSis 					for (i = 0; i < sz; i++)
20644703203dSis 						*ob++ = *ib++;
20654703203dSis 				}
20664703203dSis 			}
20674703203dSis 		}
20684703203dSis 	} else {
20694703203dSis 		canonical_decomposition = flag & U8_CANON_DECOMP;
20704703203dSis 		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
20714703203dSis 		canonical_composition = flag & U8_CANON_COMP;
20724703203dSis 
20734703203dSis 		while (ib < ibtail) {
20744703203dSis 			if (*ib == '\0' && do_not_ignore_null)
20754703203dSis 				break;
20764703203dSis 
20774703203dSis 			/*
20784703203dSis 			 * If the current character is a 7-bit ASCII
20794703203dSis 			 * character and it is the last character, or,
20804703203dSis 			 * if the current character is a 7-bit ASCII
20814703203dSis 			 * character and the next character is also a 7-bit
20824703203dSis 			 * ASCII character, then, we copy over this
20834703203dSis 			 * character without going through collect_a_seq().
20844703203dSis 			 *
20854703203dSis 			 * In any other cases, we need to look further with
20864703203dSis 			 * the collect_a_seq() function.
20874703203dSis 			 */
20884703203dSis 			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
20894703203dSis 			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
20904703203dSis 				if (ob >= obtail) {
209185bb5f1dSis 					*errnum = E2BIG;
20924703203dSis 					ret_val = (size_t)-1;
20934703203dSis 					break;
20944703203dSis 				}
20954703203dSis 
20964703203dSis 				if (is_it_toupper)
20974703203dSis 					*ob = U8_ASCII_TOUPPER(*ib);
20984703203dSis 				else if (is_it_tolower)
20994703203dSis 					*ob = U8_ASCII_TOLOWER(*ib);
21004703203dSis 				else
21014703203dSis 					*ob = *ib;
21024703203dSis 				ib++;
21034703203dSis 				ob++;
21044703203dSis 			} else {
210585bb5f1dSis 				*errnum = 0;
21064703203dSis 				state = U8_STATE_START;
21074703203dSis 
21084703203dSis 				j = collect_a_seq(unicode_version, u8s,
21094703203dSis 				    &ib, ibtail,
21104703203dSis 				    is_it_toupper,
21114703203dSis 				    is_it_tolower,
21124703203dSis 				    canonical_decomposition,
21134703203dSis 				    compatibility_decomposition,
21144703203dSis 				    canonical_composition,
211585bb5f1dSis 				    errnum, &state);
21164703203dSis 
211785bb5f1dSis 				if (*errnum && do_not_ignore_invalid) {
21184703203dSis 					ret_val = (size_t)-1;
21194703203dSis 					break;
21204703203dSis 				}
21214703203dSis 
21224703203dSis 				if ((obtail - ob) < j) {
212385bb5f1dSis 					*errnum = E2BIG;
21244703203dSis 					ret_val = (size_t)-1;
21254703203dSis 					break;
21264703203dSis 				}
21274703203dSis 
21284703203dSis 				for (i = 0; i < j; i++)
21294703203dSis 					*ob++ = u8s[i];
21304703203dSis 			}
21314703203dSis 		}
21324703203dSis 	}
21334703203dSis 
21344703203dSis 	*inlen = ibtail - ib;
21354703203dSis 	*outlen = obtail - ob;
21364703203dSis 
21374703203dSis 	return (ret_val);
21384703203dSis }
2139