14703203dSis /*
24703203dSis * CDDL HEADER START
34703203dSis *
44703203dSis * The contents of this file are subject to the terms of the
54703203dSis * Common Development and Distribution License (the "License").
64703203dSis * You may not use this file except in compliance with the License.
74703203dSis *
84703203dSis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
94703203dSis * or http://www.opensolaris.org/os/licensing.
104703203dSis * See the License for the specific language governing permissions
114703203dSis * and limitations under the License.
124703203dSis *
134703203dSis * When distributing Covered Code, include this CDDL HEADER in each
144703203dSis * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
154703203dSis * If applicable, add the following below this CDDL HEADER, with the
164703203dSis * fields enclosed by brackets "[]" replaced with your own identifying
174703203dSis * information: Portions Copyright [yyyy] [name of copyright owner]
184703203dSis *
194703203dSis * CDDL HEADER END
204703203dSis */
214703203dSis /*
2285bb5f1dSis * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
234703203dSis * Use is subject to license terms.
244703203dSis */
254703203dSis
26*f137b22eSDan McDonald /*
27*f137b22eSDan McDonald * Copyright 2022 MNX Cloud, Inc.
28*f137b22eSDan McDonald */
294703203dSis
304703203dSis
314703203dSis /*
324703203dSis * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
334703203dSis *
344703203dSis * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
354703203dSis * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
364703203dSis * the section 3C man pages.
374703203dSis * Interface stability: Committed.
384703203dSis */
394703203dSis
404703203dSis #include <sys/types.h>
414703203dSis #ifdef _KERNEL
424703203dSis #include <sys/param.h>
434703203dSis #include <sys/sysmacros.h>
444703203dSis #include <sys/systm.h>
454703203dSis #include <sys/debug.h>
464703203dSis #include <sys/kmem.h>
474703203dSis #include <sys/ddi.h>
484703203dSis #include <sys/sunddi.h>
494703203dSis #else
504703203dSis #include <sys/u8_textprep.h>
514703203dSis #include <strings.h>
524703203dSis #endif /* _KERNEL */
534703203dSis #include <sys/byteorder.h>
544703203dSis #include <sys/errno.h>
554703203dSis #include <sys/u8_textprep_data.h>
564703203dSis
574703203dSis
/* The maximum possible number of bytes in a UTF-8 character. */
#define	U8_MB_CUR_MAX			(4)

/*
 * The maximum number of bytes needed for a UTF-8 character to cover
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* The maximum possible number of bytes in a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * The maximum number of characters in a combining/conjoining sequence and
 * the actual upper bound limit of a combining/conjoining sequence.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/* The combining class value for a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)
794703203dSis
/*
 * Hangul related macros.
 *
 * The first and the last of Hangul syllables, Hangul Jamo Leading consonants,
 * Vowels, and optional Trailing consonants in Unicode scalar values.
 *
 * Please note that U8_HANGUL_JAMO_T_FIRST below is 0x11A7, not the actual
 * first trailing consonant U+11A8. The trailing consonant is optional and
 * so the value is pre-calculated by subtracting one; that is also why
 * U8_HANGUL_JAMO_T() compares against it with a strict '>'.
 *
 * Each of the 19 modern leading consonants has a total of 588 possible
 * syllables since Hangul has 21 modern vowels and 27 modern trailing
 * consonants plus 1 for the no trailing consonant case, i.e., 21 x 28 = 588.
 *
 * Please bear in mind that U8_HANGUL_JAMO_1ST_BYTE can be used to check
 * whether a character may be a Hangul Jamo, but a match does not guarantee
 * that it is one; it only means that it most likely is.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the three-byte UTF-8 form of the scalar value (b) into (s)[(i)],
 * (s)[(j)], and (s)[(k)]. Only the low 16 bits of (b) are used and (b) is
 * evaluated three times, so it must not have side effects.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and stays correct under an unbraced if/else.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	do { \
		(s)[(i)] = (uchar_t)(0xE0U | \
		    (((uint32_t)(b) & 0xF000U) >> 12)); \
		(s)[(j)] = (uchar_t)(0x80U | \
		    (((uint32_t)(b) & 0x0FC0U) >> 6)); \
		(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); \
	} while (0)

/* Is (u) a modern Hangul Jamo leading consonant? */
#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

/* Is (u) a modern Hangul Jamo vowel? */
#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

/* Is (u) a modern Hangul Jamo trailing consonant? (Note the strict '>'.) */
#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/* Is (u) anywhere between the first L and the last T Jamo, inclusive? */
#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/* Is (u) a precomposed Hangul syllable? */
#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

/* Can the vowel (u) be composed onto a preceding L in state (s)? */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

/* Can the trailing consonant (u) be composed onto an LV in state (s)? */
#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
1404703203dSis
/* The types of decomposition mappings. */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/* The indicator for 16-bit table. */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)
1474703203dSis
/* The following are some convenience macros. */

/*
 * Assign to (u) the scalar value formed from the three bytes (b1), (b2),
 * and (b3) of a three-byte UTF-8 character: the low 4 bits of (b1) and the
 * low 6 bits of each of (b2) and (b3). No validation is done here.
 *
 * The macro expands to a single statement (no trailing semicolon) so that
 * it can be used safely in an unbraced if/else.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	do { \
		(u) = (((uint32_t)(b1) & 0x0F) << 12) | \
		    (((uint32_t)(b2) & 0x3F) << 6) | \
		    ((uint32_t)(b3) & 0x3F); \
	} while (0)
1524703203dSis
/*
 * Exchange the values of (a) and (b) using (t) as the scratch variable.
 * Wrapped in do { } while (0) so that all three assignments stay together
 * as one statement, e.g., under an unbraced if.
 */
#define	U8_SIMPLE_SWAP(a, b, t) \
	do { \
		(t) = (a); \
		(a) = (b); \
		(b) = (t); \
	} while (0)
1574703203dSis
/*
 * ASCII-only case mapping: anything outside of 'a'..'z' (or 'A'..'Z') is
 * returned unchanged. The argument is evaluated more than once, so it must
 * not have side effects.
 */
#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

/* True if (c), taken as an unsigned byte, is a 7-bit ASCII value. */
#define	U8_ISASCII(c)	(((uchar_t)(c)) < 0x80U)
/*
 * Swap two adjacent characters, (a) and (b), of a sequence in place.
 *
 * The macro assumes that the two characters that are to be swapped are
 * adjacent to each other and that (a) comes before (b). If the assumptions
 * are not met, the macro will fail.
 *
 * It operates on names that must exist in the scope of the caller:
 *
 *	u8s		the byte buffer holding the characters,
 *	u8t		a scratch buffer large enough for one character,
 *	start[]		the offset of each character in u8s,
 *	disp[]		the byte length of each character,
 *	comb_class[]	the combining class of each character,
 *	k		a scratch loop index, and
 *	tc		a scratch variable for the element swaps.
 *
 * The bytes of (a) are saved into u8t, the bytes of (b) are moved down to
 * where (a) started, (a) is written back right after them, and then the
 * comb_class[] and disp[] entries are exchanged. start[(a)] is unchanged.
 *
 * The whole body is one do { } while (0) statement so that it cannot be
 * split apart by an unbraced if/else or loop at the point of use.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	do { \
		for (k = 0; k < disp[(a)]; k++) \
			u8t[k] = u8s[start[(a)] + k]; \
		for (k = 0; k < disp[(b)]; k++) \
			u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
		start[(b)] = start[(a)] + disp[(b)]; \
		for (k = 0; k < disp[(a)]; k++) \
			u8s[start[(b)] + k] = u8t[k]; \
		tc = comb_class[(a)]; \
		comb_class[(a)] = comb_class[(b)]; \
		comb_class[(b)] = tc; \
		tc = disp[(a)]; \
		disp[(a)] = disp[(b)]; \
		disp[(b)] = tc; \
	} while (0)
1814703203dSis
/*
 * The possible states during normalization. The Hangul states record which
 * Jamo (L, V, T) or Jamo combination (LV, LVT) has been seen most recently.
 */
typedef enum {
	U8_STATE_START = 0,
	U8_STATE_HANGUL_L = 1,
	U8_STATE_HANGUL_LV = 2,
	U8_STATE_HANGUL_LVT = 3,
	U8_STATE_HANGUL_V = 4,
	U8_STATE_HANGUL_T = 5,
	U8_STATE_COMBINING_MARK = 6
} u8_normalization_states_t;
1924703203dSis
1934703203dSis /*
1944703203dSis * The three vectors at below are used to check bytes of a given UTF-8
1954703203dSis * character are valid and not containing any malformed byte values.
1964703203dSis *
1974703203dSis * We used to have a quite relaxed UTF-8 binary representation but then there
1984703203dSis * was some security related issues and so the Unicode Consortium defined
1994703203dSis * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
2004703203dSis * one more time at the Unicode 3.2. The following three tables are based on
2014703203dSis * that.
2024703203dSis */
2034703203dSis
2044703203dSis #define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF)
2054703203dSis
2064703203dSis #define I_ U8_ILLEGAL_CHAR
2074703203dSis #define O_ U8_OUT_OF_RANGE_CHAR
2084703203dSis
2094703203dSis const int8_t u8_number_of_bytes[0x100] = {
2104703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2114703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2124703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2134703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2144703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2154703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2164703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2174703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2184703203dSis
2194703203dSis /* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */
2204703203dSis I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2214703203dSis
2224703203dSis /* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */
2234703203dSis I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2244703203dSis
2254703203dSis /* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */
2264703203dSis I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2274703203dSis
2284703203dSis /* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */
2294703203dSis I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2304703203dSis
2314703203dSis /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
2324703203dSis I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2334703203dSis
2344703203dSis /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
2354703203dSis 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2364703203dSis
2374703203dSis /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
2384703203dSis 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2394703203dSis
2404703203dSis /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
2414703203dSis 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
2424703203dSis };
2434703203dSis
2444703203dSis #undef I_
2454703203dSis #undef O_
2464703203dSis
/*
 * Indexed by the first byte of a multi-byte character: the smallest value
 * that is valid for the second byte. Entries for bytes that cannot start
 * a multi-byte character (00 - C1 and F5 - FF) are zero.
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
	/* 00 - BF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 00 - 0F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 10 - 1F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 20 - 2F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 30 - 3F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 40 - 4F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 50 - 5F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 60 - 6F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 70 - 7F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 80 - 8F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 90 - 9F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* A0 - AF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* B0 - BF */
/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
/*	F8    F9    FA    FB    FC    FD    FE    FF    */
	0,    0,    0,    0,    0,    0,    0,    0,
};
2884703203dSis
/*
 * Indexed by the first byte of a multi-byte character: the largest value
 * that is valid for the second byte. Entries for bytes that cannot start
 * a multi-byte character (00 - C1 and F5 - FF) are zero.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
	/* 00 - BF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 00 - 0F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 10 - 1F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 20 - 2F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 30 - 3F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 40 - 4F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 50 - 5F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 60 - 6F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 70 - 7F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 80 - 8F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 90 - 9F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* A0 - AF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* B0 - BF */
/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
/*	F8    F9    FA    FB    FC    FD    FE    FF    */
	0,    0,    0,    0,    0,    0,    0,    0,
};
3304703203dSis
3314703203dSis
3324703203dSis /*
3334703203dSis * The u8_validate() validates on the given UTF-8 character string and
3344703203dSis * calculate the byte length. It is quite similar to mblen(3C) except that
3354703203dSis * this will validate against the list of characters if required and
3364703203dSis * specific to UTF-8 and Unicode.
3374703203dSis */
3384703203dSis int
u8_validate(char * u8str,size_t n,char ** list,int flag,int * errnum)33985bb5f1dSis u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
3404703203dSis {
3414703203dSis uchar_t *ib;
3424703203dSis uchar_t *ibtail;
3434703203dSis uchar_t **p;
3444703203dSis uchar_t *s1;
3454703203dSis uchar_t *s2;
3464703203dSis uchar_t f;
3474703203dSis int sz;
3484703203dSis size_t i;
3494703203dSis int ret_val;
3504703203dSis boolean_t second;
3514703203dSis boolean_t no_need_to_validate_entire;
3524703203dSis boolean_t check_additional;
3534703203dSis boolean_t validate_ucs2_range_only;
3544703203dSis
3554703203dSis if (! u8str)
3564703203dSis return (0);
3574703203dSis
3584703203dSis ib = (uchar_t *)u8str;
3594703203dSis ibtail = ib + n;
3604703203dSis
3614703203dSis ret_val = 0;
3624703203dSis
3634703203dSis no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
3644703203dSis check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
3654703203dSis validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
3664703203dSis
3674703203dSis while (ib < ibtail) {
3684703203dSis /*
3694703203dSis * The first byte of a UTF-8 character tells how many
3704703203dSis * bytes will follow for the character. If the first byte
3714703203dSis * is an illegal byte value or out of range value, we just
3724703203dSis * return -1 with an appropriate error number.
3734703203dSis */
3744703203dSis sz = u8_number_of_bytes[*ib];
3754703203dSis if (sz == U8_ILLEGAL_CHAR) {
37685bb5f1dSis *errnum = EILSEQ;
3774703203dSis return (-1);
3784703203dSis }
3794703203dSis
3804703203dSis if (sz == U8_OUT_OF_RANGE_CHAR ||
3814703203dSis (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
38285bb5f1dSis *errnum = ERANGE;
3834703203dSis return (-1);
3844703203dSis }
3854703203dSis
3864703203dSis /*
3874703203dSis * If we don't have enough bytes to check on, that's also
3884703203dSis * an error. As you can see, we give illegal byte sequence
3894703203dSis * checking higher priority then EINVAL cases.
3904703203dSis */
3914703203dSis if ((ibtail - ib) < sz) {
39285bb5f1dSis *errnum = EINVAL;
3934703203dSis return (-1);
3944703203dSis }
3954703203dSis
3964703203dSis if (sz == 1) {
3974703203dSis ib++;
3984703203dSis ret_val++;
3994703203dSis } else {
4004703203dSis /*
4014703203dSis * Check on the multi-byte UTF-8 character. For more
4024703203dSis * details on this, see comment added for the used
4034703203dSis * data structures at the beginning of the file.
4044703203dSis */
4054703203dSis f = *ib++;
4064703203dSis ret_val++;
4074703203dSis second = B_TRUE;
4084703203dSis for (i = 1; i < sz; i++) {
4094703203dSis if (second) {
4104703203dSis if (*ib < u8_valid_min_2nd_byte[f] ||
4114703203dSis *ib > u8_valid_max_2nd_byte[f]) {
41285bb5f1dSis *errnum = EILSEQ;
4134703203dSis return (-1);
4144703203dSis }
4154703203dSis second = B_FALSE;
4164703203dSis } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
41785bb5f1dSis *errnum = EILSEQ;
4184703203dSis return (-1);
4194703203dSis }
4204703203dSis ib++;
4214703203dSis ret_val++;
4224703203dSis }
4234703203dSis }
4244703203dSis
4254703203dSis if (check_additional) {
4264703203dSis for (p = (uchar_t **)list, i = 0; p[i]; i++) {
4274703203dSis s1 = ib - sz;
4284703203dSis s2 = p[i];
4294703203dSis while (s1 < ib) {
4304703203dSis if (*s1 != *s2 || *s2 == '\0')
4314703203dSis break;
4324703203dSis s1++;
4334703203dSis s2++;
4344703203dSis }
4354703203dSis
4364703203dSis if (s1 >= ib && *s2 == '\0') {
43785bb5f1dSis *errnum = EBADF;
4384703203dSis return (-1);
4394703203dSis }
4404703203dSis }
4414703203dSis }
4424703203dSis
4434703203dSis if (no_need_to_validate_entire)
4444703203dSis break;
4454703203dSis }
4464703203dSis
4474703203dSis return (ret_val);
4484703203dSis }
4494703203dSis
4504703203dSis /*
4514703203dSis * The do_case_conv() looks at the mapping tables and returns found
4524703203dSis * bytes if any. If not found, the input bytes are returned. The function
4534703203dSis * always terminate the return bytes with a null character assuming that
4544703203dSis * there are plenty of room to do so.
4554703203dSis *
4564703203dSis * The case conversions are simple case conversions mapping a character to
4574703203dSis * another character as specified in the Unicode data. The byte size of
4584703203dSis * the mapped character could be different from that of the input character.
4594703203dSis *
4604703203dSis * The return value is the byte length of the returned character excluding
4614703203dSis * the terminating null byte.
4624703203dSis */
4634703203dSis static size_t
do_case_conv(int uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t is_it_toupper)4644703203dSis do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
4654703203dSis {
4664703203dSis size_t i;
4674703203dSis uint16_t b1 = 0;
4684703203dSis uint16_t b2 = 0;
4694703203dSis uint16_t b3 = 0;
4704703203dSis uint16_t b3_tbl;
4714703203dSis uint16_t b3_base;
4724703203dSis uint16_t b4 = 0;
4734703203dSis size_t start_id;
4744703203dSis size_t end_id;
4754703203dSis
4764703203dSis /*
4774703203dSis * At this point, the only possible values for sz are 2, 3, and 4.
4784703203dSis * The u8s should point to a vector that is well beyond the size of
4794703203dSis * 5 bytes.
4804703203dSis */
4814703203dSis if (sz == 2) {
4824703203dSis b3 = u8s[0] = s[0];
4834703203dSis b4 = u8s[1] = s[1];
4844703203dSis } else if (sz == 3) {
4854703203dSis b2 = u8s[0] = s[0];
4864703203dSis b3 = u8s[1] = s[1];
4874703203dSis b4 = u8s[2] = s[2];
4884703203dSis } else if (sz == 4) {
4894703203dSis b1 = u8s[0] = s[0];
4904703203dSis b2 = u8s[1] = s[1];
4914703203dSis b3 = u8s[2] = s[2];
4924703203dSis b4 = u8s[3] = s[3];
4934703203dSis } else {
4944703203dSis /* This is not possible but just in case as a fallback. */
4954703203dSis if (is_it_toupper)
4964703203dSis *u8s = U8_ASCII_TOUPPER(*s);
4974703203dSis else
4984703203dSis *u8s = U8_ASCII_TOLOWER(*s);
4994703203dSis u8s[1] = '\0';
5004703203dSis
5014703203dSis return (1);
5024703203dSis }
5034703203dSis u8s[sz] = '\0';
5044703203dSis
5054703203dSis /*
5064703203dSis * Let's find out if we have a corresponding character.
5074703203dSis */
5084703203dSis b1 = u8_common_b1_tbl[uv][b1];
5094703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF)
5104703203dSis return ((size_t)sz);
5114703203dSis
5124703203dSis b2 = u8_case_common_b2_tbl[uv][b1][b2];
5134703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF)
5144703203dSis return ((size_t)sz);
5154703203dSis
5164703203dSis if (is_it_toupper) {
5174703203dSis b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
5184703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5194703203dSis return ((size_t)sz);
5204703203dSis
5214703203dSis start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
5224703203dSis end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
5234703203dSis
5244703203dSis /* Either there is no match or an error at the table. */
5254703203dSis if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5264703203dSis return ((size_t)sz);
5274703203dSis
5284703203dSis b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
5294703203dSis
5304703203dSis for (i = 0; start_id < end_id; start_id++)
5314703203dSis u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
5324703203dSis } else {
5334703203dSis b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
5344703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5354703203dSis return ((size_t)sz);
5364703203dSis
5374703203dSis start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
5384703203dSis end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
5394703203dSis
5404703203dSis if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5414703203dSis return ((size_t)sz);
5424703203dSis
5434703203dSis b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
5444703203dSis
5454703203dSis for (i = 0; start_id < end_id; start_id++)
5464703203dSis u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
5474703203dSis }
5484703203dSis
5494703203dSis /*
5504703203dSis * If i is still zero, that means there is no corresponding character.
5514703203dSis */
5524703203dSis if (i == 0)
5534703203dSis return ((size_t)sz);
5544703203dSis
5554703203dSis u8s[i] = '\0';
5564703203dSis
5574703203dSis return (i);
5584703203dSis }
5594703203dSis
5604703203dSis /*
5614703203dSis * The do_case_compare() function compares the two input strings, s1 and s2,
5624703203dSis * one character at a time doing case conversions if applicable and return
5634703203dSis * the comparison result as like strcmp().
5644703203dSis *
5654703203dSis * Since, in empirical sense, most of text data are 7-bit ASCII characters,
5664703203dSis * we treat the 7-bit ASCII characters as a special case trying to yield
5674703203dSis * faster processing time.
5684703203dSis */
5694703203dSis static int
do_case_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,boolean_t is_it_toupper,int * errnum)5704703203dSis do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
57185bb5f1dSis size_t n2, boolean_t is_it_toupper, int *errnum)
5724703203dSis {
5734703203dSis int f;
5744703203dSis int sz1;
5754703203dSis int sz2;
5764703203dSis size_t j;
5774703203dSis size_t i1;
5784703203dSis size_t i2;
5794703203dSis uchar_t u8s1[U8_MB_CUR_MAX + 1];
5804703203dSis uchar_t u8s2[U8_MB_CUR_MAX + 1];
5814703203dSis
5824703203dSis i1 = i2 = 0;
5834703203dSis while (i1 < n1 && i2 < n2) {
5844703203dSis /*
5854703203dSis * Find out what would be the byte length for this UTF-8
5864703203dSis * character at string s1 and also find out if this is
5874703203dSis * an illegal start byte or not and if so, issue a proper
58885bb5f1dSis * error number and yet treat this byte as a character.
5894703203dSis */
5904703203dSis sz1 = u8_number_of_bytes[*s1];
5914703203dSis if (sz1 < 0) {
59285bb5f1dSis *errnum = EILSEQ;
5934703203dSis sz1 = 1;
5944703203dSis }
5954703203dSis
5964703203dSis /*
5974703203dSis * For 7-bit ASCII characters mainly, we do a quick case
5984703203dSis * conversion right at here.
5994703203dSis *
6004703203dSis * If we don't have enough bytes for this character, issue
6014703203dSis * an EINVAL error and use what are available.
6024703203dSis *
6034703203dSis * If we have enough bytes, find out if there is
6044703203dSis * a corresponding uppercase character and if so, copy over
6054703203dSis * the bytes for a comparison later. If there is no
6064703203dSis * corresponding uppercase character, then, use what we have
6074703203dSis * for the comparison.
6084703203dSis */
6094703203dSis if (sz1 == 1) {
6104703203dSis if (is_it_toupper)
6114703203dSis u8s1[0] = U8_ASCII_TOUPPER(*s1);
6124703203dSis else
6134703203dSis u8s1[0] = U8_ASCII_TOLOWER(*s1);
6144703203dSis s1++;
6154703203dSis u8s1[1] = '\0';
6164703203dSis } else if ((i1 + sz1) > n1) {
61785bb5f1dSis *errnum = EINVAL;
6184703203dSis for (j = 0; (i1 + j) < n1; )
6194703203dSis u8s1[j++] = *s1++;
6204703203dSis u8s1[j] = '\0';
6214703203dSis } else {
6224703203dSis (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
6234703203dSis s1 += sz1;
6244703203dSis }
6254703203dSis
6264703203dSis /* Do the same for the string s2. */
6274703203dSis sz2 = u8_number_of_bytes[*s2];
6284703203dSis if (sz2 < 0) {
62985bb5f1dSis *errnum = EILSEQ;
6304703203dSis sz2 = 1;
6314703203dSis }
6324703203dSis
6334703203dSis if (sz2 == 1) {
6344703203dSis if (is_it_toupper)
6354703203dSis u8s2[0] = U8_ASCII_TOUPPER(*s2);
6364703203dSis else
6374703203dSis u8s2[0] = U8_ASCII_TOLOWER(*s2);
6384703203dSis s2++;
6394703203dSis u8s2[1] = '\0';
6404703203dSis } else if ((i2 + sz2) > n2) {
64185bb5f1dSis *errnum = EINVAL;
6424703203dSis for (j = 0; (i2 + j) < n2; )
6434703203dSis u8s2[j++] = *s2++;
6444703203dSis u8s2[j] = '\0';
6454703203dSis } else {
6464703203dSis (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
6474703203dSis s2 += sz2;
6484703203dSis }
6494703203dSis
6504703203dSis /* Now compare the two characters. */
6514703203dSis if (sz1 == 1 && sz2 == 1) {
6524703203dSis if (*u8s1 > *u8s2)
6534703203dSis return (1);
6544703203dSis if (*u8s1 < *u8s2)
6554703203dSis return (-1);
6564703203dSis } else {
6574703203dSis f = strcmp((const char *)u8s1, (const char *)u8s2);
6584703203dSis if (f != 0)
6594703203dSis return (f);
6604703203dSis }
6614703203dSis
6624703203dSis /*
6634703203dSis * They were the same. Let's move on to the next
6644703203dSis * characters then.
6654703203dSis */
6664703203dSis i1 += sz1;
6674703203dSis i2 += sz2;
6684703203dSis }
6694703203dSis
6704703203dSis /*
6714703203dSis * We compared until the end of either or both strings.
6724703203dSis *
6734703203dSis * If we reached to or went over the ends for the both, that means
6744703203dSis * they are the same.
6754703203dSis *
6764703203dSis * If we reached only one of the two ends, that means the other string
6774703203dSis * has something which then the fact can be used to determine
6784703203dSis * the return value.
6794703203dSis */
6804703203dSis if (i1 >= n1) {
6814703203dSis if (i2 >= n2)
6824703203dSis return (0);
6834703203dSis return (-1);
6844703203dSis }
6854703203dSis return (1);
6864703203dSis }
6874703203dSis
6884703203dSis /*
6894703203dSis * The combining_class() function checks on the given bytes and find out
6904703203dSis * the corresponding Unicode combining class value. The return value 0 means
6914703203dSis * it is a Starter. Any illegal UTF-8 character will also be treated as
6924703203dSis * a Starter.
6934703203dSis */
6944703203dSis static uchar_t
combining_class(size_t uv,uchar_t * s,size_t sz)6954703203dSis combining_class(size_t uv, uchar_t *s, size_t sz)
6964703203dSis {
6974703203dSis uint16_t b1 = 0;
6984703203dSis uint16_t b2 = 0;
6994703203dSis uint16_t b3 = 0;
7004703203dSis uint16_t b4 = 0;
7014703203dSis
7024703203dSis if (sz == 1 || sz > 4)
7034703203dSis return (0);
7044703203dSis
7054703203dSis if (sz == 2) {
7064703203dSis b3 = s[0];
7074703203dSis b4 = s[1];
7084703203dSis } else if (sz == 3) {
7094703203dSis b2 = s[0];
7104703203dSis b3 = s[1];
7114703203dSis b4 = s[2];
7124703203dSis } else if (sz == 4) {
7134703203dSis b1 = s[0];
7144703203dSis b2 = s[1];
7154703203dSis b3 = s[2];
7164703203dSis b4 = s[3];
7174703203dSis }
7184703203dSis
7194703203dSis b1 = u8_common_b1_tbl[uv][b1];
7204703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF)
7214703203dSis return (0);
7224703203dSis
7234703203dSis b2 = u8_combining_class_b2_tbl[uv][b1][b2];
7244703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF)
7254703203dSis return (0);
7264703203dSis
7274703203dSis b3 = u8_combining_class_b3_tbl[uv][b2][b3];
7284703203dSis if (b3 == U8_TBL_ELEMENT_NOT_DEF)
7294703203dSis return (0);
7304703203dSis
7314703203dSis return (u8_combining_class_b4_tbl[uv][b3][b4]);
7324703203dSis }
7334703203dSis
7344703203dSis /*
7354703203dSis * The do_decomp() function finds out a matching decomposition if any
7364703203dSis * and return. If there is no match, the input bytes are copied and returned.
7374703203dSis * The function also checks if there is a Hangul, decomposes it if necessary
7384703203dSis * and returns.
7394703203dSis *
7404703203dSis * To save time, a single byte 7-bit ASCII character should be handled by
7414703203dSis * the caller.
7424703203dSis *
7434703203dSis * The function returns the number of bytes returned sans always terminating
7444703203dSis * the null byte. It will also return a state that will tell if there was
7454703203dSis * a Hangul character decomposed which then will be used by the caller.
7464703203dSis */
7474703203dSis static size_t
do_decomp(size_t uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t canonical_decomposition,u8_normalization_states_t * state)7484703203dSis do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
7494703203dSis boolean_t canonical_decomposition, u8_normalization_states_t *state)
7504703203dSis {
7514703203dSis uint16_t b1 = 0;
7524703203dSis uint16_t b2 = 0;
7534703203dSis uint16_t b3 = 0;
7544703203dSis uint16_t b3_tbl;
7554703203dSis uint16_t b3_base;
7564703203dSis uint16_t b4 = 0;
7574703203dSis size_t start_id;
7584703203dSis size_t end_id;
7594703203dSis size_t i;
7604703203dSis uint32_t u1;
7614703203dSis
7624703203dSis if (sz == 2) {
7634703203dSis b3 = u8s[0] = s[0];
7644703203dSis b4 = u8s[1] = s[1];
7654703203dSis u8s[2] = '\0';
7664703203dSis } else if (sz == 3) {
7674703203dSis /* Convert it to a Unicode scalar value. */
7684703203dSis U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
7694703203dSis
7704703203dSis /*
7714703203dSis * If this is a Hangul syllable, we decompose it into
7724703203dSis * a leading consonant, a vowel, and an optional trailing
7734703203dSis * consonant and then return.
7744703203dSis */
7754703203dSis if (U8_HANGUL_SYLLABLE(u1)) {
7764703203dSis u1 -= U8_HANGUL_SYL_FIRST;
7774703203dSis
7784703203dSis b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
7794703203dSis b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
7804703203dSis / U8_HANGUL_T_COUNT;
7814703203dSis b3 = u1 % U8_HANGUL_T_COUNT;
7824703203dSis
7834703203dSis U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
7844703203dSis U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
7854703203dSis if (b3) {
7864703203dSis b3 += U8_HANGUL_JAMO_T_FIRST;
7874703203dSis U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
7884703203dSis
7894703203dSis u8s[9] = '\0';
7904703203dSis *state = U8_STATE_HANGUL_LVT;
7914703203dSis return (9);
7924703203dSis }
7934703203dSis
7944703203dSis u8s[6] = '\0';
7954703203dSis *state = U8_STATE_HANGUL_LV;
7964703203dSis return (6);
7974703203dSis }
7984703203dSis
7994703203dSis b2 = u8s[0] = s[0];
8004703203dSis b3 = u8s[1] = s[1];
8014703203dSis b4 = u8s[2] = s[2];
8024703203dSis u8s[3] = '\0';
8034703203dSis
8044703203dSis /*
8054703203dSis * If this is a Hangul Jamo, we know there is nothing
8064703203dSis * further that we can decompose.
8074703203dSis */
8084703203dSis if (U8_HANGUL_JAMO_L(u1)) {
8094703203dSis *state = U8_STATE_HANGUL_L;
8104703203dSis return (3);
8114703203dSis }
8124703203dSis
8134703203dSis if (U8_HANGUL_JAMO_V(u1)) {
8144703203dSis if (*state == U8_STATE_HANGUL_L)
8154703203dSis *state = U8_STATE_HANGUL_LV;
8164703203dSis else
8174703203dSis *state = U8_STATE_HANGUL_V;
8184703203dSis return (3);
8194703203dSis }
8204703203dSis
8214703203dSis if (U8_HANGUL_JAMO_T(u1)) {
8224703203dSis if (*state == U8_STATE_HANGUL_LV)
8234703203dSis *state = U8_STATE_HANGUL_LVT;
8244703203dSis else
8254703203dSis *state = U8_STATE_HANGUL_T;
8264703203dSis return (3);
8274703203dSis }
8284703203dSis } else if (sz == 4) {
8294703203dSis b1 = u8s[0] = s[0];
8304703203dSis b2 = u8s[1] = s[1];
8314703203dSis b3 = u8s[2] = s[2];
8324703203dSis b4 = u8s[3] = s[3];
8334703203dSis u8s[4] = '\0';
8344703203dSis } else {
8354703203dSis /*
8364703203dSis * This is a fallback and should not happen if the function
8374703203dSis * was called properly.
8384703203dSis */
8394703203dSis u8s[0] = s[0];
8404703203dSis u8s[1] = '\0';
8414703203dSis *state = U8_STATE_START;
8424703203dSis return (1);
8434703203dSis }
8444703203dSis
8454703203dSis /*
8464703203dSis * At this point, this rountine does not know what it would get.
8474703203dSis * The caller should sort it out if the state isn't a Hangul one.
8484703203dSis */
8494703203dSis *state = U8_STATE_START;
8504703203dSis
8514703203dSis /* Try to find matching decomposition mapping byte sequence. */
8524703203dSis b1 = u8_common_b1_tbl[uv][b1];
8534703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF)
8544703203dSis return ((size_t)sz);
8554703203dSis
8564703203dSis b2 = u8_decomp_b2_tbl[uv][b1][b2];
8574703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF)
8584703203dSis return ((size_t)sz);
8594703203dSis
8604703203dSis b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
8614703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
8624703203dSis return ((size_t)sz);
8634703203dSis
8644703203dSis /*
8654703203dSis * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
8664703203dSis * which is 0x8000, this means we couldn't fit the mappings into
8674703203dSis * the cardinality of a unsigned byte.
8684703203dSis */
8694703203dSis if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
8704703203dSis b3_tbl -= U8_16BIT_TABLE_INDICATOR;
8714703203dSis start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
8724703203dSis end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
8734703203dSis } else {
8744703203dSis start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
8754703203dSis end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
8764703203dSis }
8774703203dSis
8784703203dSis /* This also means there wasn't any matching decomposition. */
8794703203dSis if (start_id >= end_id)
8804703203dSis return ((size_t)sz);
8814703203dSis
8824703203dSis /*
8834703203dSis * The final table for decomposition mappings has three types of
8844703203dSis * byte sequences depending on whether a mapping is for compatibility
8854703203dSis * decomposition, canonical decomposition, or both like the following:
8864703203dSis *
8874703203dSis * (1) Compatibility decomposition mappings:
8884703203dSis *
8894703203dSis * +---+---+-...-+---+
8904703203dSis * | B0| B1| ... | Bm|
8914703203dSis * +---+---+-...-+---+
8924703203dSis *
8934703203dSis * The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH).
8944703203dSis *
8954703203dSis * (2) Canonical decomposition mappings:
8964703203dSis *
8974703203dSis * +---+---+---+-...-+---+
8984703203dSis * | T | b0| b1| ... | bn|
8994703203dSis * +---+---+---+-...-+---+
9004703203dSis *
9014703203dSis * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
9024703203dSis *
9034703203dSis * (3) Both mappings:
9044703203dSis *
9054703203dSis * +---+---+---+---+-...-+---+---+---+-...-+---+
9064703203dSis * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
9074703203dSis * +---+---+---+---+-...-+---+---+---+-...-+---+
9084703203dSis *
9094703203dSis * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
9104703203dSis * byte, b0 to bn are canonical mapping bytes and B0 to Bm are
9114703203dSis * compatibility mapping bytes.
9124703203dSis *
9134703203dSis * Note that compatibility decomposition means doing recursive
9144703203dSis * decompositions using both compatibility decomposition mappings and
9154703203dSis * canonical decomposition mappings. On the other hand, canonical
9164703203dSis * decomposition means doing recursive decompositions using only
9174703203dSis * canonical decomposition mappings. Since the table we have has gone
9184703203dSis * through the recursions already, we do not need to do so during
9194703203dSis * runtime, i.e., the table has been completely flattened out
9204703203dSis * already.
9214703203dSis */
9224703203dSis
9234703203dSis b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
9244703203dSis
9254703203dSis /* Get the type, T, of the byte sequence. */
9264703203dSis b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
9274703203dSis
9284703203dSis /*
9294703203dSis * If necessary, adjust start_id, end_id, or both. Note that if
9304703203dSis * this is compatibility decomposition mapping, there is no
9314703203dSis * adjustment.
9324703203dSis */
9334703203dSis if (canonical_decomposition) {
9344703203dSis /* Is the mapping only for compatibility decomposition? */
9354703203dSis if (b1 < U8_DECOMP_BOTH)
9364703203dSis return ((size_t)sz);
9374703203dSis
9384703203dSis start_id++;
9394703203dSis
9404703203dSis if (b1 == U8_DECOMP_BOTH) {
9414703203dSis end_id = start_id +
9424703203dSis u8_decomp_final_tbl[uv][b3_base + start_id];
9434703203dSis start_id++;
9444703203dSis }
9454703203dSis } else {
9464703203dSis /*
9474703203dSis * Unless this is a compatibility decomposition mapping,
9484703203dSis * we adjust the start_id.
9494703203dSis */
9504703203dSis if (b1 == U8_DECOMP_BOTH) {
9514703203dSis start_id++;
9524703203dSis start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
9534703203dSis } else if (b1 == U8_DECOMP_CANONICAL) {
9544703203dSis start_id++;
9554703203dSis }
9564703203dSis }
9574703203dSis
9584703203dSis for (i = 0; start_id < end_id; start_id++)
9594703203dSis u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
9604703203dSis u8s[i] = '\0';
9614703203dSis
9624703203dSis return (i);
9634703203dSis }
9644703203dSis
9654703203dSis /*
9664703203dSis * The find_composition_start() function uses the character bytes given and
9674703203dSis * find out the matching composition mappings if any and return the address
9684703203dSis * to the composition mappings as explained in the do_composition().
9694703203dSis */
9704703203dSis static uchar_t *
find_composition_start(size_t uv,uchar_t * s,size_t sz)9714703203dSis find_composition_start(size_t uv, uchar_t *s, size_t sz)
9724703203dSis {
9734703203dSis uint16_t b1 = 0;
9744703203dSis uint16_t b2 = 0;
9754703203dSis uint16_t b3 = 0;
9764703203dSis uint16_t b3_tbl;
9774703203dSis uint16_t b3_base;
9784703203dSis uint16_t b4 = 0;
9794703203dSis size_t start_id;
9804703203dSis size_t end_id;
9814703203dSis
9824703203dSis if (sz == 1) {
9834703203dSis b4 = s[0];
9844703203dSis } else if (sz == 2) {
9854703203dSis b3 = s[0];
9864703203dSis b4 = s[1];
9874703203dSis } else if (sz == 3) {
9884703203dSis b2 = s[0];
9894703203dSis b3 = s[1];
9904703203dSis b4 = s[2];
9914703203dSis } else if (sz == 4) {
9924703203dSis b1 = s[0];
9934703203dSis b2 = s[1];
9944703203dSis b3 = s[2];
9954703203dSis b4 = s[3];
9964703203dSis } else {
9974703203dSis /*
9984703203dSis * This is a fallback and should not happen if the function
9994703203dSis * was called properly.
10004703203dSis */
10014703203dSis return (NULL);
10024703203dSis }
10034703203dSis
10044703203dSis b1 = u8_composition_b1_tbl[uv][b1];
10054703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF)
10064703203dSis return (NULL);
10074703203dSis
10084703203dSis b2 = u8_composition_b2_tbl[uv][b1][b2];
10094703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF)
10104703203dSis return (NULL);
10114703203dSis
10124703203dSis b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
10134703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
10144703203dSis return (NULL);
10154703203dSis
10164703203dSis if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
10174703203dSis b3_tbl -= U8_16BIT_TABLE_INDICATOR;
10184703203dSis start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
10194703203dSis end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
10204703203dSis } else {
10214703203dSis start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
10224703203dSis end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
10234703203dSis }
10244703203dSis
10254703203dSis if (start_id >= end_id)
10264703203dSis return (NULL);
10274703203dSis
10284703203dSis b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
10294703203dSis
10304703203dSis return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
10314703203dSis }
10324703203dSis
10334703203dSis /*
10344703203dSis * The blocked() function checks on the combining class values of previous
10354703203dSis * characters in this sequence and return whether it is blocked or not.
10364703203dSis */
10374703203dSis static boolean_t
blocked(uchar_t * comb_class,size_t last)10384703203dSis blocked(uchar_t *comb_class, size_t last)
10394703203dSis {
10404703203dSis uchar_t my_comb_class;
10414703203dSis size_t i;
10424703203dSis
10434703203dSis my_comb_class = comb_class[last];
10444703203dSis for (i = 1; i < last; i++)
10454703203dSis if (comb_class[i] >= my_comb_class ||
10464703203dSis comb_class[i] == U8_COMBINING_CLASS_STARTER)
10474703203dSis return (B_TRUE);
10484703203dSis
10494703203dSis return (B_FALSE);
10504703203dSis }
10514703203dSis
10524703203dSis /*
10534703203dSis * The do_composition() reads the character string pointed by 's' and
10544703203dSis * do necessary canonical composition and then copy over the result back to
10554703203dSis * the 's'.
10564703203dSis *
10574703203dSis * The input argument 's' cannot contain more than 32 characters.
10584703203dSis */
10594703203dSis static size_t
do_composition(size_t uv,uchar_t * s,uchar_t * comb_class,uchar_t * start,uchar_t * disp,size_t last,uchar_t ** os,uchar_t * oslast)10604703203dSis do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
10614703203dSis uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
10624703203dSis {
10634703203dSis uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
10644703203dSis uchar_t tc[U8_MB_CUR_MAX];
10654703203dSis uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
10664703203dSis size_t saved_marks_count;
10674703203dSis uchar_t *p;
10684703203dSis uchar_t *saved_p;
10694703203dSis uchar_t *q;
10704703203dSis size_t i;
10714703203dSis size_t saved_i;
10724703203dSis size_t j;
10734703203dSis size_t k;
10744703203dSis size_t l;
10754703203dSis size_t C;
10764703203dSis size_t saved_l;
10774703203dSis size_t size;
10784703203dSis uint32_t u1;
10794703203dSis uint32_t u2;
10804703203dSis boolean_t match_not_found = B_TRUE;
10814703203dSis
10824703203dSis /*
10834703203dSis * This should never happen unless the callers are doing some strange
10844703203dSis * and unexpected things.
10854703203dSis *
10864703203dSis * The "last" is the index pointing to the last character not last + 1.
10874703203dSis */
10884703203dSis if (last >= U8_MAX_CHARS_A_SEQ)
10894703203dSis last = U8_UPPER_LIMIT_IN_A_SEQ;
10904703203dSis
10914703203dSis for (i = l = 0; i <= last; i++) {
10924703203dSis /*
10934703203dSis * The last or any non-Starters at the beginning, we don't
10944703203dSis * have any chance to do composition and so we just copy them
10954703203dSis * to the temporary buffer.
10964703203dSis */
10974703203dSis if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
10984703203dSis SAVE_THE_CHAR:
10994703203dSis p = s + start[i];
11004703203dSis size = disp[i];
11014703203dSis for (k = 0; k < size; k++)
11024703203dSis t[l++] = *p++;
11034703203dSis continue;
11044703203dSis }
11054703203dSis
11064703203dSis /*
11074703203dSis * If this could be a start of Hangul Jamos, then, we try to
11084703203dSis * conjoin them.
11094703203dSis */
11104703203dSis if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
11114703203dSis U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
11124703203dSis s[start[i] + 1], s[start[i] + 2]);
11134703203dSis U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
11144703203dSis s[start[i] + 4], s[start[i] + 5]);
11154703203dSis
11164703203dSis if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
11174703203dSis u1 -= U8_HANGUL_JAMO_L_FIRST;
11184703203dSis u2 -= U8_HANGUL_JAMO_V_FIRST;
11194703203dSis u1 = U8_HANGUL_SYL_FIRST +
11204703203dSis (u1 * U8_HANGUL_V_COUNT + u2) *
11214703203dSis U8_HANGUL_T_COUNT;
11224703203dSis
11234703203dSis i += 2;
11244703203dSis if (i <= last) {
11254703203dSis U8_PUT_3BYTES_INTO_UTF32(u2,
11264703203dSis s[start[i]], s[start[i] + 1],
11274703203dSis s[start[i] + 2]);
11284703203dSis
11294703203dSis if (U8_HANGUL_JAMO_T(u2)) {
11304703203dSis u1 += u2 -
11314703203dSis U8_HANGUL_JAMO_T_FIRST;
11324703203dSis i++;
11334703203dSis }
11344703203dSis }
11354703203dSis
11364703203dSis U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
11374703203dSis i--;
11384703203dSis l += 3;
11394703203dSis continue;
11404703203dSis }
11414703203dSis }
11424703203dSis
11434703203dSis /*
11444703203dSis * Let's then find out if this Starter has composition
11454703203dSis * mapping.
11464703203dSis */
11474703203dSis p = find_composition_start(uv, s + start[i], disp[i]);
11484703203dSis if (p == NULL)
11494703203dSis goto SAVE_THE_CHAR;
11504703203dSis
11514703203dSis /*
11524703203dSis * We have a Starter with composition mapping and the next
11534703203dSis * character is a non-Starter. Let's try to find out if
11544703203dSis * we can do composition.
11554703203dSis */
11564703203dSis
11574703203dSis saved_p = p;
11584703203dSis saved_i = i;
11594703203dSis saved_l = l;
11604703203dSis saved_marks_count = 0;
11614703203dSis
11624703203dSis TRY_THE_NEXT_MARK:
11634703203dSis q = s + start[++i];
11644703203dSis size = disp[i];
11654703203dSis
11664703203dSis /*
11674703203dSis * The next for() loop compares the non-Starter pointed by
11684703203dSis * 'q' with the possible (joinable) characters pointed by 'p'.
11694703203dSis *
11704703203dSis * The composition final table entry pointed by the 'p'
11714703203dSis * looks like the following:
11724703203dSis *
11734703203dSis * +---+---+---+-...-+---+---+---+---+-...-+---+---+
11744703203dSis * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
11754703203dSis * +---+---+---+-...-+---+---+---+---+-...-+---+---+
11764703203dSis *
11774703203dSis * where C is the count byte indicating the number of
11784703203dSis * mapping pairs where each pair would be look like
11794703203dSis * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
11804703203dSis * character of a canonical decomposition and the B0-Bm are
11814703203dSis * the bytes of a matching composite character. The F is
11824703203dSis * a filler byte after each character as the separator.
11834703203dSis */
11844703203dSis
11854703203dSis match_not_found = B_TRUE;
11864703203dSis
11874703203dSis for (C = *p++; C > 0; C--) {
11884703203dSis for (k = 0; k < size; p++, k++)
11894703203dSis if (*p != q[k])
11904703203dSis break;
11914703203dSis
11924703203dSis /* Have we found it? */
11934703203dSis if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
11944703203dSis match_not_found = B_FALSE;
11954703203dSis
11964703203dSis l = saved_l;
11974703203dSis
11984703203dSis while (*++p != U8_TBL_ELEMENT_FILLER)
11994703203dSis t[l++] = *p;
12004703203dSis
12014703203dSis break;
12024703203dSis }
12034703203dSis
12044703203dSis /* We didn't find; skip to the next pair. */
12054703203dSis if (*p != U8_TBL_ELEMENT_FILLER)
12064703203dSis while (*++p != U8_TBL_ELEMENT_FILLER)
12074703203dSis ;
12084703203dSis while (*++p != U8_TBL_ELEMENT_FILLER)
12094703203dSis ;
12104703203dSis p++;
12114703203dSis }
12124703203dSis
12134703203dSis /*
12144703203dSis * If there was no match, we will need to save the combining
12154703203dSis * mark for later appending. After that, if the next one
12164703203dSis * is a non-Starter and not blocked, then, we try once
12174703203dSis * again to do composition with the next non-Starter.
12184703203dSis *
12194703203dSis * If there was no match and this was a Starter, then,
12204703203dSis * this is a new start.
12214703203dSis *
12224703203dSis * If there was a match and a composition done and we have
12234703203dSis * more to check on, then, we retrieve a new composition final
12244703203dSis * table entry for the composite and then try to do the
12254703203dSis * composition again.
12264703203dSis */
12274703203dSis
12284703203dSis if (match_not_found) {
12294703203dSis if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
12304703203dSis i--;
12314703203dSis goto SAVE_THE_CHAR;
12324703203dSis }
12334703203dSis
12344703203dSis saved_marks[saved_marks_count++] = i;
12354703203dSis }
12364703203dSis
12374703203dSis if (saved_l == l) {
12384703203dSis while (i < last) {
12394703203dSis if (blocked(comb_class, i + 1))
12404703203dSis saved_marks[saved_marks_count++] = ++i;
12414703203dSis else
12424703203dSis break;
12434703203dSis }
12444703203dSis if (i < last) {
12454703203dSis p = saved_p;
12464703203dSis goto TRY_THE_NEXT_MARK;
12474703203dSis }
12484703203dSis } else if (i < last) {
12494703203dSis p = find_composition_start(uv, t + saved_l,
12504703203dSis l - saved_l);
12514703203dSis if (p != NULL) {
12524703203dSis saved_p = p;
12534703203dSis goto TRY_THE_NEXT_MARK;
12544703203dSis }
12554703203dSis }
12564703203dSis
12574703203dSis /*
12584703203dSis * There is no more composition possible.
12594703203dSis *
12604703203dSis * If there was no composition what so ever then we copy
12614703203dSis * over the original Starter and then append any non-Starters
12624703203dSis * remaining at the target string sequentially after that.
12634703203dSis */
12644703203dSis
12654703203dSis if (saved_l == l) {
12664703203dSis p = s + start[saved_i];
12674703203dSis size = disp[saved_i];
12684703203dSis for (j = 0; j < size; j++)
12694703203dSis t[l++] = *p++;
12704703203dSis }
12714703203dSis
12724703203dSis for (k = 0; k < saved_marks_count; k++) {
12734703203dSis p = s + start[saved_marks[k]];
12744703203dSis size = disp[saved_marks[k]];
12754703203dSis for (j = 0; j < size; j++)
12764703203dSis t[l++] = *p++;
12774703203dSis }
12784703203dSis }
12794703203dSis
12804703203dSis /*
12814703203dSis * If the last character is a Starter and if we have a character
12824703203dSis * (possibly another Starter) that can be turned into a composite,
12834703203dSis * we do so and we do so until there is no more of composition
12844703203dSis * possible.
12854703203dSis */
12864703203dSis if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
12874703203dSis p = *os;
12884703203dSis saved_l = l - disp[last];
12894703203dSis
12904703203dSis while (p < oslast) {
1291*f137b22eSDan McDonald int8_t number_of_bytes = u8_number_of_bytes[*p];
1292*f137b22eSDan McDonald
1293*f137b22eSDan McDonald if (number_of_bytes <= 1)
1294*f137b22eSDan McDonald break;
1295*f137b22eSDan McDonald size = number_of_bytes;
1296*f137b22eSDan McDonald if ((p + size) > oslast)
12974703203dSis break;
12984703203dSis
12994703203dSis saved_p = p;
13004703203dSis
13014703203dSis for (i = 0; i < size; i++)
13024703203dSis tc[i] = *p++;
13034703203dSis
13044703203dSis q = find_composition_start(uv, t + saved_l,
13054703203dSis l - saved_l);
13064703203dSis if (q == NULL) {
13074703203dSis p = saved_p;
13084703203dSis break;
13094703203dSis }
13104703203dSis
13114703203dSis match_not_found = B_TRUE;
13124703203dSis
13134703203dSis for (C = *q++; C > 0; C--) {
13144703203dSis for (k = 0; k < size; q++, k++)
13154703203dSis if (*q != tc[k])
13164703203dSis break;
13174703203dSis
13184703203dSis if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
13194703203dSis match_not_found = B_FALSE;
13204703203dSis
13214703203dSis l = saved_l;
13224703203dSis
13234703203dSis while (*++q != U8_TBL_ELEMENT_FILLER) {
13244703203dSis /*
13254703203dSis * This is practically
13264703203dSis * impossible but we don't
13274703203dSis * want to take any chances.
13284703203dSis */
13294703203dSis if (l >=
13304703203dSis U8_STREAM_SAFE_TEXT_MAX) {
13314703203dSis p = saved_p;
13324703203dSis goto SAFE_RETURN;
13334703203dSis }
13344703203dSis t[l++] = *q;
13354703203dSis }
13364703203dSis
13374703203dSis break;
13384703203dSis }
13394703203dSis
13404703203dSis if (*q != U8_TBL_ELEMENT_FILLER)
13414703203dSis while (*++q != U8_TBL_ELEMENT_FILLER)
13424703203dSis ;
13434703203dSis while (*++q != U8_TBL_ELEMENT_FILLER)
13444703203dSis ;
13454703203dSis q++;
13464703203dSis }
13474703203dSis
13484703203dSis if (match_not_found) {
13494703203dSis p = saved_p;
13504703203dSis break;
13514703203dSis }
13524703203dSis }
13534703203dSis SAFE_RETURN:
13544703203dSis *os = p;
13554703203dSis }
13564703203dSis
13574703203dSis /*
13584703203dSis * Now we copy over the temporary string to the target string.
13594703203dSis * Since composition always reduces the number of characters or
13604703203dSis * the number of characters stay, we don't need to worry about
13614703203dSis * the buffer overflow here.
13624703203dSis */
13634703203dSis for (i = 0; i < l; i++)
13644703203dSis s[i] = t[i];
13654703203dSis s[l] = '\0';
13664703203dSis
13674703203dSis return (l);
13684703203dSis }
13694703203dSis
/*
 * The collect_a_seq() function checks on the given string s, collects
 * a sequence of characters at u8s, and returns the sequence. While it
 * collects a sequence, it also applies case conversion, canonical or
 * compatibility decomposition, canonical composition, or some or all of
 * them and in that order.
 *
 * The collected sequence cannot be bigger than 32 characters since if
 * it is having more than 31 characters, the sequence will be terminated
 * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
 * a Stream-Safe Text. The collected sequence is always terminated with
 * a null byte and the return value is the byte length of the sequence,
 * which can be 0. The return value does not include the terminating
 * null byte.
 */
13854703203dSis static size_t
collect_a_seq(size_t uv,uchar_t * u8s,uchar_t ** source,uchar_t * slast,boolean_t is_it_toupper,boolean_t is_it_tolower,boolean_t canonical_decomposition,boolean_t compatibility_decomposition,boolean_t canonical_composition,int * errnum,u8_normalization_states_t * state)13864703203dSis collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
13874703203dSis boolean_t is_it_toupper,
13884703203dSis boolean_t is_it_tolower,
13894703203dSis boolean_t canonical_decomposition,
13904703203dSis boolean_t compatibility_decomposition,
13914703203dSis boolean_t canonical_composition,
139285bb5f1dSis int *errnum, u8_normalization_states_t *state)
13934703203dSis {
13944703203dSis uchar_t *s;
13954703203dSis int sz;
13964703203dSis int saved_sz;
13974703203dSis size_t i;
13984703203dSis size_t j;
13994703203dSis size_t k;
14004703203dSis size_t l;
14014703203dSis uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
14024703203dSis uchar_t disp[U8_MAX_CHARS_A_SEQ];
14034703203dSis uchar_t start[U8_MAX_CHARS_A_SEQ];
14044703203dSis uchar_t u8t[U8_MB_CUR_MAX];
14054703203dSis uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
14064703203dSis uchar_t tc;
14074703203dSis size_t last;
14084703203dSis size_t saved_last;
14094703203dSis uint32_t u1;
14104703203dSis
14114703203dSis /*
14124703203dSis * Save the source string pointer which we will return a changed
14134703203dSis * pointer if we do processing.
14144703203dSis */
14154703203dSis s = *source;
14164703203dSis
14174703203dSis /*
14184703203dSis * The following is a fallback for just in case callers are not
14194703203dSis * checking the string boundaries before the calling.
14204703203dSis */
14214703203dSis if (s >= slast) {
14224703203dSis u8s[0] = '\0';
14234703203dSis
14244703203dSis return (0);
14254703203dSis }
14264703203dSis
14274703203dSis /*
14284703203dSis * As the first thing, let's collect a character and do case
14294703203dSis * conversion if necessary.
14304703203dSis */
14314703203dSis
14324703203dSis sz = u8_number_of_bytes[*s];
14334703203dSis
14344703203dSis if (sz < 0) {
143585bb5f1dSis *errnum = EILSEQ;
14364703203dSis
14374703203dSis u8s[0] = *s++;
14384703203dSis u8s[1] = '\0';
14394703203dSis
14404703203dSis *source = s;
14414703203dSis
14424703203dSis return (1);
14434703203dSis }
14444703203dSis
14454703203dSis if (sz == 1) {
14464703203dSis if (is_it_toupper)
14474703203dSis u8s[0] = U8_ASCII_TOUPPER(*s);
14484703203dSis else if (is_it_tolower)
14494703203dSis u8s[0] = U8_ASCII_TOLOWER(*s);
14504703203dSis else
14514703203dSis u8s[0] = *s;
14524703203dSis s++;
14534703203dSis u8s[1] = '\0';
14544703203dSis } else if ((s + sz) > slast) {
145585bb5f1dSis *errnum = EINVAL;
14564703203dSis
14574703203dSis for (i = 0; s < slast; )
14584703203dSis u8s[i++] = *s++;
14594703203dSis u8s[i] = '\0';
14604703203dSis
14614703203dSis *source = s;
14624703203dSis
14634703203dSis return (i);
14644703203dSis } else {
14654703203dSis if (is_it_toupper || is_it_tolower) {
14664703203dSis i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
14674703203dSis s += sz;
14684703203dSis sz = i;
14694703203dSis } else {
14704703203dSis for (i = 0; i < sz; )
14714703203dSis u8s[i++] = *s++;
14724703203dSis u8s[i] = '\0';
14734703203dSis }
14744703203dSis }
14754703203dSis
14764703203dSis /*
14774703203dSis * And then canonical/compatibility decomposition followed by
14784703203dSis * an optional canonical composition. Please be noted that
14794703203dSis * canonical composition is done only when a decomposition is
14804703203dSis * done.
14814703203dSis */
14824703203dSis if (canonical_decomposition || compatibility_decomposition) {
14834703203dSis if (sz == 1) {
14844703203dSis *state = U8_STATE_START;
14854703203dSis
14864703203dSis saved_sz = 1;
14874703203dSis
14884703203dSis comb_class[0] = 0;
14894703203dSis start[0] = 0;
14904703203dSis disp[0] = 1;
14914703203dSis
14924703203dSis last = 1;
14934703203dSis } else {
14944703203dSis saved_sz = do_decomp(uv, u8s, u8s, sz,
14954703203dSis canonical_decomposition, state);
14964703203dSis
14974703203dSis last = 0;
14984703203dSis
14994703203dSis for (i = 0; i < saved_sz; ) {
15004703203dSis sz = u8_number_of_bytes[u8s[i]];
15014703203dSis
15024703203dSis comb_class[last] = combining_class(uv,
15034703203dSis u8s + i, sz);
15044703203dSis start[last] = i;
15054703203dSis disp[last] = sz;
15064703203dSis
15074703203dSis last++;
15084703203dSis i += sz;
15094703203dSis }
15104703203dSis
15114703203dSis /*
15124703203dSis * Decomposition yields various Hangul related
15134703203dSis * states but not on combining marks. We need to
15144703203dSis * find out at here by checking on the last
15154703203dSis * character.
15164703203dSis */
15174703203dSis if (*state == U8_STATE_START) {
15184703203dSis if (comb_class[last - 1])
15194703203dSis *state = U8_STATE_COMBINING_MARK;
15204703203dSis }
15214703203dSis }
15224703203dSis
15234703203dSis saved_last = last;
15244703203dSis
15254703203dSis while (s < slast) {
15264703203dSis sz = u8_number_of_bytes[*s];
15274703203dSis
15284703203dSis /*
15294703203dSis * If this is an illegal character, an incomplete
15304703203dSis * character, or an 7-bit ASCII Starter character,
15314703203dSis * then we have collected a sequence; break and let
15324703203dSis * the next call deal with the two cases.
15334703203dSis *
15344703203dSis * Note that this is okay only if you are using this
15354703203dSis * function with a fixed length string, not on
15364703203dSis * a buffer with multiple calls of one chunk at a time.
15374703203dSis */
15384703203dSis if (sz <= 1) {
15394703203dSis break;
15404703203dSis } else if ((s + sz) > slast) {
15414703203dSis break;
15424703203dSis } else {
15434703203dSis /*
15444703203dSis * If the previous character was a Hangul Jamo
15454703203dSis * and this character is a Hangul Jamo that
15464703203dSis * can be conjoined, we collect the Jamo.
15474703203dSis */
15484703203dSis if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
15494703203dSis U8_PUT_3BYTES_INTO_UTF32(u1,
15504703203dSis *s, *(s + 1), *(s + 2));
15514703203dSis
15524703203dSis if (U8_HANGUL_COMPOSABLE_L_V(*state,
15534703203dSis u1)) {
15544703203dSis i = 0;
15554703203dSis *state = U8_STATE_HANGUL_LV;
15564703203dSis goto COLLECT_A_HANGUL;
15574703203dSis }
15584703203dSis
15594703203dSis if (U8_HANGUL_COMPOSABLE_LV_T(*state,
15604703203dSis u1)) {
15614703203dSis i = 0;
15624703203dSis *state = U8_STATE_HANGUL_LVT;
15634703203dSis goto COLLECT_A_HANGUL;
15644703203dSis }
15654703203dSis }
15664703203dSis
15674703203dSis /*
15684703203dSis * Regardless of whatever it was, if this is
15694703203dSis * a Starter, we don't collect the character
15704703203dSis * since that's a new start and we will deal
15714703203dSis * with it at the next time.
15724703203dSis */
15734703203dSis i = combining_class(uv, s, sz);
15744703203dSis if (i == U8_COMBINING_CLASS_STARTER)
15754703203dSis break;
15764703203dSis
15774703203dSis /*
15784703203dSis * We know the current character is a combining
15794703203dSis * mark. If the previous character wasn't
15804703203dSis * a Starter (not Hangul) or a combining mark,
15814703203dSis * then, we don't collect this combining mark.
15824703203dSis */
15834703203dSis if (*state != U8_STATE_START &&
15844703203dSis *state != U8_STATE_COMBINING_MARK)
15854703203dSis break;
15864703203dSis
15874703203dSis *state = U8_STATE_COMBINING_MARK;
15884703203dSis COLLECT_A_HANGUL:
15894703203dSis /*
15904703203dSis * If we collected a Starter and combining
15914703203dSis * marks up to 30, i.e., total 31 characters,
15924703203dSis * then, we terminate this degenerately long
15934703203dSis * combining sequence with a U+034F COMBINING
15944703203dSis * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
15954703203dSis * UTF-8 and turn this into a Stream-Safe
15964703203dSis * Text. This will be extremely rare but
15974703203dSis * possible.
15984703203dSis *
15994703203dSis * The following will also guarantee that
16004703203dSis * we are not writing more than 32 characters
16014703203dSis * plus a NULL at u8s[].
16024703203dSis */
16034703203dSis if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
16044703203dSis TURN_STREAM_SAFE:
16054703203dSis *state = U8_STATE_START;
16064703203dSis comb_class[last] = 0;
16074703203dSis start[last] = saved_sz;
16084703203dSis disp[last] = 2;
16094703203dSis last++;
16104703203dSis
16114703203dSis u8s[saved_sz++] = 0xCD;
16124703203dSis u8s[saved_sz++] = 0x8F;
16134703203dSis
16144703203dSis break;
16154703203dSis }
16164703203dSis
16174703203dSis /*
16184703203dSis * Some combining marks also do decompose into
16194703203dSis * another combining mark or marks.
16204703203dSis */
16214703203dSis if (*state == U8_STATE_COMBINING_MARK) {
16224703203dSis k = last;
16234703203dSis l = sz;
16244703203dSis i = do_decomp(uv, uts, s, sz,
16254703203dSis canonical_decomposition, state);
16264703203dSis for (j = 0; j < i; ) {
16274703203dSis sz = u8_number_of_bytes[uts[j]];
16284703203dSis
16294703203dSis comb_class[last] =
16304703203dSis combining_class(uv,
16314703203dSis uts + j, sz);
16324703203dSis start[last] = saved_sz + j;
16334703203dSis disp[last] = sz;
16344703203dSis
16354703203dSis last++;
16364703203dSis if (last >=
16374703203dSis U8_UPPER_LIMIT_IN_A_SEQ) {
16384703203dSis last = k;
16394703203dSis goto TURN_STREAM_SAFE;
16404703203dSis }
16414703203dSis j += sz;
16424703203dSis }
16434703203dSis
16444703203dSis *state = U8_STATE_COMBINING_MARK;
16454703203dSis sz = i;
16464703203dSis s += l;
16474703203dSis
16484703203dSis for (i = 0; i < sz; i++)
16494703203dSis u8s[saved_sz++] = uts[i];
16504703203dSis } else {
16514703203dSis comb_class[last] = i;
16524703203dSis start[last] = saved_sz;
16534703203dSis disp[last] = sz;
16544703203dSis last++;
16554703203dSis
16564703203dSis for (i = 0; i < sz; i++)
16574703203dSis u8s[saved_sz++] = *s++;
16584703203dSis }
16594703203dSis
16604703203dSis /*
16614703203dSis * If this is U+0345 COMBINING GREEK
16624703203dSis * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
16634703203dSis * iota subscript, and need to be converted to
16644703203dSis * uppercase letter, convert it to U+0399 GREEK
16654703203dSis * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
16664703203dSis * i.e., convert to capital adscript form as
16674703203dSis * specified in the Unicode standard.
16684703203dSis *
16694703203dSis * This is the only special case of (ambiguous)
16704703203dSis * case conversion at combining marks and
16714703203dSis * probably the standard will never have
16724703203dSis * anything similar like this in future.
16734703203dSis */
16744703203dSis if (is_it_toupper && sz >= 2 &&
16754703203dSis u8s[saved_sz - 2] == 0xCD &&
16764703203dSis u8s[saved_sz - 1] == 0x85) {
16774703203dSis u8s[saved_sz - 2] = 0xCE;
16784703203dSis u8s[saved_sz - 1] = 0x99;
16794703203dSis }
16804703203dSis }
16814703203dSis }
16824703203dSis
16834703203dSis /*
16844703203dSis * Let's try to ensure a canonical ordering for the collected
16854703203dSis * combining marks. We do this only if we have collected
16864703203dSis * at least one more non-Starter. (The decomposition mapping
16874703203dSis * data tables have fully (and recursively) expanded and
16884703203dSis * canonically ordered decompositions.)
16894703203dSis *
16904703203dSis * The U8_SWAP_COMB_MARKS() convenience macro has some
16914703203dSis * assumptions and we are meeting the assumptions.
16924703203dSis */
16934703203dSis last--;
16944703203dSis if (last >= saved_last) {
16954703203dSis for (i = 0; i < last; i++)
16964703203dSis for (j = last; j > i; j--)
16974703203dSis if (comb_class[j] &&
16984703203dSis comb_class[j - 1] > comb_class[j]) {
16994703203dSis U8_SWAP_COMB_MARKS(j - 1, j);
17004703203dSis }
17014703203dSis }
17024703203dSis
17034703203dSis *source = s;
17044703203dSis
17054703203dSis if (! canonical_composition) {
17064703203dSis u8s[saved_sz] = '\0';
17074703203dSis return (saved_sz);
17084703203dSis }
17094703203dSis
17104703203dSis /*
17114703203dSis * Now do the canonical composition. Note that we do this
17124703203dSis * only after a canonical or compatibility decomposition to
17134703203dSis * finish up NFC or NFKC.
17144703203dSis */
17154703203dSis sz = do_composition(uv, u8s, comb_class, start, disp, last,
17164703203dSis &s, slast);
17174703203dSis }
17184703203dSis
17194703203dSis *source = s;
17204703203dSis
17214703203dSis return ((size_t)sz);
17224703203dSis }
17234703203dSis
17244703203dSis /*
17254703203dSis * The do_norm_compare() function does string comparion based on Unicode
17264703203dSis * simple case mappings and Unicode Normalization definitions.
17274703203dSis *
17284703203dSis * It does so by collecting a sequence of character at a time and comparing
17294703203dSis * the collected sequences from the strings.
17304703203dSis *
17314703203dSis * The meanings on the return values are the same as the usual strcmp().
17324703203dSis */
17334703203dSis static int
do_norm_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,int flag,int * errnum)17344703203dSis do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
173585bb5f1dSis int flag, int *errnum)
17364703203dSis {
17374703203dSis int result;
17384703203dSis size_t sz1;
17394703203dSis size_t sz2;
17404703203dSis uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
17414703203dSis uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
17424703203dSis uchar_t *s1last;
17434703203dSis uchar_t *s2last;
17444703203dSis boolean_t is_it_toupper;
17454703203dSis boolean_t is_it_tolower;
17464703203dSis boolean_t canonical_decomposition;
17474703203dSis boolean_t compatibility_decomposition;
17484703203dSis boolean_t canonical_composition;
17494703203dSis u8_normalization_states_t state;
17504703203dSis
17514703203dSis s1last = s1 + n1;
17524703203dSis s2last = s2 + n2;
17534703203dSis
17544703203dSis is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
17554703203dSis is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
17564703203dSis canonical_decomposition = flag & U8_CANON_DECOMP;
17574703203dSis compatibility_decomposition = flag & U8_COMPAT_DECOMP;
17584703203dSis canonical_composition = flag & U8_CANON_COMP;
17594703203dSis
17604703203dSis while (s1 < s1last && s2 < s2last) {
17614703203dSis /*
17624703203dSis * If the current character is a 7-bit ASCII and the last
17634703203dSis * character, or, if the current character and the next
17644703203dSis * character are both some 7-bit ASCII characters then
17654703203dSis * we treat the current character as a sequence.
17664703203dSis *
17674703203dSis * In any other cases, we need to call collect_a_seq().
17684703203dSis */
17694703203dSis
17704703203dSis if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
17714703203dSis ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
17724703203dSis if (is_it_toupper)
17734703203dSis u8s1[0] = U8_ASCII_TOUPPER(*s1);
17744703203dSis else if (is_it_tolower)
17754703203dSis u8s1[0] = U8_ASCII_TOLOWER(*s1);
17764703203dSis else
17774703203dSis u8s1[0] = *s1;
17784703203dSis u8s1[1] = '\0';
17794703203dSis sz1 = 1;
17804703203dSis s1++;
17814703203dSis } else {
17824703203dSis state = U8_STATE_START;
17834703203dSis sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
17844703203dSis is_it_toupper, is_it_tolower,
17854703203dSis canonical_decomposition,
17864703203dSis compatibility_decomposition,
178785bb5f1dSis canonical_composition, errnum, &state);
17884703203dSis }
17894703203dSis
17904703203dSis if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
17914703203dSis ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
17924703203dSis if (is_it_toupper)
17934703203dSis u8s2[0] = U8_ASCII_TOUPPER(*s2);
17944703203dSis else if (is_it_tolower)
17954703203dSis u8s2[0] = U8_ASCII_TOLOWER(*s2);
17964703203dSis else
17974703203dSis u8s2[0] = *s2;
17984703203dSis u8s2[1] = '\0';
17994703203dSis sz2 = 1;
18004703203dSis s2++;
18014703203dSis } else {
18024703203dSis state = U8_STATE_START;
18034703203dSis sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
18044703203dSis is_it_toupper, is_it_tolower,
18054703203dSis canonical_decomposition,
18064703203dSis compatibility_decomposition,
180785bb5f1dSis canonical_composition, errnum, &state);
18084703203dSis }
18094703203dSis
18104703203dSis /*
18114703203dSis * Now compare the two characters. If they are the same,
18124703203dSis * we move on to the next character sequences.
18134703203dSis */
18144703203dSis if (sz1 == 1 && sz2 == 1) {
18154703203dSis if (*u8s1 > *u8s2)
18164703203dSis return (1);
18174703203dSis if (*u8s1 < *u8s2)
18184703203dSis return (-1);
18194703203dSis } else {
18204703203dSis result = strcmp((const char *)u8s1, (const char *)u8s2);
18214703203dSis if (result != 0)
18224703203dSis return (result);
18234703203dSis }
18244703203dSis }
18254703203dSis
18264703203dSis /*
18274703203dSis * We compared until the end of either or both strings.
18284703203dSis *
18294703203dSis * If we reached to or went over the ends for the both, that means
18304703203dSis * they are the same.
18314703203dSis *
18324703203dSis * If we reached only one end, that means the other string has
18334703203dSis * something which then can be used to determine the return value.
18344703203dSis */
18354703203dSis if (s1 >= s1last) {
18364703203dSis if (s2 >= s2last)
18374703203dSis return (0);
18384703203dSis return (-1);
18394703203dSis }
18404703203dSis return (1);
18414703203dSis }
18424703203dSis
18434703203dSis /*
18444703203dSis * The u8_strcmp() function compares two UTF-8 strings quite similar to
18454703203dSis * the strcmp(). For the comparison, however, Unicode Normalization specific
18464703203dSis * equivalency and Unicode simple case conversion mappings based equivalency
18474703203dSis * can be requested and checked against.
18484703203dSis */
18494703203dSis int
u8_strcmp(const char * s1,const char * s2,size_t n,int flag,size_t uv,int * errnum)18504703203dSis u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
185185bb5f1dSis int *errnum)
18524703203dSis {
18534703203dSis int f;
18544703203dSis size_t n1;
18554703203dSis size_t n2;
18564703203dSis
185785bb5f1dSis *errnum = 0;
18584703203dSis
18594703203dSis /*
18604703203dSis * Check on the requested Unicode version, case conversion, and
18614703203dSis * normalization flag values.
18624703203dSis */
18634703203dSis
18644703203dSis if (uv > U8_UNICODE_LATEST) {
186585bb5f1dSis *errnum = ERANGE;
18664703203dSis uv = U8_UNICODE_LATEST;
18674703203dSis }
18684703203dSis
18694703203dSis if (flag == 0) {
18704703203dSis flag = U8_STRCMP_CS;
18714703203dSis } else {
18724703203dSis f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
18734703203dSis U8_STRCMP_CI_LOWER);
18744703203dSis if (f == 0) {
18754703203dSis flag |= U8_STRCMP_CS;
18764703203dSis } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
18774703203dSis f != U8_STRCMP_CI_LOWER) {
187885bb5f1dSis *errnum = EBADF;
18794703203dSis flag = U8_STRCMP_CS;
18804703203dSis }
18814703203dSis
18824703203dSis f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
18834703203dSis if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
18844703203dSis f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
188585bb5f1dSis *errnum = EBADF;
18864703203dSis flag = U8_STRCMP_CS;
18874703203dSis }
18884703203dSis }
18894703203dSis
18904703203dSis if (flag == U8_STRCMP_CS) {
18914703203dSis return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
18924703203dSis }
18934703203dSis
18944703203dSis n1 = strlen(s1);
18954703203dSis n2 = strlen(s2);
18964703203dSis if (n != 0) {
18974703203dSis if (n < n1)
18984703203dSis n1 = n;
18994703203dSis if (n < n2)
19004703203dSis n2 = n;
19014703203dSis }
19024703203dSis
19034703203dSis /*
19044703203dSis * Simple case conversion can be done much faster and so we do
19054703203dSis * them separately here.
19064703203dSis */
19074703203dSis if (flag == U8_STRCMP_CI_UPPER) {
19084703203dSis return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
190985bb5f1dSis n1, n2, B_TRUE, errnum));
19104703203dSis } else if (flag == U8_STRCMP_CI_LOWER) {
19114703203dSis return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
191285bb5f1dSis n1, n2, B_FALSE, errnum));
19134703203dSis }
19144703203dSis
19154703203dSis return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
191685bb5f1dSis flag, errnum));
19174703203dSis }
19184703203dSis
19194703203dSis size_t
u8_textprep_str(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,size_t unicode_version,int * errnum)19204703203dSis u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
192185bb5f1dSis int flag, size_t unicode_version, int *errnum)
19224703203dSis {
19234703203dSis int f;
19244703203dSis int sz;
19254703203dSis uchar_t *ib;
19264703203dSis uchar_t *ibtail;
19274703203dSis uchar_t *ob;
19284703203dSis uchar_t *obtail;
19294703203dSis boolean_t do_not_ignore_null;
19304703203dSis boolean_t do_not_ignore_invalid;
19314703203dSis boolean_t is_it_toupper;
19324703203dSis boolean_t is_it_tolower;
19334703203dSis boolean_t canonical_decomposition;
19344703203dSis boolean_t compatibility_decomposition;
19354703203dSis boolean_t canonical_composition;
19364703203dSis size_t ret_val;
19374703203dSis size_t i;
19384703203dSis size_t j;
19394703203dSis uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
19404703203dSis u8_normalization_states_t state;
19414703203dSis
19424703203dSis if (unicode_version > U8_UNICODE_LATEST) {
194385bb5f1dSis *errnum = ERANGE;
19444703203dSis return ((size_t)-1);
19454703203dSis }
19464703203dSis
19474703203dSis f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
19484703203dSis if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
194985bb5f1dSis *errnum = EBADF;
19504703203dSis return ((size_t)-1);
19514703203dSis }
19524703203dSis
19534703203dSis f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
19544703203dSis if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
19554703203dSis f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
195685bb5f1dSis *errnum = EBADF;
19574703203dSis return ((size_t)-1);
19584703203dSis }
19594703203dSis
19604703203dSis if (inarray == NULL || *inlen == 0)
19614703203dSis return (0);
19624703203dSis
19634703203dSis if (outarray == NULL) {
196485bb5f1dSis *errnum = E2BIG;
19654703203dSis return ((size_t)-1);
19664703203dSis }
19674703203dSis
19684703203dSis ib = (uchar_t *)inarray;
19694703203dSis ob = (uchar_t *)outarray;
19704703203dSis ibtail = ib + *inlen;
19714703203dSis obtail = ob + *outlen;
19724703203dSis
19734703203dSis do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
19744703203dSis do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
19754703203dSis is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
19764703203dSis is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
19774703203dSis
19784703203dSis ret_val = 0;
19794703203dSis
19804703203dSis /*
19814703203dSis * If we don't have a normalization flag set, we do the simple case
19824703203dSis * conversion based text preparation separately below. Text
19834703203dSis * preparation involving Normalization will be done in the false task
19844703203dSis * block, again, separately since it will take much more time and
19854703203dSis * resource than doing simple case conversions.
19864703203dSis */
19874703203dSis if (f == 0) {
19884703203dSis while (ib < ibtail) {
19894703203dSis if (*ib == '\0' && do_not_ignore_null)
19904703203dSis break;
19914703203dSis
19924703203dSis sz = u8_number_of_bytes[*ib];
19934703203dSis
19944703203dSis if (sz < 0) {
19954703203dSis if (do_not_ignore_invalid) {
199685bb5f1dSis *errnum = EILSEQ;
19974703203dSis ret_val = (size_t)-1;
19984703203dSis break;
19994703203dSis }
20004703203dSis
20014703203dSis sz = 1;
20024703203dSis ret_val++;
20034703203dSis }
20044703203dSis
20054703203dSis if (sz == 1) {
20064703203dSis if (ob >= obtail) {
200785bb5f1dSis *errnum = E2BIG;
20084703203dSis ret_val = (size_t)-1;
20094703203dSis break;
20104703203dSis }
20114703203dSis
20124703203dSis if (is_it_toupper)
20134703203dSis *ob = U8_ASCII_TOUPPER(*ib);
20144703203dSis else if (is_it_tolower)
20154703203dSis *ob = U8_ASCII_TOLOWER(*ib);
20164703203dSis else
20174703203dSis *ob = *ib;
20184703203dSis ib++;
20194703203dSis ob++;
20204703203dSis } else if ((ib + sz) > ibtail) {
20214703203dSis if (do_not_ignore_invalid) {
202285bb5f1dSis *errnum = EINVAL;
20234703203dSis ret_val = (size_t)-1;
20244703203dSis break;
20254703203dSis }
20264703203dSis
20274703203dSis if ((obtail - ob) < (ibtail - ib)) {
202885bb5f1dSis *errnum = E2BIG;
20294703203dSis ret_val = (size_t)-1;
20304703203dSis break;
20314703203dSis }
20324703203dSis
20334703203dSis /*
20344703203dSis * We treat the remaining incomplete character
20354703203dSis * bytes as a character.
20364703203dSis */
20374703203dSis ret_val++;
20384703203dSis
20394703203dSis while (ib < ibtail)
20404703203dSis *ob++ = *ib++;
20414703203dSis } else {
20424703203dSis if (is_it_toupper || is_it_tolower) {
20434703203dSis i = do_case_conv(unicode_version, u8s,
20444703203dSis ib, sz, is_it_toupper);
20454703203dSis
20464703203dSis if ((obtail - ob) < i) {
204785bb5f1dSis *errnum = E2BIG;
20484703203dSis ret_val = (size_t)-1;
20494703203dSis break;
20504703203dSis }
20514703203dSis
20524703203dSis ib += sz;
20534703203dSis
20544703203dSis for (sz = 0; sz < i; sz++)
20554703203dSis *ob++ = u8s[sz];
20564703203dSis } else {
20574703203dSis if ((obtail - ob) < sz) {
205885bb5f1dSis *errnum = E2BIG;
20594703203dSis ret_val = (size_t)-1;
20604703203dSis break;
20614703203dSis }
20624703203dSis
20634703203dSis for (i = 0; i < sz; i++)
20644703203dSis *ob++ = *ib++;
20654703203dSis }
20664703203dSis }
20674703203dSis }
20684703203dSis } else {
20694703203dSis canonical_decomposition = flag & U8_CANON_DECOMP;
20704703203dSis compatibility_decomposition = flag & U8_COMPAT_DECOMP;
20714703203dSis canonical_composition = flag & U8_CANON_COMP;
20724703203dSis
20734703203dSis while (ib < ibtail) {
20744703203dSis if (*ib == '\0' && do_not_ignore_null)
20754703203dSis break;
20764703203dSis
20774703203dSis /*
20784703203dSis * If the current character is a 7-bit ASCII
20794703203dSis * character and it is the last character, or,
20804703203dSis * if the current character is a 7-bit ASCII
20814703203dSis * character and the next character is also a 7-bit
20824703203dSis * ASCII character, then, we copy over this
20834703203dSis * character without going through collect_a_seq().
20844703203dSis *
20854703203dSis * In any other cases, we need to look further with
20864703203dSis * the collect_a_seq() function.
20874703203dSis */
20884703203dSis if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
20894703203dSis ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
20904703203dSis if (ob >= obtail) {
209185bb5f1dSis *errnum = E2BIG;
20924703203dSis ret_val = (size_t)-1;
20934703203dSis break;
20944703203dSis }
20954703203dSis
20964703203dSis if (is_it_toupper)
20974703203dSis *ob = U8_ASCII_TOUPPER(*ib);
20984703203dSis else if (is_it_tolower)
20994703203dSis *ob = U8_ASCII_TOLOWER(*ib);
21004703203dSis else
21014703203dSis *ob = *ib;
21024703203dSis ib++;
21034703203dSis ob++;
21044703203dSis } else {
210585bb5f1dSis *errnum = 0;
21064703203dSis state = U8_STATE_START;
21074703203dSis
21084703203dSis j = collect_a_seq(unicode_version, u8s,
21094703203dSis &ib, ibtail,
21104703203dSis is_it_toupper,
21114703203dSis is_it_tolower,
21124703203dSis canonical_decomposition,
21134703203dSis compatibility_decomposition,
21144703203dSis canonical_composition,
211585bb5f1dSis errnum, &state);
21164703203dSis
211785bb5f1dSis if (*errnum && do_not_ignore_invalid) {
21184703203dSis ret_val = (size_t)-1;
21194703203dSis break;
21204703203dSis }
21214703203dSis
21224703203dSis if ((obtail - ob) < j) {
212385bb5f1dSis *errnum = E2BIG;
21244703203dSis ret_val = (size_t)-1;
21254703203dSis break;
21264703203dSis }
21274703203dSis
21284703203dSis for (i = 0; i < j; i++)
21294703203dSis *ob++ = u8s[i];
21304703203dSis }
21314703203dSis }
21324703203dSis }
21334703203dSis
21344703203dSis *inlen = ibtail - ib;
21354703203dSis *outlen = obtail - ob;
21364703203dSis
21374703203dSis return (ret_val);
21384703203dSis }
2139