15049Sis /*
25049Sis * CDDL HEADER START
35049Sis *
45049Sis * The contents of this file are subject to the terms of the
55049Sis * Common Development and Distribution License (the "License").
65049Sis * You may not use this file except in compliance with the License.
75049Sis *
85049Sis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
95049Sis * or http://www.opensolaris.org/os/licensing.
105049Sis * See the License for the specific language governing permissions
115049Sis * and limitations under the License.
125049Sis *
135049Sis * When distributing Covered Code, include this CDDL HEADER in each
145049Sis * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
155049Sis * If applicable, add the following below this CDDL HEADER, with the
165049Sis * fields enclosed by brackets "[]" replaced with your own identifying
175049Sis * information: Portions Copyright [yyyy] [name of copyright owner]
185049Sis *
195049Sis * CDDL HEADER END
205049Sis */
215049Sis /*
22*7012Sis * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
235049Sis * Use is subject to license terms.
245049Sis */
255049Sis
265049Sis #pragma ident "%Z%%M% %I% %E% SMI"
275049Sis
285049Sis
295049Sis /*
305049Sis * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
315049Sis *
325049Sis * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
335049Sis * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
345049Sis * the section 3C man pages.
355049Sis * Interface stability: Committed.
365049Sis */
375049Sis
385049Sis #include <sys/types.h>
395049Sis #ifdef _KERNEL
405049Sis #include <sys/param.h>
415049Sis #include <sys/sysmacros.h>
425049Sis #include <sys/systm.h>
435049Sis #include <sys/debug.h>
445049Sis #include <sys/kmem.h>
455049Sis #include <sys/ddi.h>
465049Sis #include <sys/sunddi.h>
475049Sis #else
485049Sis #include <sys/u8_textprep.h>
495049Sis #include <strings.h>
505049Sis #endif /* _KERNEL */
515049Sis #include <sys/byteorder.h>
525049Sis #include <sys/errno.h>
535049Sis #include <sys/u8_textprep_data.h>
545049Sis
555049Sis
/* Longest possible UTF-8 encoding of a single character, in bytes. */
#define	U8_MB_CUR_MAX			(4)

/*
 * Longest UTF-8 encoding, in bytes, of any character in the range
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* Upper bound, in bytes, on the length of a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * The maximum number of characters in a combining/conjoining sequence,
 * and the actual upper bound limit applied to such a sequence.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/* The combining class value that denotes a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)
775049Sis
/*
 * Hangul related macros.
 *
 * The first and the last of the Hangul syllables, Hangul Jamo Leading
 * consonants, Vowels, and optional Trailing consonants, expressed as
 * Unicode scalar values.
 *
 * Please note that U8_HANGUL_JAMO_T_FIRST below is 0x11A7, not the actual
 * first trailing consonant U+11A8. The trailing consonant is optional, so
 * the value is pre-decremented by one; a trailing index of zero then means
 * "no trailing consonant". This is also why U8_HANGUL_JAMO_T() uses
 * a strict '>' for its lower bound.
 *
 * Each of the 19 modern leading consonants has a total of 588 possible
 * syllables since Hangul has 21 modern vowels and 27 modern trailing
 * consonants plus 1 for the no trailing consonant case, i.e.,
 * 21 x 28 = 588.
 *
 * Please also bear in mind that U8_HANGUL_JAMO_1ST_BYTE can be used as
 * a quick check on whether a character may be a Hangul Jamo, but a match
 * does not guarantee that it is one; it only means that it most likely is.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the three-byte UTF-8 encoding of the scalar value 'b' into
 * s[i], s[j], and s[k]. Only the low 16 bits of 'b' are encoded, so 'b'
 * must be a character that takes exactly three bytes in UTF-8.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as
 * a single statement; without it, only the first store would be guarded
 * when the macro is used as the unbraced body of an if, else, or loop.
 * The caller supplies the terminating semicolon.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	do { \
		(s)[(i)] = (uchar_t)(0xE0U | \
		    (((uint32_t)(b) & 0xF000U) >> 12)); \
		(s)[(j)] = (uchar_t)(0x80U | \
		    (((uint32_t)(b) & 0x0FC0U) >> 6)); \
		(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); \
	} while (0)

/* Range checks on a Unicode scalar value 'u'. */
#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

/*
 * True when the normalization state 's' and the next scalar value 'u'
 * form a composable L + V or LV + T pair, respectively.
 */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
1385049Sis
/* The types of decomposition mappings. */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/* The indicator for 16-bit table. */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)

/* The following are some convenience macros. */

/*
 * Assemble the scalar value of a three-byte UTF-8 character from its
 * bytes b1, b2, and b3 and assign it to 'u'.
 *
 * This is a single parenthesized assignment expression with no trailing
 * semicolon of its own, so it is safe as the unbraced body of an if/else;
 * the caller supplies the semicolon.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	((u) = (((uint32_t)(b1) & 0x0F) << 12) | \
	    (((uint32_t)(b2) & 0x3F) << 6) | \
	    ((uint32_t)(b3) & 0x3F))

/*
 * Exchange 'a' and 'b' using 't' as scratch storage.
 *
 * Wrapped in do { } while (0) so that all three assignments stay together
 * when the macro is used as the unbraced body of an if, else, or loop.
 * The caller supplies the terminating semicolon.
 */
#define	U8_SIMPLE_SWAP(a, b, t) \
	do { \
		(t) = (a); \
		(a) = (b); \
		(b) = (t); \
	} while (0)

/*
 * ASCII-only case conversions. Note that the argument is evaluated more
 * than once, so it must not have side effects.
 */
#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

/* True when 'c', taken as an unsigned byte, is a 7-bit ASCII character. */
#define	U8_ISASCII(c)	(((uchar_t)(c)) < 0x80U)
/*
 * Swap two combining marks, 'a' and 'b', in place within the byte
 * buffer u8s.
 *
 * The macro assumes that the two characters to be swapped are adjacent to
 * each other and that 'a' comes before 'b'. If the assumptions are not
 * met, the macro will fail.
 *
 * It operates on names that must exist in the caller's scope: u8s (the
 * byte buffer), u8t (scratch bytes), start[] and disp[] (byte offset and
 * byte length of each character), comb_class[], the loop index k, and the
 * scratch variable tc.
 *
 * Wrapped in do { } while (0) so that the whole sequence acts as a single
 * statement; without it, only the first loop would be guarded when the
 * macro is used as the unbraced body of an if, else, or loop. The caller
 * supplies the terminating semicolon.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	do { \
		/* Save the bytes of 'a'. */ \
		for (k = 0; k < disp[(a)]; k++) \
			u8t[k] = u8s[start[(a)] + k]; \
		/* Move 'b' down to where 'a' started. */ \
		for (k = 0; k < disp[(b)]; k++) \
			u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
		/* What was 'a' now begins right after the moved 'b'. */ \
		start[(b)] = start[(a)] + disp[(b)]; \
		for (k = 0; k < disp[(a)]; k++) \
			u8s[start[(b)] + k] = u8t[k]; \
		U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
		U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); \
	} while (0)
/*
 * The possible states during normalization. The Hangul states record
 * which Jamo (Leading consonant, Vowel, Trailing consonant) have been
 * seen so far for the character sequence being processed.
 */
typedef enum {
	U8_STATE_START = 0,		/* nothing special seen */
	U8_STATE_HANGUL_L,		/* 1: leading consonant */
	U8_STATE_HANGUL_LV,		/* 2: leading consonant + vowel */
	U8_STATE_HANGUL_LVT,		/* 3: L + V + trailing consonant */
	U8_STATE_HANGUL_V,		/* 4: vowel not preceded by L */
	U8_STATE_HANGUL_T,		/* 5: trailing not preceded by LV */
	U8_STATE_COMBINING_MARK		/* 6: combining mark */
} u8_normalization_states_t;
1905049Sis
1915049Sis /*
1925049Sis * The three vectors at below are used to check bytes of a given UTF-8
1935049Sis * character are valid and not containing any malformed byte values.
1945049Sis *
1955049Sis * We used to have a quite relaxed UTF-8 binary representation but then there
1965049Sis * was some security related issues and so the Unicode Consortium defined
1975049Sis * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
1985049Sis * one more time at the Unicode 3.2. The following three tables are based on
1995049Sis * that.
2005049Sis */
2015049Sis
2025049Sis #define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF)
2035049Sis
2045049Sis #define I_ U8_ILLEGAL_CHAR
2055049Sis #define O_ U8_OUT_OF_RANGE_CHAR
2065049Sis
2075049Sis const int8_t u8_number_of_bytes[0x100] = {
2085049Sis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2095049Sis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2105049Sis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2115049Sis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2125049Sis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2135049Sis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2145049Sis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2155049Sis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2165049Sis
2175049Sis /* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */
2185049Sis I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2195049Sis
2205049Sis /* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */
2215049Sis I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2225049Sis
2235049Sis /* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */
2245049Sis I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2255049Sis
2265049Sis /* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */
2275049Sis I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2285049Sis
2295049Sis /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
2305049Sis I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2315049Sis
2325049Sis /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
2335049Sis 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2345049Sis
2355049Sis /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
2365049Sis 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2375049Sis
2385049Sis /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
2395049Sis 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
2405049Sis };
2415049Sis
2425049Sis #undef I_
2435049Sis #undef O_
2445049Sis
/*
 * Indexed by the first byte of a multi-byte character: the smallest
 * value allowed for the second byte. Entries for bytes that do not start
 * a multi-byte character are zero.
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
	/* 00 - BF: not the first byte of a multi-byte character. */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 00 - 0F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 10 - 1F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 20 - 2F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 30 - 3F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 40 - 4F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 50 - 5F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 60 - 6F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 70 - 7F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 80 - 8F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 90 - 9F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* A0 - AF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* B0 - BF */

	/* C0 - DF: two-byte characters (C0 and C1 are never valid). */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* C0 - C7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* C8 - CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* D0 - D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* D8 - DF */

	/* E0 - EF: three-byte characters; E0 has a raised minimum. */
	0xA0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* E0 - E7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* E8 - EF */

	/* F0 - F4: four-byte characters; F0 has a raised minimum. */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,	/* F0 - F7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* F8 - FF */
};
2865049Sis
/*
 * Indexed by the first byte of a multi-byte character: the largest value
 * allowed for the second byte. Entries for bytes that do not start
 * a multi-byte character are zero.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
	/* 00 - BF: not the first byte of a multi-byte character. */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 00 - 0F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 10 - 1F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 20 - 2F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 30 - 3F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 40 - 4F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 50 - 5F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 60 - 6F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 70 - 7F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 80 - 8F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 90 - 9F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* A0 - AF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* B0 - BF */

	/* C0 - DF: two-byte characters (C0 and C1 are never valid). */
	0,    0,    0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* C0 - C7 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* C8 - CF */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* D0 - D7 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* D8 - DF */

	/* E0 - EF: three-byte characters; ED has a lowered maximum. */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* E0 - E7 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0xBF,	/* E8 - EF */

	/* F0 - F4: four-byte characters; F4 has a lowered maximum. */
	0xBF, 0xBF, 0xBF, 0xBF, 0x8F, 0,    0,    0,	/* F0 - F7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* F8 - FF */
};
3285049Sis
3295049Sis
3305049Sis /*
3315049Sis * The u8_validate() validates on the given UTF-8 character string and
3325049Sis * calculate the byte length. It is quite similar to mblen(3C) except that
3335049Sis * this will validate against the list of characters if required and
3345049Sis * specific to UTF-8 and Unicode.
3355049Sis */
3365049Sis int
u8_validate(char * u8str,size_t n,char ** list,int flag,int * errnum)337*7012Sis u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
3385049Sis {
3395049Sis uchar_t *ib;
3405049Sis uchar_t *ibtail;
3415049Sis uchar_t **p;
3425049Sis uchar_t *s1;
3435049Sis uchar_t *s2;
3445049Sis uchar_t f;
3455049Sis int sz;
3465049Sis size_t i;
3475049Sis int ret_val;
3485049Sis boolean_t second;
3495049Sis boolean_t no_need_to_validate_entire;
3505049Sis boolean_t check_additional;
3515049Sis boolean_t validate_ucs2_range_only;
3525049Sis
3535049Sis if (! u8str)
3545049Sis return (0);
3555049Sis
3565049Sis ib = (uchar_t *)u8str;
3575049Sis ibtail = ib + n;
3585049Sis
3595049Sis ret_val = 0;
3605049Sis
3615049Sis no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
3625049Sis check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
3635049Sis validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
3645049Sis
3655049Sis while (ib < ibtail) {
3665049Sis /*
3675049Sis * The first byte of a UTF-8 character tells how many
3685049Sis * bytes will follow for the character. If the first byte
3695049Sis * is an illegal byte value or out of range value, we just
3705049Sis * return -1 with an appropriate error number.
3715049Sis */
3725049Sis sz = u8_number_of_bytes[*ib];
3735049Sis if (sz == U8_ILLEGAL_CHAR) {
374*7012Sis *errnum = EILSEQ;
3755049Sis return (-1);
3765049Sis }
3775049Sis
3785049Sis if (sz == U8_OUT_OF_RANGE_CHAR ||
3795049Sis (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
380*7012Sis *errnum = ERANGE;
3815049Sis return (-1);
3825049Sis }
3835049Sis
3845049Sis /*
3855049Sis * If we don't have enough bytes to check on, that's also
3865049Sis * an error. As you can see, we give illegal byte sequence
3875049Sis * checking higher priority then EINVAL cases.
3885049Sis */
3895049Sis if ((ibtail - ib) < sz) {
390*7012Sis *errnum = EINVAL;
3915049Sis return (-1);
3925049Sis }
3935049Sis
3945049Sis if (sz == 1) {
3955049Sis ib++;
3965049Sis ret_val++;
3975049Sis } else {
3985049Sis /*
3995049Sis * Check on the multi-byte UTF-8 character. For more
4005049Sis * details on this, see comment added for the used
4015049Sis * data structures at the beginning of the file.
4025049Sis */
4035049Sis f = *ib++;
4045049Sis ret_val++;
4055049Sis second = B_TRUE;
4065049Sis for (i = 1; i < sz; i++) {
4075049Sis if (second) {
4085049Sis if (*ib < u8_valid_min_2nd_byte[f] ||
4095049Sis *ib > u8_valid_max_2nd_byte[f]) {
410*7012Sis *errnum = EILSEQ;
4115049Sis return (-1);
4125049Sis }
4135049Sis second = B_FALSE;
4145049Sis } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
415*7012Sis *errnum = EILSEQ;
4165049Sis return (-1);
4175049Sis }
4185049Sis ib++;
4195049Sis ret_val++;
4205049Sis }
4215049Sis }
4225049Sis
4235049Sis if (check_additional) {
4245049Sis for (p = (uchar_t **)list, i = 0; p[i]; i++) {
4255049Sis s1 = ib - sz;
4265049Sis s2 = p[i];
4275049Sis while (s1 < ib) {
4285049Sis if (*s1 != *s2 || *s2 == '\0')
4295049Sis break;
4305049Sis s1++;
4315049Sis s2++;
4325049Sis }
4335049Sis
4345049Sis if (s1 >= ib && *s2 == '\0') {
435*7012Sis *errnum = EBADF;
4365049Sis return (-1);
4375049Sis }
4385049Sis }
4395049Sis }
4405049Sis
4415049Sis if (no_need_to_validate_entire)
4425049Sis break;
4435049Sis }
4445049Sis
4455049Sis return (ret_val);
4465049Sis }
4475049Sis
4485049Sis /*
4495049Sis * The do_case_conv() looks at the mapping tables and returns found
4505049Sis * bytes if any. If not found, the input bytes are returned. The function
4515049Sis * always terminate the return bytes with a null character assuming that
4525049Sis * there are plenty of room to do so.
4535049Sis *
4545049Sis * The case conversions are simple case conversions mapping a character to
4555049Sis * another character as specified in the Unicode data. The byte size of
4565049Sis * the mapped character could be different from that of the input character.
4575049Sis *
4585049Sis * The return value is the byte length of the returned character excluding
4595049Sis * the terminating null byte.
4605049Sis */
4615049Sis static size_t
do_case_conv(int uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t is_it_toupper)4625049Sis do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
4635049Sis {
4645049Sis size_t i;
4655049Sis uint16_t b1 = 0;
4665049Sis uint16_t b2 = 0;
4675049Sis uint16_t b3 = 0;
4685049Sis uint16_t b3_tbl;
4695049Sis uint16_t b3_base;
4705049Sis uint16_t b4 = 0;
4715049Sis size_t start_id;
4725049Sis size_t end_id;
4735049Sis
4745049Sis /*
4755049Sis * At this point, the only possible values for sz are 2, 3, and 4.
4765049Sis * The u8s should point to a vector that is well beyond the size of
4775049Sis * 5 bytes.
4785049Sis */
4795049Sis if (sz == 2) {
4805049Sis b3 = u8s[0] = s[0];
4815049Sis b4 = u8s[1] = s[1];
4825049Sis } else if (sz == 3) {
4835049Sis b2 = u8s[0] = s[0];
4845049Sis b3 = u8s[1] = s[1];
4855049Sis b4 = u8s[2] = s[2];
4865049Sis } else if (sz == 4) {
4875049Sis b1 = u8s[0] = s[0];
4885049Sis b2 = u8s[1] = s[1];
4895049Sis b3 = u8s[2] = s[2];
4905049Sis b4 = u8s[3] = s[3];
4915049Sis } else {
4925049Sis /* This is not possible but just in case as a fallback. */
4935049Sis if (is_it_toupper)
4945049Sis *u8s = U8_ASCII_TOUPPER(*s);
4955049Sis else
4965049Sis *u8s = U8_ASCII_TOLOWER(*s);
4975049Sis u8s[1] = '\0';
4985049Sis
4995049Sis return (1);
5005049Sis }
5015049Sis u8s[sz] = '\0';
5025049Sis
5035049Sis /*
5045049Sis * Let's find out if we have a corresponding character.
5055049Sis */
5065049Sis b1 = u8_common_b1_tbl[uv][b1];
5075049Sis if (b1 == U8_TBL_ELEMENT_NOT_DEF)
5085049Sis return ((size_t)sz);
5095049Sis
5105049Sis b2 = u8_case_common_b2_tbl[uv][b1][b2];
5115049Sis if (b2 == U8_TBL_ELEMENT_NOT_DEF)
5125049Sis return ((size_t)sz);
5135049Sis
5145049Sis if (is_it_toupper) {
5155049Sis b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
5165049Sis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5175049Sis return ((size_t)sz);
5185049Sis
5195049Sis start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
5205049Sis end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
5215049Sis
5225049Sis /* Either there is no match or an error at the table. */
5235049Sis if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5245049Sis return ((size_t)sz);
5255049Sis
5265049Sis b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
5275049Sis
5285049Sis for (i = 0; start_id < end_id; start_id++)
5295049Sis u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
5305049Sis } else {
5315049Sis b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
5325049Sis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5335049Sis return ((size_t)sz);
5345049Sis
5355049Sis start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
5365049Sis end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
5375049Sis
5385049Sis if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5395049Sis return ((size_t)sz);
5405049Sis
5415049Sis b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
5425049Sis
5435049Sis for (i = 0; start_id < end_id; start_id++)
5445049Sis u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
5455049Sis }
5465049Sis
5475049Sis /*
5485049Sis * If i is still zero, that means there is no corresponding character.
5495049Sis */
5505049Sis if (i == 0)
5515049Sis return ((size_t)sz);
5525049Sis
5535049Sis u8s[i] = '\0';
5545049Sis
5555049Sis return (i);
5565049Sis }
5575049Sis
5585049Sis /*
5595049Sis * The do_case_compare() function compares the two input strings, s1 and s2,
5605049Sis * one character at a time doing case conversions if applicable and return
5615049Sis * the comparison result as like strcmp().
5625049Sis *
5635049Sis * Since, in empirical sense, most of text data are 7-bit ASCII characters,
5645049Sis * we treat the 7-bit ASCII characters as a special case trying to yield
5655049Sis * faster processing time.
5665049Sis */
5675049Sis static int
do_case_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,boolean_t is_it_toupper,int * errnum)5685049Sis do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
569*7012Sis size_t n2, boolean_t is_it_toupper, int *errnum)
5705049Sis {
5715049Sis int f;
5725049Sis int sz1;
5735049Sis int sz2;
5745049Sis size_t j;
5755049Sis size_t i1;
5765049Sis size_t i2;
5775049Sis uchar_t u8s1[U8_MB_CUR_MAX + 1];
5785049Sis uchar_t u8s2[U8_MB_CUR_MAX + 1];
5795049Sis
5805049Sis i1 = i2 = 0;
5815049Sis while (i1 < n1 && i2 < n2) {
5825049Sis /*
5835049Sis * Find out what would be the byte length for this UTF-8
5845049Sis * character at string s1 and also find out if this is
5855049Sis * an illegal start byte or not and if so, issue a proper
586*7012Sis * error number and yet treat this byte as a character.
5875049Sis */
5885049Sis sz1 = u8_number_of_bytes[*s1];
5895049Sis if (sz1 < 0) {
590*7012Sis *errnum = EILSEQ;
5915049Sis sz1 = 1;
5925049Sis }
5935049Sis
5945049Sis /*
5955049Sis * For 7-bit ASCII characters mainly, we do a quick case
5965049Sis * conversion right at here.
5975049Sis *
5985049Sis * If we don't have enough bytes for this character, issue
5995049Sis * an EINVAL error and use what are available.
6005049Sis *
6015049Sis * If we have enough bytes, find out if there is
6025049Sis * a corresponding uppercase character and if so, copy over
6035049Sis * the bytes for a comparison later. If there is no
6045049Sis * corresponding uppercase character, then, use what we have
6055049Sis * for the comparison.
6065049Sis */
6075049Sis if (sz1 == 1) {
6085049Sis if (is_it_toupper)
6095049Sis u8s1[0] = U8_ASCII_TOUPPER(*s1);
6105049Sis else
6115049Sis u8s1[0] = U8_ASCII_TOLOWER(*s1);
6125049Sis s1++;
6135049Sis u8s1[1] = '\0';
6145049Sis } else if ((i1 + sz1) > n1) {
615*7012Sis *errnum = EINVAL;
6165049Sis for (j = 0; (i1 + j) < n1; )
6175049Sis u8s1[j++] = *s1++;
6185049Sis u8s1[j] = '\0';
6195049Sis } else {
6205049Sis (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
6215049Sis s1 += sz1;
6225049Sis }
6235049Sis
6245049Sis /* Do the same for the string s2. */
6255049Sis sz2 = u8_number_of_bytes[*s2];
6265049Sis if (sz2 < 0) {
627*7012Sis *errnum = EILSEQ;
6285049Sis sz2 = 1;
6295049Sis }
6305049Sis
6315049Sis if (sz2 == 1) {
6325049Sis if (is_it_toupper)
6335049Sis u8s2[0] = U8_ASCII_TOUPPER(*s2);
6345049Sis else
6355049Sis u8s2[0] = U8_ASCII_TOLOWER(*s2);
6365049Sis s2++;
6375049Sis u8s2[1] = '\0';
6385049Sis } else if ((i2 + sz2) > n2) {
639*7012Sis *errnum = EINVAL;
6405049Sis for (j = 0; (i2 + j) < n2; )
6415049Sis u8s2[j++] = *s2++;
6425049Sis u8s2[j] = '\0';
6435049Sis } else {
6445049Sis (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
6455049Sis s2 += sz2;
6465049Sis }
6475049Sis
6485049Sis /* Now compare the two characters. */
6495049Sis if (sz1 == 1 && sz2 == 1) {
6505049Sis if (*u8s1 > *u8s2)
6515049Sis return (1);
6525049Sis if (*u8s1 < *u8s2)
6535049Sis return (-1);
6545049Sis } else {
6555049Sis f = strcmp((const char *)u8s1, (const char *)u8s2);
6565049Sis if (f != 0)
6575049Sis return (f);
6585049Sis }
6595049Sis
6605049Sis /*
6615049Sis * They were the same. Let's move on to the next
6625049Sis * characters then.
6635049Sis */
6645049Sis i1 += sz1;
6655049Sis i2 += sz2;
6665049Sis }
6675049Sis
6685049Sis /*
6695049Sis * We compared until the end of either or both strings.
6705049Sis *
6715049Sis * If we reached to or went over the ends for the both, that means
6725049Sis * they are the same.
6735049Sis *
6745049Sis * If we reached only one of the two ends, that means the other string
6755049Sis * has something which then the fact can be used to determine
6765049Sis * the return value.
6775049Sis */
6785049Sis if (i1 >= n1) {
6795049Sis if (i2 >= n2)
6805049Sis return (0);
6815049Sis return (-1);
6825049Sis }
6835049Sis return (1);
6845049Sis }
6855049Sis
6865049Sis /*
6875049Sis * The combining_class() function checks on the given bytes and find out
6885049Sis * the corresponding Unicode combining class value. The return value 0 means
6895049Sis * it is a Starter. Any illegal UTF-8 character will also be treated as
6905049Sis * a Starter.
6915049Sis */
6925049Sis static uchar_t
combining_class(size_t uv,uchar_t * s,size_t sz)6935049Sis combining_class(size_t uv, uchar_t *s, size_t sz)
6945049Sis {
6955049Sis uint16_t b1 = 0;
6965049Sis uint16_t b2 = 0;
6975049Sis uint16_t b3 = 0;
6985049Sis uint16_t b4 = 0;
6995049Sis
7005049Sis if (sz == 1 || sz > 4)
7015049Sis return (0);
7025049Sis
7035049Sis if (sz == 2) {
7045049Sis b3 = s[0];
7055049Sis b4 = s[1];
7065049Sis } else if (sz == 3) {
7075049Sis b2 = s[0];
7085049Sis b3 = s[1];
7095049Sis b4 = s[2];
7105049Sis } else if (sz == 4) {
7115049Sis b1 = s[0];
7125049Sis b2 = s[1];
7135049Sis b3 = s[2];
7145049Sis b4 = s[3];
7155049Sis }
7165049Sis
7175049Sis b1 = u8_common_b1_tbl[uv][b1];
7185049Sis if (b1 == U8_TBL_ELEMENT_NOT_DEF)
7195049Sis return (0);
7205049Sis
7215049Sis b2 = u8_combining_class_b2_tbl[uv][b1][b2];
7225049Sis if (b2 == U8_TBL_ELEMENT_NOT_DEF)
7235049Sis return (0);
7245049Sis
7255049Sis b3 = u8_combining_class_b3_tbl[uv][b2][b3];
7265049Sis if (b3 == U8_TBL_ELEMENT_NOT_DEF)
7275049Sis return (0);
7285049Sis
7295049Sis return (u8_combining_class_b4_tbl[uv][b3][b4]);
7305049Sis }
7315049Sis
7325049Sis /*
7335049Sis * The do_decomp() function finds out a matching decomposition if any
7345049Sis * and return. If there is no match, the input bytes are copied and returned.
7355049Sis * The function also checks if there is a Hangul, decomposes it if necessary
7365049Sis * and returns.
7375049Sis *
7385049Sis * To save time, a single byte 7-bit ASCII character should be handled by
7395049Sis * the caller.
7405049Sis *
7415049Sis * The function returns the number of bytes returned sans always terminating
7425049Sis * the null byte. It will also return a state that will tell if there was
7435049Sis * a Hangul character decomposed which then will be used by the caller.
7445049Sis */
7455049Sis static size_t
do_decomp(size_t uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t canonical_decomposition,u8_normalization_states_t * state)7465049Sis do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
7475049Sis boolean_t canonical_decomposition, u8_normalization_states_t *state)
7485049Sis {
7495049Sis uint16_t b1 = 0;
7505049Sis uint16_t b2 = 0;
7515049Sis uint16_t b3 = 0;
7525049Sis uint16_t b3_tbl;
7535049Sis uint16_t b3_base;
7545049Sis uint16_t b4 = 0;
7555049Sis size_t start_id;
7565049Sis size_t end_id;
7575049Sis size_t i;
7585049Sis uint32_t u1;
7595049Sis
7605049Sis if (sz == 2) {
7615049Sis b3 = u8s[0] = s[0];
7625049Sis b4 = u8s[1] = s[1];
7635049Sis u8s[2] = '\0';
7645049Sis } else if (sz == 3) {
7655049Sis /* Convert it to a Unicode scalar value. */
7665049Sis U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
7675049Sis
7685049Sis /*
7695049Sis * If this is a Hangul syllable, we decompose it into
7705049Sis * a leading consonant, a vowel, and an optional trailing
7715049Sis * consonant and then return.
7725049Sis */
7735049Sis if (U8_HANGUL_SYLLABLE(u1)) {
7745049Sis u1 -= U8_HANGUL_SYL_FIRST;
7755049Sis
7765049Sis b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
7775049Sis b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
7785049Sis / U8_HANGUL_T_COUNT;
7795049Sis b3 = u1 % U8_HANGUL_T_COUNT;
7805049Sis
7815049Sis U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
7825049Sis U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
7835049Sis if (b3) {
7845049Sis b3 += U8_HANGUL_JAMO_T_FIRST;
7855049Sis U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
7865049Sis
7875049Sis u8s[9] = '\0';
7885049Sis *state = U8_STATE_HANGUL_LVT;
7895049Sis return (9);
7905049Sis }
7915049Sis
7925049Sis u8s[6] = '\0';
7935049Sis *state = U8_STATE_HANGUL_LV;
7945049Sis return (6);
7955049Sis }
7965049Sis
7975049Sis b2 = u8s[0] = s[0];
7985049Sis b3 = u8s[1] = s[1];
7995049Sis b4 = u8s[2] = s[2];
8005049Sis u8s[3] = '\0';
8015049Sis
8025049Sis /*
8035049Sis * If this is a Hangul Jamo, we know there is nothing
8045049Sis * further that we can decompose.
8055049Sis */
8065049Sis if (U8_HANGUL_JAMO_L(u1)) {
8075049Sis *state = U8_STATE_HANGUL_L;
8085049Sis return (3);
8095049Sis }
8105049Sis
8115049Sis if (U8_HANGUL_JAMO_V(u1)) {
8125049Sis if (*state == U8_STATE_HANGUL_L)
8135049Sis *state = U8_STATE_HANGUL_LV;
8145049Sis else
8155049Sis *state = U8_STATE_HANGUL_V;
8165049Sis return (3);
8175049Sis }
8185049Sis
8195049Sis if (U8_HANGUL_JAMO_T(u1)) {
8205049Sis if (*state == U8_STATE_HANGUL_LV)
8215049Sis *state = U8_STATE_HANGUL_LVT;
8225049Sis else
8235049Sis *state = U8_STATE_HANGUL_T;
8245049Sis return (3);
8255049Sis }
8265049Sis } else if (sz == 4) {
8275049Sis b1 = u8s[0] = s[0];
8285049Sis b2 = u8s[1] = s[1];
8295049Sis b3 = u8s[2] = s[2];
8305049Sis b4 = u8s[3] = s[3];
8315049Sis u8s[4] = '\0';
8325049Sis } else {
8335049Sis /*
8345049Sis * This is a fallback and should not happen if the function
8355049Sis * was called properly.
8365049Sis */
8375049Sis u8s[0] = s[0];
8385049Sis u8s[1] = '\0';
8395049Sis *state = U8_STATE_START;
8405049Sis return (1);
8415049Sis }
8425049Sis
8435049Sis /*
8445049Sis * At this point, this rountine does not know what it would get.
8455049Sis * The caller should sort it out if the state isn't a Hangul one.
8465049Sis */
8475049Sis *state = U8_STATE_START;
8485049Sis
8495049Sis /* Try to find matching decomposition mapping byte sequence. */
8505049Sis b1 = u8_common_b1_tbl[uv][b1];
8515049Sis if (b1 == U8_TBL_ELEMENT_NOT_DEF)
8525049Sis return ((size_t)sz);
8535049Sis
8545049Sis b2 = u8_decomp_b2_tbl[uv][b1][b2];
8555049Sis if (b2 == U8_TBL_ELEMENT_NOT_DEF)
8565049Sis return ((size_t)sz);
8575049Sis
8585049Sis b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
8595049Sis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
8605049Sis return ((size_t)sz);
8615049Sis
8625049Sis /*
8635049Sis * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
8645049Sis * which is 0x8000, this means we couldn't fit the mappings into
8655049Sis * the cardinality of a unsigned byte.
8665049Sis */
8675049Sis if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
8685049Sis b3_tbl -= U8_16BIT_TABLE_INDICATOR;
8695049Sis start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
8705049Sis end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
8715049Sis } else {
8725049Sis start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
8735049Sis end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
8745049Sis }
8755049Sis
8765049Sis /* This also means there wasn't any matching decomposition. */
8775049Sis if (start_id >= end_id)
8785049Sis return ((size_t)sz);
8795049Sis
8805049Sis /*
8815049Sis * The final table for decomposition mappings has three types of
8825049Sis * byte sequences depending on whether a mapping is for compatibility
8835049Sis * decomposition, canonical decomposition, or both like the following:
8845049Sis *
8855049Sis * (1) Compatibility decomposition mappings:
8865049Sis *
8875049Sis * +---+---+-...-+---+
8885049Sis * | B0| B1| ... | Bm|
8895049Sis * +---+---+-...-+---+
8905049Sis *
	 *	The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
8925049Sis *
8935049Sis * (2) Canonical decomposition mappings:
8945049Sis *
8955049Sis * +---+---+---+-...-+---+
8965049Sis * | T | b0| b1| ... | bn|
8975049Sis * +---+---+---+-...-+---+
8985049Sis *
8995049Sis * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
9005049Sis *
9015049Sis * (3) Both mappings:
9025049Sis *
9035049Sis * +---+---+---+---+-...-+---+---+---+-...-+---+
9045049Sis * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
9055049Sis * +---+---+---+---+-...-+---+---+---+-...-+---+
9065049Sis *
9075049Sis * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
9085049Sis * byte, b0 to bn are canonical mapping bytes and B0 to Bm are
9095049Sis * compatibility mapping bytes.
9105049Sis *
9115049Sis * Note that compatibility decomposition means doing recursive
9125049Sis * decompositions using both compatibility decomposition mappings and
9135049Sis * canonical decomposition mappings. On the other hand, canonical
9145049Sis * decomposition means doing recursive decompositions using only
9155049Sis * canonical decomposition mappings. Since the table we have has gone
9165049Sis * through the recursions already, we do not need to do so during
9175049Sis * runtime, i.e., the table has been completely flattened out
9185049Sis * already.
9195049Sis */
9205049Sis
9215049Sis b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
9225049Sis
9235049Sis /* Get the type, T, of the byte sequence. */
9245049Sis b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
9255049Sis
9265049Sis /*
9275049Sis * If necessary, adjust start_id, end_id, or both. Note that if
9285049Sis * this is compatibility decomposition mapping, there is no
9295049Sis * adjustment.
9305049Sis */
9315049Sis if (canonical_decomposition) {
9325049Sis /* Is the mapping only for compatibility decomposition? */
9335049Sis if (b1 < U8_DECOMP_BOTH)
9345049Sis return ((size_t)sz);
9355049Sis
9365049Sis start_id++;
9375049Sis
9385049Sis if (b1 == U8_DECOMP_BOTH) {
9395049Sis end_id = start_id +
9405049Sis u8_decomp_final_tbl[uv][b3_base + start_id];
9415049Sis start_id++;
9425049Sis }
9435049Sis } else {
9445049Sis /*
9455049Sis * Unless this is a compatibility decomposition mapping,
9465049Sis * we adjust the start_id.
9475049Sis */
9485049Sis if (b1 == U8_DECOMP_BOTH) {
9495049Sis start_id++;
9505049Sis start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
9515049Sis } else if (b1 == U8_DECOMP_CANONICAL) {
9525049Sis start_id++;
9535049Sis }
9545049Sis }
9555049Sis
9565049Sis for (i = 0; start_id < end_id; start_id++)
9575049Sis u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
9585049Sis u8s[i] = '\0';
9595049Sis
9605049Sis return (i);
9615049Sis }
9625049Sis
9635049Sis /*
9645049Sis * The find_composition_start() function uses the character bytes given and
9655049Sis * find out the matching composition mappings if any and return the address
9665049Sis * to the composition mappings as explained in the do_composition().
9675049Sis */
9685049Sis static uchar_t *
find_composition_start(size_t uv,uchar_t * s,size_t sz)9695049Sis find_composition_start(size_t uv, uchar_t *s, size_t sz)
9705049Sis {
9715049Sis uint16_t b1 = 0;
9725049Sis uint16_t b2 = 0;
9735049Sis uint16_t b3 = 0;
9745049Sis uint16_t b3_tbl;
9755049Sis uint16_t b3_base;
9765049Sis uint16_t b4 = 0;
9775049Sis size_t start_id;
9785049Sis size_t end_id;
9795049Sis
9805049Sis if (sz == 1) {
9815049Sis b4 = s[0];
9825049Sis } else if (sz == 2) {
9835049Sis b3 = s[0];
9845049Sis b4 = s[1];
9855049Sis } else if (sz == 3) {
9865049Sis b2 = s[0];
9875049Sis b3 = s[1];
9885049Sis b4 = s[2];
9895049Sis } else if (sz == 4) {
9905049Sis b1 = s[0];
9915049Sis b2 = s[1];
9925049Sis b3 = s[2];
9935049Sis b4 = s[3];
9945049Sis } else {
9955049Sis /*
9965049Sis * This is a fallback and should not happen if the function
9975049Sis * was called properly.
9985049Sis */
9995049Sis return (NULL);
10005049Sis }
10015049Sis
10025049Sis b1 = u8_composition_b1_tbl[uv][b1];
10035049Sis if (b1 == U8_TBL_ELEMENT_NOT_DEF)
10045049Sis return (NULL);
10055049Sis
10065049Sis b2 = u8_composition_b2_tbl[uv][b1][b2];
10075049Sis if (b2 == U8_TBL_ELEMENT_NOT_DEF)
10085049Sis return (NULL);
10095049Sis
10105049Sis b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
10115049Sis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
10125049Sis return (NULL);
10135049Sis
10145049Sis if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
10155049Sis b3_tbl -= U8_16BIT_TABLE_INDICATOR;
10165049Sis start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
10175049Sis end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
10185049Sis } else {
10195049Sis start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
10205049Sis end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
10215049Sis }
10225049Sis
10235049Sis if (start_id >= end_id)
10245049Sis return (NULL);
10255049Sis
10265049Sis b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
10275049Sis
10285049Sis return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
10295049Sis }
10305049Sis
10315049Sis /*
10325049Sis * The blocked() function checks on the combining class values of previous
10335049Sis * characters in this sequence and return whether it is blocked or not.
10345049Sis */
10355049Sis static boolean_t
blocked(uchar_t * comb_class,size_t last)10365049Sis blocked(uchar_t *comb_class, size_t last)
10375049Sis {
10385049Sis uchar_t my_comb_class;
10395049Sis size_t i;
10405049Sis
10415049Sis my_comb_class = comb_class[last];
10425049Sis for (i = 1; i < last; i++)
10435049Sis if (comb_class[i] >= my_comb_class ||
10445049Sis comb_class[i] == U8_COMBINING_CLASS_STARTER)
10455049Sis return (B_TRUE);
10465049Sis
10475049Sis return (B_FALSE);
10485049Sis }
10495049Sis
10505049Sis /*
10515049Sis * The do_composition() reads the character string pointed by 's' and
10525049Sis * do necessary canonical composition and then copy over the result back to
10535049Sis * the 's'.
10545049Sis *
10555049Sis * The input argument 's' cannot contain more than 32 characters.
10565049Sis */
10575049Sis static size_t
do_composition(size_t uv,uchar_t * s,uchar_t * comb_class,uchar_t * start,uchar_t * disp,size_t last,uchar_t ** os,uchar_t * oslast)10585049Sis do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
10595049Sis uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
10605049Sis {
10615049Sis uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
10625049Sis uchar_t tc[U8_MB_CUR_MAX];
10635049Sis uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
10645049Sis size_t saved_marks_count;
10655049Sis uchar_t *p;
10665049Sis uchar_t *saved_p;
10675049Sis uchar_t *q;
10685049Sis size_t i;
10695049Sis size_t saved_i;
10705049Sis size_t j;
10715049Sis size_t k;
10725049Sis size_t l;
10735049Sis size_t C;
10745049Sis size_t saved_l;
10755049Sis size_t size;
10765049Sis uint32_t u1;
10775049Sis uint32_t u2;
10785049Sis boolean_t match_not_found = B_TRUE;
10795049Sis
10805049Sis /*
10815049Sis * This should never happen unless the callers are doing some strange
10825049Sis * and unexpected things.
10835049Sis *
10845049Sis * The "last" is the index pointing to the last character not last + 1.
10855049Sis */
10865049Sis if (last >= U8_MAX_CHARS_A_SEQ)
10875049Sis last = U8_UPPER_LIMIT_IN_A_SEQ;
10885049Sis
10895049Sis for (i = l = 0; i <= last; i++) {
10905049Sis /*
10915049Sis * The last or any non-Starters at the beginning, we don't
10925049Sis * have any chance to do composition and so we just copy them
10935049Sis * to the temporary buffer.
10945049Sis */
10955049Sis if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
10965049Sis SAVE_THE_CHAR:
10975049Sis p = s + start[i];
10985049Sis size = disp[i];
10995049Sis for (k = 0; k < size; k++)
11005049Sis t[l++] = *p++;
11015049Sis continue;
11025049Sis }
11035049Sis
11045049Sis /*
11055049Sis * If this could be a start of Hangul Jamos, then, we try to
11065049Sis * conjoin them.
11075049Sis */
11085049Sis if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
11095049Sis U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
11105049Sis s[start[i] + 1], s[start[i] + 2]);
11115049Sis U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
11125049Sis s[start[i] + 4], s[start[i] + 5]);
11135049Sis
11145049Sis if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
11155049Sis u1 -= U8_HANGUL_JAMO_L_FIRST;
11165049Sis u2 -= U8_HANGUL_JAMO_V_FIRST;
11175049Sis u1 = U8_HANGUL_SYL_FIRST +
11185049Sis (u1 * U8_HANGUL_V_COUNT + u2) *
11195049Sis U8_HANGUL_T_COUNT;
11205049Sis
11215049Sis i += 2;
11225049Sis if (i <= last) {
11235049Sis U8_PUT_3BYTES_INTO_UTF32(u2,
11245049Sis s[start[i]], s[start[i] + 1],
11255049Sis s[start[i] + 2]);
11265049Sis
11275049Sis if (U8_HANGUL_JAMO_T(u2)) {
11285049Sis u1 += u2 -
11295049Sis U8_HANGUL_JAMO_T_FIRST;
11305049Sis i++;
11315049Sis }
11325049Sis }
11335049Sis
11345049Sis U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
11355049Sis i--;
11365049Sis l += 3;
11375049Sis continue;
11385049Sis }
11395049Sis }
11405049Sis
11415049Sis /*
11425049Sis * Let's then find out if this Starter has composition
11435049Sis * mapping.
11445049Sis */
11455049Sis p = find_composition_start(uv, s + start[i], disp[i]);
11465049Sis if (p == NULL)
11475049Sis goto SAVE_THE_CHAR;
11485049Sis
11495049Sis /*
11505049Sis * We have a Starter with composition mapping and the next
11515049Sis * character is a non-Starter. Let's try to find out if
11525049Sis * we can do composition.
11535049Sis */
11545049Sis
11555049Sis saved_p = p;
11565049Sis saved_i = i;
11575049Sis saved_l = l;
11585049Sis saved_marks_count = 0;
11595049Sis
11605049Sis TRY_THE_NEXT_MARK:
11615049Sis q = s + start[++i];
11625049Sis size = disp[i];
11635049Sis
11645049Sis /*
11655049Sis * The next for() loop compares the non-Starter pointed by
11665049Sis * 'q' with the possible (joinable) characters pointed by 'p'.
11675049Sis *
11685049Sis * The composition final table entry pointed by the 'p'
11695049Sis * looks like the following:
11705049Sis *
11715049Sis * +---+---+---+-...-+---+---+---+---+-...-+---+---+
11725049Sis * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
11735049Sis * +---+---+---+-...-+---+---+---+---+-...-+---+---+
11745049Sis *
11755049Sis * where C is the count byte indicating the number of
11765049Sis * mapping pairs where each pair would be look like
11775049Sis * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
11785049Sis * character of a canonical decomposition and the B0-Bm are
11795049Sis * the bytes of a matching composite character. The F is
11805049Sis * a filler byte after each character as the separator.
11815049Sis */
11825049Sis
11835049Sis match_not_found = B_TRUE;
11845049Sis
11855049Sis for (C = *p++; C > 0; C--) {
11865049Sis for (k = 0; k < size; p++, k++)
11875049Sis if (*p != q[k])
11885049Sis break;
11895049Sis
11905049Sis /* Have we found it? */
11915049Sis if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
11925049Sis match_not_found = B_FALSE;
11935049Sis
11945049Sis l = saved_l;
11955049Sis
11965049Sis while (*++p != U8_TBL_ELEMENT_FILLER)
11975049Sis t[l++] = *p;
11985049Sis
11995049Sis break;
12005049Sis }
12015049Sis
12025049Sis /* We didn't find; skip to the next pair. */
12035049Sis if (*p != U8_TBL_ELEMENT_FILLER)
12045049Sis while (*++p != U8_TBL_ELEMENT_FILLER)
12055049Sis ;
12065049Sis while (*++p != U8_TBL_ELEMENT_FILLER)
12075049Sis ;
12085049Sis p++;
12095049Sis }
12105049Sis
12115049Sis /*
12125049Sis * If there was no match, we will need to save the combining
12135049Sis * mark for later appending. After that, if the next one
12145049Sis * is a non-Starter and not blocked, then, we try once
12155049Sis * again to do composition with the next non-Starter.
12165049Sis *
12175049Sis * If there was no match and this was a Starter, then,
12185049Sis * this is a new start.
12195049Sis *
12205049Sis * If there was a match and a composition done and we have
12215049Sis * more to check on, then, we retrieve a new composition final
12225049Sis * table entry for the composite and then try to do the
12235049Sis * composition again.
12245049Sis */
12255049Sis
12265049Sis if (match_not_found) {
12275049Sis if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
12285049Sis i--;
12295049Sis goto SAVE_THE_CHAR;
12305049Sis }
12315049Sis
12325049Sis saved_marks[saved_marks_count++] = i;
12335049Sis }
12345049Sis
12355049Sis if (saved_l == l) {
12365049Sis while (i < last) {
12375049Sis if (blocked(comb_class, i + 1))
12385049Sis saved_marks[saved_marks_count++] = ++i;
12395049Sis else
12405049Sis break;
12415049Sis }
12425049Sis if (i < last) {
12435049Sis p = saved_p;
12445049Sis goto TRY_THE_NEXT_MARK;
12455049Sis }
12465049Sis } else if (i < last) {
12475049Sis p = find_composition_start(uv, t + saved_l,
12485049Sis l - saved_l);
12495049Sis if (p != NULL) {
12505049Sis saved_p = p;
12515049Sis goto TRY_THE_NEXT_MARK;
12525049Sis }
12535049Sis }
12545049Sis
12555049Sis /*
12565049Sis * There is no more composition possible.
12575049Sis *
12585049Sis * If there was no composition what so ever then we copy
12595049Sis * over the original Starter and then append any non-Starters
12605049Sis * remaining at the target string sequentially after that.
12615049Sis */
12625049Sis
12635049Sis if (saved_l == l) {
12645049Sis p = s + start[saved_i];
12655049Sis size = disp[saved_i];
12665049Sis for (j = 0; j < size; j++)
12675049Sis t[l++] = *p++;
12685049Sis }
12695049Sis
12705049Sis for (k = 0; k < saved_marks_count; k++) {
12715049Sis p = s + start[saved_marks[k]];
12725049Sis size = disp[saved_marks[k]];
12735049Sis for (j = 0; j < size; j++)
12745049Sis t[l++] = *p++;
12755049Sis }
12765049Sis }
12775049Sis
12785049Sis /*
12795049Sis * If the last character is a Starter and if we have a character
12805049Sis * (possibly another Starter) that can be turned into a composite,
12815049Sis * we do so and we do so until there is no more of composition
12825049Sis * possible.
12835049Sis */
12845049Sis if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
12855049Sis p = *os;
12865049Sis saved_l = l - disp[last];
12875049Sis
12885049Sis while (p < oslast) {
12895049Sis size = u8_number_of_bytes[*p];
12905049Sis if (size <= 1 || (p + size) > oslast)
12915049Sis break;
12925049Sis
12935049Sis saved_p = p;
12945049Sis
12955049Sis for (i = 0; i < size; i++)
12965049Sis tc[i] = *p++;
12975049Sis
12985049Sis q = find_composition_start(uv, t + saved_l,
12995049Sis l - saved_l);
13005049Sis if (q == NULL) {
13015049Sis p = saved_p;
13025049Sis break;
13035049Sis }
13045049Sis
13055049Sis match_not_found = B_TRUE;
13065049Sis
13075049Sis for (C = *q++; C > 0; C--) {
13085049Sis for (k = 0; k < size; q++, k++)
13095049Sis if (*q != tc[k])
13105049Sis break;
13115049Sis
13125049Sis if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
13135049Sis match_not_found = B_FALSE;
13145049Sis
13155049Sis l = saved_l;
13165049Sis
13175049Sis while (*++q != U8_TBL_ELEMENT_FILLER) {
13185049Sis /*
13195049Sis * This is practically
13205049Sis * impossible but we don't
13215049Sis * want to take any chances.
13225049Sis */
13235049Sis if (l >=
13245049Sis U8_STREAM_SAFE_TEXT_MAX) {
13255049Sis p = saved_p;
13265049Sis goto SAFE_RETURN;
13275049Sis }
13285049Sis t[l++] = *q;
13295049Sis }
13305049Sis
13315049Sis break;
13325049Sis }
13335049Sis
13345049Sis if (*q != U8_TBL_ELEMENT_FILLER)
13355049Sis while (*++q != U8_TBL_ELEMENT_FILLER)
13365049Sis ;
13375049Sis while (*++q != U8_TBL_ELEMENT_FILLER)
13385049Sis ;
13395049Sis q++;
13405049Sis }
13415049Sis
13425049Sis if (match_not_found) {
13435049Sis p = saved_p;
13445049Sis break;
13455049Sis }
13465049Sis }
13475049Sis SAFE_RETURN:
13485049Sis *os = p;
13495049Sis }
13505049Sis
13515049Sis /*
13525049Sis * Now we copy over the temporary string to the target string.
13535049Sis * Since composition always reduces the number of characters or
13545049Sis * the number of characters stay, we don't need to worry about
13555049Sis * the buffer overflow here.
13565049Sis */
13575049Sis for (i = 0; i < l; i++)
13585049Sis s[i] = t[i];
13595049Sis s[l] = '\0';
13605049Sis
13615049Sis return (l);
13625049Sis }
13635049Sis
/*
 * The collect_a_seq() function checks on the given string, collects
 * a sequence of characters at u8s, and returns the sequence. While it
 * collects a sequence, it also applies case conversion, canonical or
 * compatibility decomposition, canonical composition, or some or all of
 * them and in that order.
 *
 * Arguments:
 *	uv		table-set selector handed on to do_case_conv(),
 *			do_decomp(), combining_class() and do_composition().
 *	u8s		output buffer receiving the collected sequence.
 *	source		in/out; *source is where reading starts and, on
 *			return, points at the first byte not consumed.
 *	slast		end of the input; nothing at or after it is read.
 *	is_it_toupper, is_it_tolower
 *			request case conversion of the first character
 *			(toupper wins if both are set).
 *	canonical_decomposition, compatibility_decomposition
 *			if either is set, decomposition and collection of
 *			the following marks take place.
 *	canonical_composition
 *			if set, and decomposition was requested, the
 *			collected sequence is passed to do_composition().
 *	errnum		set to EILSEQ when the first byte is an illegal
 *			character and to EINVAL when the first character is
 *			cut short by slast; not touched otherwise.
 *	state		in/out normalization state maintained while
 *			collecting.
 *
 * The collected sequence cannot be bigger than 32 characters since if
 * it is having more than 31 characters, the sequence will be terminated
 * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
 * a Stream-Safe Text. The collected sequence is always terminated with
 * a null byte and the return value is the byte length of the sequence,
 * which can be 0. The return value does not include the terminating
 * null byte.
 */
static size_t
collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
    boolean_t is_it_toupper,
    boolean_t is_it_tolower,
    boolean_t canonical_decomposition,
    boolean_t compatibility_decomposition,
    boolean_t canonical_composition,
    int *errnum, u8_normalization_states_t *state)
{
	uchar_t *s;
	int sz;
	int saved_sz;
	size_t i;
	size_t j;
	size_t k;
	size_t l;
	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
	uchar_t disp[U8_MAX_CHARS_A_SEQ];
	uchar_t start[U8_MAX_CHARS_A_SEQ];
	/*
	 * NOTE(review): u8t and tc are never referenced by name in this
	 * function body; presumably they (together with k, u8s, start,
	 * disp and comb_class) are used by the U8_SWAP_COMB_MARKS() macro.
	 * Confirm against the macro definition before renaming or removing
	 * any of them.
	 */
	uchar_t u8t[U8_MB_CUR_MAX];
	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
	uchar_t tc;
	size_t last;
	size_t saved_last;
	uint32_t u1;

	/*
	 * Save the source string pointer which we will return a changed
	 * pointer if we do processing.
	 */
	s = *source;

	/*
	 * The following is a fallback for just in case callers are not
	 * checking the string boundaries before the calling.
	 */
	if (s >= slast) {
		u8s[0] = '\0';

		return (0);
	}

	/*
	 * As the first thing, let's collect a character and do case
	 * conversion if necessary.
	 */

	sz = u8_number_of_bytes[*s];

	/* An illegal byte is passed through as a one byte sequence. */
	if (sz < 0) {
		*errnum = EILSEQ;

		u8s[0] = *s++;
		u8s[1] = '\0';

		*source = s;

		return (1);
	}

	if (sz == 1) {
		if (is_it_toupper)
			u8s[0] = U8_ASCII_TOUPPER(*s);
		else if (is_it_tolower)
			u8s[0] = U8_ASCII_TOLOWER(*s);
		else
			u8s[0] = *s;
		s++;
		u8s[1] = '\0';
	} else if ((s + sz) > slast) {
		/*
		 * The character is incomplete; hand back whatever bytes
		 * are left as they are.
		 */
		*errnum = EINVAL;

		for (i = 0; s < slast; )
			u8s[i++] = *s++;
		u8s[i] = '\0';

		*source = s;

		return (i);
	} else {
		if (is_it_toupper || is_it_tolower) {
			/* sz becomes the byte length after conversion. */
			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
			s += sz;
			sz = i;
		} else {
			for (i = 0; i < sz; )
				u8s[i++] = *s++;
			u8s[i] = '\0';
		}
	}

	/*
	 * And then canonical/compatibility decomposition followed by
	 * an optional canonical composition. Please be noted that
	 * canonical composition is done only when a decomposition is
	 * done.
	 */
	if (canonical_decomposition || compatibility_decomposition) {
		/*
		 * From here on, 'last' counts the characters collected and
		 * 'saved_sz' the bytes stored at u8s[]; comb_class[],
		 * start[] and disp[] record, per character, its combining
		 * class, its byte offset in u8s[] and its byte length.
		 */
		if (sz == 1) {
			*state = U8_STATE_START;

			saved_sz = 1;

			comb_class[0] = 0;
			start[0] = 0;
			disp[0] = 1;

			last = 1;
		} else {
			/* Decompose in place; u8s is both input and output. */
			saved_sz = do_decomp(uv, u8s, u8s, sz,
			    canonical_decomposition, state);

			last = 0;

			for (i = 0; i < saved_sz; ) {
				sz = u8_number_of_bytes[u8s[i]];

				comb_class[last] = combining_class(uv,
				    u8s + i, sz);
				start[last] = i;
				disp[last] = sz;

				last++;
				i += sz;
			}

			/*
			 * Decomposition yields various Hangul related
			 * states but not on combining marks. We need to
			 * find out at here by checking on the last
			 * character.
			 */
			if (*state == U8_STATE_START) {
				if (comb_class[last - 1])
					*state = U8_STATE_COMBINING_MARK;
			}
		}

		/* Number of characters that came from the first character. */
		saved_last = last;

		while (s < slast) {
			sz = u8_number_of_bytes[*s];

			/*
			 * If this is an illegal character, an incomplete
			 * character, or an 7-bit ASCII Starter character,
			 * then we have collected a sequence; break and let
			 * the next call deal with the two cases.
			 *
			 * Note that this is okay only if you are using this
			 * function with a fixed length string, not on
			 * a buffer with multiple calls of one chunk at a time.
			 */
			if (sz <= 1) {
				break;
			} else if ((s + sz) > slast) {
				break;
			} else {
				/*
				 * If the previous character was a Hangul Jamo
				 * and this character is a Hangul Jamo that
				 * can be conjoined, we collect the Jamo.
				 */
				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
					U8_PUT_3BYTES_INTO_UTF32(u1,
					    *s, *(s + 1), *(s + 2));

					/*
					 * 'i' is cleared before each jump
					 * because it is stored below as the
					 * combining class of the Jamo.
					 */
					if (U8_HANGUL_COMPOSABLE_L_V(*state,
					    u1)) {
						i = 0;
						*state = U8_STATE_HANGUL_LV;
						goto COLLECT_A_HANGUL;
					}

					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
					    u1)) {
						i = 0;
						*state = U8_STATE_HANGUL_LVT;
						goto COLLECT_A_HANGUL;
					}
				}

				/*
				 * Regardless of whatever it was, if this is
				 * a Starter, we don't collect the character
				 * since that's a new start and we will deal
				 * with it at the next time.
				 */
				i = combining_class(uv, s, sz);
				if (i == U8_COMBINING_CLASS_STARTER)
					break;

				/*
				 * We know the current character is a combining
				 * mark. If the previous character wasn't
				 * a Starter (not Hangul) or a combining mark,
				 * then, we don't collect this combining mark.
				 */
				if (*state != U8_STATE_START &&
				    *state != U8_STATE_COMBINING_MARK)
					break;

				*state = U8_STATE_COMBINING_MARK;
COLLECT_A_HANGUL:
				/*
				 * If we collected a Starter and combining
				 * marks up to 30, i.e., total 31 characters,
				 * then, we terminate this degenerately long
				 * combining sequence with a U+034F COMBINING
				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
				 * UTF-8 and turn this into a Stream-Safe
				 * Text. This will be extremely rare but
				 * possible.
				 *
				 * The following will also guarantee that
				 * we are not writing more than 32 characters
				 * plus a NULL at u8s[].
				 */
				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
TURN_STREAM_SAFE:
					*state = U8_STATE_START;
					comb_class[last] = 0;
					start[last] = saved_sz;
					disp[last] = 2;
					last++;

					u8s[saved_sz++] = 0xCD;
					u8s[saved_sz++] = 0x8F;

					break;
				}

				/*
				 * Some combining marks also do decompose into
				 * another combining mark or marks.
				 */
				if (*state == U8_STATE_COMBINING_MARK) {
					/*
					 * Remember the character count and
					 * the input length of this mark: if
					 * its decomposition does not fit,
					 * 'last' is rolled back to k and,
					 * since 's' has not been advanced
					 * yet, the mark stays unconsumed.
					 */
					k = last;
					l = sz;
					i = do_decomp(uv, uts, s, sz,
					    canonical_decomposition, state);
					for (j = 0; j < i; ) {
						sz = u8_number_of_bytes[uts[j]];

						comb_class[last] =
						    combining_class(uv,
						    uts + j, sz);
						start[last] = saved_sz + j;
						disp[last] = sz;

						last++;
						if (last >=
						    U8_UPPER_LIMIT_IN_A_SEQ) {
							last = k;
							goto TURN_STREAM_SAFE;
						}
						j += sz;
					}

					/* do_decomp() may have changed it. */
					*state = U8_STATE_COMBINING_MARK;
					sz = i;
					s += l;

					for (i = 0; i < sz; i++)
						u8s[saved_sz++] = uts[i];
				} else {
					/* A Hangul Jamo; copy it as it is. */
					comb_class[last] = i;
					start[last] = saved_sz;
					disp[last] = sz;
					last++;

					for (i = 0; i < sz; i++)
						u8s[saved_sz++] = *s++;
				}

				/*
				 * If this is U+0345 COMBINING GREEK
				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
				 * iota subscript, and need to be converted to
				 * uppercase letter, convert it to U+0399 GREEK
				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
				 * i.e., convert to capital adscript form as
				 * specified in the Unicode standard.
				 *
				 * This is the only special case of (ambiguous)
				 * case conversion at combining marks and
				 * probably the standard will never have
				 * anything similar like this in future.
				 */
				if (is_it_toupper && sz >= 2 &&
				    u8s[saved_sz - 2] == 0xCD &&
				    u8s[saved_sz - 1] == 0x85) {
					u8s[saved_sz - 2] = 0xCE;
					u8s[saved_sz - 1] = 0x99;
				}
			}
		}

		/*
		 * Let's try to ensure a canonical ordering for the collected
		 * combining marks. We do this only if we have collected
		 * at least one more non-Starter. (The decomposition mapping
		 * data tables have fully (and recursively) expanded and
		 * canonically ordered decompositions.)
		 *
		 * The U8_SWAP_COMB_MARKS() convenience macro has some
		 * assumptions and we are meeting the assumptions.
		 */
		/* From here on 'last' is the index of the last character. */
		last--;
		if (last >= saved_last) {
			for (i = 0; i < last; i++)
				for (j = last; j > i; j--)
					if (comb_class[j] &&
					    comb_class[j - 1] > comb_class[j]) {
						U8_SWAP_COMB_MARKS(j - 1, j);
					}
		}

		*source = s;

		if (! canonical_composition) {
			u8s[saved_sz] = '\0';
			return (saved_sz);
		}

		/*
		 * Now do the canonical composition. Note that we do this
		 * only after a canonical or compatibility decomposition to
		 * finish up NFC or NFKC.
		 */
		sz = do_composition(uv, u8s, comb_class, start, disp, last,
		    &s, slast);
	}

	*source = s;

	return ((size_t)sz);
}
17175049Sis
17185049Sis /*
17195049Sis * The do_norm_compare() function does string comparion based on Unicode
17205049Sis * simple case mappings and Unicode Normalization definitions.
17215049Sis *
17225049Sis * It does so by collecting a sequence of character at a time and comparing
17235049Sis * the collected sequences from the strings.
17245049Sis *
17255049Sis * The meanings on the return values are the same as the usual strcmp().
17265049Sis */
17275049Sis static int
do_norm_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,int flag,int * errnum)17285049Sis do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
1729*7012Sis int flag, int *errnum)
17305049Sis {
17315049Sis int result;
17325049Sis size_t sz1;
17335049Sis size_t sz2;
17345049Sis uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
17355049Sis uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
17365049Sis uchar_t *s1last;
17375049Sis uchar_t *s2last;
17385049Sis boolean_t is_it_toupper;
17395049Sis boolean_t is_it_tolower;
17405049Sis boolean_t canonical_decomposition;
17415049Sis boolean_t compatibility_decomposition;
17425049Sis boolean_t canonical_composition;
17435049Sis u8_normalization_states_t state;
17445049Sis
17455049Sis s1last = s1 + n1;
17465049Sis s2last = s2 + n2;
17475049Sis
17485049Sis is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
17495049Sis is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
17505049Sis canonical_decomposition = flag & U8_CANON_DECOMP;
17515049Sis compatibility_decomposition = flag & U8_COMPAT_DECOMP;
17525049Sis canonical_composition = flag & U8_CANON_COMP;
17535049Sis
17545049Sis while (s1 < s1last && s2 < s2last) {
17555049Sis /*
17565049Sis * If the current character is a 7-bit ASCII and the last
17575049Sis * character, or, if the current character and the next
17585049Sis * character are both some 7-bit ASCII characters then
17595049Sis * we treat the current character as a sequence.
17605049Sis *
17615049Sis * In any other cases, we need to call collect_a_seq().
17625049Sis */
17635049Sis
17645049Sis if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
17655049Sis ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
17665049Sis if (is_it_toupper)
17675049Sis u8s1[0] = U8_ASCII_TOUPPER(*s1);
17685049Sis else if (is_it_tolower)
17695049Sis u8s1[0] = U8_ASCII_TOLOWER(*s1);
17705049Sis else
17715049Sis u8s1[0] = *s1;
17725049Sis u8s1[1] = '\0';
17735049Sis sz1 = 1;
17745049Sis s1++;
17755049Sis } else {
17765049Sis state = U8_STATE_START;
17775049Sis sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
17785049Sis is_it_toupper, is_it_tolower,
17795049Sis canonical_decomposition,
17805049Sis compatibility_decomposition,
1781*7012Sis canonical_composition, errnum, &state);
17825049Sis }
17835049Sis
17845049Sis if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
17855049Sis ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
17865049Sis if (is_it_toupper)
17875049Sis u8s2[0] = U8_ASCII_TOUPPER(*s2);
17885049Sis else if (is_it_tolower)
17895049Sis u8s2[0] = U8_ASCII_TOLOWER(*s2);
17905049Sis else
17915049Sis u8s2[0] = *s2;
17925049Sis u8s2[1] = '\0';
17935049Sis sz2 = 1;
17945049Sis s2++;
17955049Sis } else {
17965049Sis state = U8_STATE_START;
17975049Sis sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
17985049Sis is_it_toupper, is_it_tolower,
17995049Sis canonical_decomposition,
18005049Sis compatibility_decomposition,
1801*7012Sis canonical_composition, errnum, &state);
18025049Sis }
18035049Sis
18045049Sis /*
18055049Sis * Now compare the two characters. If they are the same,
18065049Sis * we move on to the next character sequences.
18075049Sis */
18085049Sis if (sz1 == 1 && sz2 == 1) {
18095049Sis if (*u8s1 > *u8s2)
18105049Sis return (1);
18115049Sis if (*u8s1 < *u8s2)
18125049Sis return (-1);
18135049Sis } else {
18145049Sis result = strcmp((const char *)u8s1, (const char *)u8s2);
18155049Sis if (result != 0)
18165049Sis return (result);
18175049Sis }
18185049Sis }
18195049Sis
18205049Sis /*
18215049Sis * We compared until the end of either or both strings.
18225049Sis *
18235049Sis * If we reached to or went over the ends for the both, that means
18245049Sis * they are the same.
18255049Sis *
18265049Sis * If we reached only one end, that means the other string has
18275049Sis * something which then can be used to determine the return value.
18285049Sis */
18295049Sis if (s1 >= s1last) {
18305049Sis if (s2 >= s2last)
18315049Sis return (0);
18325049Sis return (-1);
18335049Sis }
18345049Sis return (1);
18355049Sis }
18365049Sis
18375049Sis /*
18385049Sis * The u8_strcmp() function compares two UTF-8 strings quite similar to
18395049Sis * the strcmp(). For the comparison, however, Unicode Normalization specific
18405049Sis * equivalency and Unicode simple case conversion mappings based equivalency
18415049Sis * can be requested and checked against.
18425049Sis */
18435049Sis int
u8_strcmp(const char * s1,const char * s2,size_t n,int flag,size_t uv,int * errnum)18445049Sis u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
1845*7012Sis int *errnum)
18465049Sis {
18475049Sis int f;
18485049Sis size_t n1;
18495049Sis size_t n2;
18505049Sis
1851*7012Sis *errnum = 0;
18525049Sis
18535049Sis /*
18545049Sis * Check on the requested Unicode version, case conversion, and
18555049Sis * normalization flag values.
18565049Sis */
18575049Sis
18585049Sis if (uv > U8_UNICODE_LATEST) {
1859*7012Sis *errnum = ERANGE;
18605049Sis uv = U8_UNICODE_LATEST;
18615049Sis }
18625049Sis
18635049Sis if (flag == 0) {
18645049Sis flag = U8_STRCMP_CS;
18655049Sis } else {
18665049Sis f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
18675049Sis U8_STRCMP_CI_LOWER);
18685049Sis if (f == 0) {
18695049Sis flag |= U8_STRCMP_CS;
18705049Sis } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
18715049Sis f != U8_STRCMP_CI_LOWER) {
1872*7012Sis *errnum = EBADF;
18735049Sis flag = U8_STRCMP_CS;
18745049Sis }
18755049Sis
18765049Sis f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
18775049Sis if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
18785049Sis f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
1879*7012Sis *errnum = EBADF;
18805049Sis flag = U8_STRCMP_CS;
18815049Sis }
18825049Sis }
18835049Sis
18845049Sis if (flag == U8_STRCMP_CS) {
18855049Sis return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
18865049Sis }
18875049Sis
18885049Sis n1 = strlen(s1);
18895049Sis n2 = strlen(s2);
18905049Sis if (n != 0) {
18915049Sis if (n < n1)
18925049Sis n1 = n;
18935049Sis if (n < n2)
18945049Sis n2 = n;
18955049Sis }
18965049Sis
18975049Sis /*
18985049Sis * Simple case conversion can be done much faster and so we do
18995049Sis * them separately here.
19005049Sis */
19015049Sis if (flag == U8_STRCMP_CI_UPPER) {
19025049Sis return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1903*7012Sis n1, n2, B_TRUE, errnum));
19045049Sis } else if (flag == U8_STRCMP_CI_LOWER) {
19055049Sis return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1906*7012Sis n1, n2, B_FALSE, errnum));
19075049Sis }
19085049Sis
19095049Sis return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
1910*7012Sis flag, errnum));
19115049Sis }
19125049Sis
19135049Sis size_t
u8_textprep_str(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,size_t unicode_version,int * errnum)19145049Sis u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
1915*7012Sis int flag, size_t unicode_version, int *errnum)
19165049Sis {
19175049Sis int f;
19185049Sis int sz;
19195049Sis uchar_t *ib;
19205049Sis uchar_t *ibtail;
19215049Sis uchar_t *ob;
19225049Sis uchar_t *obtail;
19235049Sis boolean_t do_not_ignore_null;
19245049Sis boolean_t do_not_ignore_invalid;
19255049Sis boolean_t is_it_toupper;
19265049Sis boolean_t is_it_tolower;
19275049Sis boolean_t canonical_decomposition;
19285049Sis boolean_t compatibility_decomposition;
19295049Sis boolean_t canonical_composition;
19305049Sis size_t ret_val;
19315049Sis size_t i;
19325049Sis size_t j;
19335049Sis uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
19345049Sis u8_normalization_states_t state;
19355049Sis
19365049Sis if (unicode_version > U8_UNICODE_LATEST) {
1937*7012Sis *errnum = ERANGE;
19385049Sis return ((size_t)-1);
19395049Sis }
19405049Sis
19415049Sis f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
19425049Sis if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
1943*7012Sis *errnum = EBADF;
19445049Sis return ((size_t)-1);
19455049Sis }
19465049Sis
19475049Sis f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
19485049Sis if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
19495049Sis f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
1950*7012Sis *errnum = EBADF;
19515049Sis return ((size_t)-1);
19525049Sis }
19535049Sis
19545049Sis if (inarray == NULL || *inlen == 0)
19555049Sis return (0);
19565049Sis
19575049Sis if (outarray == NULL) {
1958*7012Sis *errnum = E2BIG;
19595049Sis return ((size_t)-1);
19605049Sis }
19615049Sis
19625049Sis ib = (uchar_t *)inarray;
19635049Sis ob = (uchar_t *)outarray;
19645049Sis ibtail = ib + *inlen;
19655049Sis obtail = ob + *outlen;
19665049Sis
19675049Sis do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
19685049Sis do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
19695049Sis is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
19705049Sis is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
19715049Sis
19725049Sis ret_val = 0;
19735049Sis
19745049Sis /*
19755049Sis * If we don't have a normalization flag set, we do the simple case
19765049Sis * conversion based text preparation separately below. Text
19775049Sis * preparation involving Normalization will be done in the false task
19785049Sis * block, again, separately since it will take much more time and
19795049Sis * resource than doing simple case conversions.
19805049Sis */
19815049Sis if (f == 0) {
19825049Sis while (ib < ibtail) {
19835049Sis if (*ib == '\0' && do_not_ignore_null)
19845049Sis break;
19855049Sis
19865049Sis sz = u8_number_of_bytes[*ib];
19875049Sis
19885049Sis if (sz < 0) {
19895049Sis if (do_not_ignore_invalid) {
1990*7012Sis *errnum = EILSEQ;
19915049Sis ret_val = (size_t)-1;
19925049Sis break;
19935049Sis }
19945049Sis
19955049Sis sz = 1;
19965049Sis ret_val++;
19975049Sis }
19985049Sis
19995049Sis if (sz == 1) {
20005049Sis if (ob >= obtail) {
2001*7012Sis *errnum = E2BIG;
20025049Sis ret_val = (size_t)-1;
20035049Sis break;
20045049Sis }
20055049Sis
20065049Sis if (is_it_toupper)
20075049Sis *ob = U8_ASCII_TOUPPER(*ib);
20085049Sis else if (is_it_tolower)
20095049Sis *ob = U8_ASCII_TOLOWER(*ib);
20105049Sis else
20115049Sis *ob = *ib;
20125049Sis ib++;
20135049Sis ob++;
20145049Sis } else if ((ib + sz) > ibtail) {
20155049Sis if (do_not_ignore_invalid) {
2016*7012Sis *errnum = EINVAL;
20175049Sis ret_val = (size_t)-1;
20185049Sis break;
20195049Sis }
20205049Sis
20215049Sis if ((obtail - ob) < (ibtail - ib)) {
2022*7012Sis *errnum = E2BIG;
20235049Sis ret_val = (size_t)-1;
20245049Sis break;
20255049Sis }
20265049Sis
20275049Sis /*
20285049Sis * We treat the remaining incomplete character
20295049Sis * bytes as a character.
20305049Sis */
20315049Sis ret_val++;
20325049Sis
20335049Sis while (ib < ibtail)
20345049Sis *ob++ = *ib++;
20355049Sis } else {
20365049Sis if (is_it_toupper || is_it_tolower) {
20375049Sis i = do_case_conv(unicode_version, u8s,
20385049Sis ib, sz, is_it_toupper);
20395049Sis
20405049Sis if ((obtail - ob) < i) {
2041*7012Sis *errnum = E2BIG;
20425049Sis ret_val = (size_t)-1;
20435049Sis break;
20445049Sis }
20455049Sis
20465049Sis ib += sz;
20475049Sis
20485049Sis for (sz = 0; sz < i; sz++)
20495049Sis *ob++ = u8s[sz];
20505049Sis } else {
20515049Sis if ((obtail - ob) < sz) {
2052*7012Sis *errnum = E2BIG;
20535049Sis ret_val = (size_t)-1;
20545049Sis break;
20555049Sis }
20565049Sis
20575049Sis for (i = 0; i < sz; i++)
20585049Sis *ob++ = *ib++;
20595049Sis }
20605049Sis }
20615049Sis }
20625049Sis } else {
20635049Sis canonical_decomposition = flag & U8_CANON_DECOMP;
20645049Sis compatibility_decomposition = flag & U8_COMPAT_DECOMP;
20655049Sis canonical_composition = flag & U8_CANON_COMP;
20665049Sis
20675049Sis while (ib < ibtail) {
20685049Sis if (*ib == '\0' && do_not_ignore_null)
20695049Sis break;
20705049Sis
20715049Sis /*
20725049Sis * If the current character is a 7-bit ASCII
20735049Sis * character and it is the last character, or,
20745049Sis * if the current character is a 7-bit ASCII
20755049Sis * character and the next character is also a 7-bit
20765049Sis * ASCII character, then, we copy over this
20775049Sis * character without going through collect_a_seq().
20785049Sis *
20795049Sis * In any other cases, we need to look further with
20805049Sis * the collect_a_seq() function.
20815049Sis */
20825049Sis if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
20835049Sis ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
20845049Sis if (ob >= obtail) {
2085*7012Sis *errnum = E2BIG;
20865049Sis ret_val = (size_t)-1;
20875049Sis break;
20885049Sis }
20895049Sis
20905049Sis if (is_it_toupper)
20915049Sis *ob = U8_ASCII_TOUPPER(*ib);
20925049Sis else if (is_it_tolower)
20935049Sis *ob = U8_ASCII_TOLOWER(*ib);
20945049Sis else
20955049Sis *ob = *ib;
20965049Sis ib++;
20975049Sis ob++;
20985049Sis } else {
2099*7012Sis *errnum = 0;
21005049Sis state = U8_STATE_START;
21015049Sis
21025049Sis j = collect_a_seq(unicode_version, u8s,
21035049Sis &ib, ibtail,
21045049Sis is_it_toupper,
21055049Sis is_it_tolower,
21065049Sis canonical_decomposition,
21075049Sis compatibility_decomposition,
21085049Sis canonical_composition,
2109*7012Sis errnum, &state);
21105049Sis
2111*7012Sis if (*errnum && do_not_ignore_invalid) {
21125049Sis ret_val = (size_t)-1;
21135049Sis break;
21145049Sis }
21155049Sis
21165049Sis if ((obtail - ob) < j) {
2117*7012Sis *errnum = E2BIG;
21185049Sis ret_val = (size_t)-1;
21195049Sis break;
21205049Sis }
21215049Sis
21225049Sis for (i = 0; i < j; i++)
21235049Sis *ob++ = u8s[i];
21245049Sis }
21255049Sis }
21265049Sis }
21275049Sis
21285049Sis *inlen = ibtail - ib;
21295049Sis *outlen = obtail - ob;
21305049Sis
21315049Sis return (ret_val);
21325049Sis }
2133