1eda14cbcSMatt Macy /* 2eda14cbcSMatt Macy * CDDL HEADER START 3eda14cbcSMatt Macy * 4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7eda14cbcSMatt Macy * 8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0. 10eda14cbcSMatt Macy * See the License for the specific language governing permissions 11eda14cbcSMatt Macy * and limitations under the License. 12eda14cbcSMatt Macy * 13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18eda14cbcSMatt Macy * 19eda14cbcSMatt Macy * CDDL HEADER END 20eda14cbcSMatt Macy */ 21eda14cbcSMatt Macy /* 22eda14cbcSMatt Macy * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23eda14cbcSMatt Macy * Use is subject to license terms. 24eda14cbcSMatt Macy */ 25eda14cbcSMatt Macy 2615f0b8c3SMartin Matuska /* 2715f0b8c3SMartin Matuska * Copyright 2022 MNX Cloud, Inc. 2815f0b8c3SMartin Matuska */ 29eda14cbcSMatt Macy 30eda14cbcSMatt Macy 31eda14cbcSMatt Macy 32eda14cbcSMatt Macy /* 33eda14cbcSMatt Macy * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458). 34eda14cbcSMatt Macy * 35eda14cbcSMatt Macy * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F), 36eda14cbcSMatt Macy * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also 37eda14cbcSMatt Macy * the section 3C man pages. 38eda14cbcSMatt Macy * Interface stability: Committed. 39eda14cbcSMatt Macy */ 40eda14cbcSMatt Macy 41eda14cbcSMatt Macy #include <sys/types.h> 42da5137abSMartin Matuska #include <sys/string.h> 43eda14cbcSMatt Macy #include <sys/param.h> 44eda14cbcSMatt Macy #include <sys/sysmacros.h> 45eda14cbcSMatt Macy #include <sys/debug.h> 46eda14cbcSMatt Macy #include <sys/kmem.h> 47eda14cbcSMatt Macy #include <sys/sunddi.h> 48eda14cbcSMatt Macy #include <sys/u8_textprep.h> 49eda14cbcSMatt Macy #include <sys/byteorder.h> 50eda14cbcSMatt Macy #include <sys/errno.h> 51eda14cbcSMatt Macy #include <sys/u8_textprep_data.h> 52eda14cbcSMatt Macy #include <sys/mod.h> 53eda14cbcSMatt Macy 54eda14cbcSMatt Macy /* The maximum possible number of bytes in a UTF-8 character. */ 55eda14cbcSMatt Macy #define U8_MB_CUR_MAX (4) 56eda14cbcSMatt Macy 57eda14cbcSMatt Macy /* 58eda14cbcSMatt Macy * The maximum number of bytes needed for a UTF-8 character to cover 59eda14cbcSMatt Macy * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2. 60eda14cbcSMatt Macy */ 61eda14cbcSMatt Macy #define U8_MAX_BYTES_UCS2 (3) 62eda14cbcSMatt Macy 63eda14cbcSMatt Macy /* The maximum possible number of bytes in a Stream-Safe Text. */ 64eda14cbcSMatt Macy #define U8_STREAM_SAFE_TEXT_MAX (128) 65eda14cbcSMatt Macy 66eda14cbcSMatt Macy /* 67eda14cbcSMatt Macy * The maximum number of characters in a combining/conjoining sequence and 68eda14cbcSMatt Macy * the actual upperbound limit of a combining/conjoining sequence. 69eda14cbcSMatt Macy */ 70eda14cbcSMatt Macy #define U8_MAX_CHARS_A_SEQ (32) 71eda14cbcSMatt Macy #define U8_UPPER_LIMIT_IN_A_SEQ (31) 72eda14cbcSMatt Macy 73eda14cbcSMatt Macy /* The combining class value for Starter. */ 74eda14cbcSMatt Macy #define U8_COMBINING_CLASS_STARTER (0) 75eda14cbcSMatt Macy 76eda14cbcSMatt Macy /* 77eda14cbcSMatt Macy * Some Hangul related macros at below. 78eda14cbcSMatt Macy * 79eda14cbcSMatt Macy * The first and the last of Hangul syllables, Hangul Jamo Leading consonants, 80eda14cbcSMatt Macy * Vowels, and optional Trailing consonants in Unicode scalar values. 81eda14cbcSMatt Macy * 82eda14cbcSMatt Macy * Please be noted that the U8_HANGUL_JAMO_T_FIRST is 0x11A7 at below not 83eda14cbcSMatt Macy * the actual U+11A8. This is due to that the trailing consonant is optional 84eda14cbcSMatt Macy * and thus we are doing a pre-calculation of subtracting one. 85eda14cbcSMatt Macy * 86eda14cbcSMatt Macy * Each of 19 modern leading consonants has total 588 possible syllables since 87eda14cbcSMatt Macy * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for 88eda14cbcSMatt Macy * no trailing consonant case, i.e., 21 x 28 = 588. 89eda14cbcSMatt Macy * 90eda14cbcSMatt Macy * We also have bunch of Hangul related macros at below. Please bear in mind 91eda14cbcSMatt Macy * that the U8_HANGUL_JAMO_1ST_BYTE can be used to check whether it is 92eda14cbcSMatt Macy * a Hangul Jamo or not but the value does not guarantee that it is a Hangul 93eda14cbcSMatt Macy * Jamo; it just guarantee that it will be most likely. 94eda14cbcSMatt Macy */ 95eda14cbcSMatt Macy #define U8_HANGUL_SYL_FIRST (0xAC00U) 96eda14cbcSMatt Macy #define U8_HANGUL_SYL_LAST (0xD7A3U) 97eda14cbcSMatt Macy 98eda14cbcSMatt Macy #define U8_HANGUL_JAMO_L_FIRST (0x1100U) 99eda14cbcSMatt Macy #define U8_HANGUL_JAMO_L_LAST (0x1112U) 100eda14cbcSMatt Macy #define U8_HANGUL_JAMO_V_FIRST (0x1161U) 101eda14cbcSMatt Macy #define U8_HANGUL_JAMO_V_LAST (0x1175U) 102eda14cbcSMatt Macy #define U8_HANGUL_JAMO_T_FIRST (0x11A7U) 103eda14cbcSMatt Macy #define U8_HANGUL_JAMO_T_LAST (0x11C2U) 104eda14cbcSMatt Macy 105eda14cbcSMatt Macy #define U8_HANGUL_V_COUNT (21) 106eda14cbcSMatt Macy #define U8_HANGUL_VT_COUNT (588) 107eda14cbcSMatt Macy #define U8_HANGUL_T_COUNT (28) 108eda14cbcSMatt Macy 109eda14cbcSMatt Macy #define U8_HANGUL_JAMO_1ST_BYTE (0xE1U) 110eda14cbcSMatt Macy 111eda14cbcSMatt Macy #define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \ 112eda14cbcSMatt Macy (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \ 113eda14cbcSMatt Macy (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \ 114eda14cbcSMatt Macy (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); 115eda14cbcSMatt Macy 116eda14cbcSMatt Macy #define U8_HANGUL_JAMO_L(u) \ 117eda14cbcSMatt Macy ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST) 118eda14cbcSMatt Macy 119eda14cbcSMatt Macy #define U8_HANGUL_JAMO_V(u) \ 120eda14cbcSMatt Macy ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST) 121eda14cbcSMatt Macy 122eda14cbcSMatt Macy #define U8_HANGUL_JAMO_T(u) \ 123eda14cbcSMatt Macy ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) 124eda14cbcSMatt Macy 125eda14cbcSMatt Macy #define U8_HANGUL_JAMO(u) \ 126eda14cbcSMatt Macy ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) 127eda14cbcSMatt Macy 128eda14cbcSMatt Macy #define U8_HANGUL_SYLLABLE(u) \ 129eda14cbcSMatt Macy ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST) 130eda14cbcSMatt Macy 131eda14cbcSMatt Macy #define U8_HANGUL_COMPOSABLE_L_V(s, u) \ 132eda14cbcSMatt Macy ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u))) 133eda14cbcSMatt Macy 134eda14cbcSMatt Macy #define U8_HANGUL_COMPOSABLE_LV_T(s, u) \ 135eda14cbcSMatt Macy ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u))) 136eda14cbcSMatt Macy 137eda14cbcSMatt Macy /* The types of decomposition mappings. */ 138eda14cbcSMatt Macy #define U8_DECOMP_BOTH (0xF5U) 139eda14cbcSMatt Macy #define U8_DECOMP_CANONICAL (0xF6U) 140eda14cbcSMatt Macy 141eda14cbcSMatt Macy /* The indicator for 16-bit table. */ 142eda14cbcSMatt Macy #define U8_16BIT_TABLE_INDICATOR (0x8000U) 143eda14cbcSMatt Macy 144eda14cbcSMatt Macy /* The following are some convenience macros. */ 145eda14cbcSMatt Macy #define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \ 146eda14cbcSMatt Macy (u) = ((((uint32_t)(b1) & 0x0F) << 12) | \ 147eda14cbcSMatt Macy (((uint32_t)(b2) & 0x3F) << 6) | \ 148eda14cbcSMatt Macy ((uint32_t)(b3) & 0x3F)); 149eda14cbcSMatt Macy 150eda14cbcSMatt Macy #define U8_SIMPLE_SWAP(a, b, t) \ 151eda14cbcSMatt Macy (t) = (a); \ 152eda14cbcSMatt Macy (a) = (b); \ 153eda14cbcSMatt Macy (b) = (t); 154eda14cbcSMatt Macy 155eda14cbcSMatt Macy #define U8_ASCII_TOUPPER(c) \ 156eda14cbcSMatt Macy (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c)) 157eda14cbcSMatt Macy 158eda14cbcSMatt Macy #define U8_ASCII_TOLOWER(c) \ 159eda14cbcSMatt Macy (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c)) 160eda14cbcSMatt Macy 161eda14cbcSMatt Macy #define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U) 162eda14cbcSMatt Macy /* 163eda14cbcSMatt Macy * The following macro assumes that the two characters that are to be 164eda14cbcSMatt Macy * swapped are adjacent to each other and 'a' comes before 'b'. 165eda14cbcSMatt Macy * 166eda14cbcSMatt Macy * If the assumptions are not met, then, the macro will fail. 167eda14cbcSMatt Macy */ 168eda14cbcSMatt Macy #define U8_SWAP_COMB_MARKS(a, b) \ 169eda14cbcSMatt Macy for (k = 0; k < disp[(a)]; k++) \ 170eda14cbcSMatt Macy u8t[k] = u8s[start[(a)] + k]; \ 171eda14cbcSMatt Macy for (k = 0; k < disp[(b)]; k++) \ 172eda14cbcSMatt Macy u8s[start[(a)] + k] = u8s[start[(b)] + k]; \ 173eda14cbcSMatt Macy start[(b)] = start[(a)] + disp[(b)]; \ 174eda14cbcSMatt Macy for (k = 0; k < disp[(a)]; k++) \ 175eda14cbcSMatt Macy u8s[start[(b)] + k] = u8t[k]; \ 176eda14cbcSMatt Macy U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \ 177eda14cbcSMatt Macy U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); 178eda14cbcSMatt Macy 179eda14cbcSMatt Macy /* The possible states during normalization. */ 180eda14cbcSMatt Macy typedef enum { 181eda14cbcSMatt Macy U8_STATE_START = 0, 182eda14cbcSMatt Macy U8_STATE_HANGUL_L = 1, 183eda14cbcSMatt Macy U8_STATE_HANGUL_LV = 2, 184eda14cbcSMatt Macy U8_STATE_HANGUL_LVT = 3, 185eda14cbcSMatt Macy U8_STATE_HANGUL_V = 4, 186eda14cbcSMatt Macy U8_STATE_HANGUL_T = 5, 187eda14cbcSMatt Macy U8_STATE_COMBINING_MARK = 6 188eda14cbcSMatt Macy } u8_normalization_states_t; 189eda14cbcSMatt Macy 190eda14cbcSMatt Macy /* 191eda14cbcSMatt Macy * The three vectors at below are used to check bytes of a given UTF-8 192eda14cbcSMatt Macy * character are valid and not containing any malformed byte values. 193eda14cbcSMatt Macy * 194eda14cbcSMatt Macy * We used to have a quite relaxed UTF-8 binary representation but then there 195eda14cbcSMatt Macy * was some security related issues and so the Unicode Consortium defined 196eda14cbcSMatt Macy * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it 197eda14cbcSMatt Macy * one more time at the Unicode 3.2. The following three tables are based on 198eda14cbcSMatt Macy * that. 199eda14cbcSMatt Macy */ 200eda14cbcSMatt Macy 201eda14cbcSMatt Macy #define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF) 202eda14cbcSMatt Macy 203eda14cbcSMatt Macy #define I_ U8_ILLEGAL_CHAR 204eda14cbcSMatt Macy #define O_ U8_OUT_OF_RANGE_CHAR 205eda14cbcSMatt Macy 206e92ffd9bSMartin Matuska static const int8_t u8_number_of_bytes[0x100] = { 207eda14cbcSMatt Macy 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 208eda14cbcSMatt Macy 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 209eda14cbcSMatt Macy 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 210eda14cbcSMatt Macy 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 211eda14cbcSMatt Macy 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 212eda14cbcSMatt Macy 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 213eda14cbcSMatt Macy 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 214eda14cbcSMatt Macy 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 215eda14cbcSMatt Macy 216eda14cbcSMatt Macy /* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */ 217eda14cbcSMatt Macy I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, 218eda14cbcSMatt Macy 219eda14cbcSMatt Macy /* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */ 220eda14cbcSMatt Macy I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, 221eda14cbcSMatt Macy 222eda14cbcSMatt Macy /* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */ 223eda14cbcSMatt Macy I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, 224eda14cbcSMatt Macy 225eda14cbcSMatt Macy /* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */ 226eda14cbcSMatt Macy I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, 227eda14cbcSMatt Macy 228eda14cbcSMatt Macy /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ 229eda14cbcSMatt Macy I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 230eda14cbcSMatt Macy 231eda14cbcSMatt Macy /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ 232eda14cbcSMatt Macy 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 233eda14cbcSMatt Macy 234eda14cbcSMatt Macy /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ 235eda14cbcSMatt Macy 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 236eda14cbcSMatt Macy 237eda14cbcSMatt Macy /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ 238eda14cbcSMatt Macy 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, 239eda14cbcSMatt Macy }; 240eda14cbcSMatt Macy 241eda14cbcSMatt Macy #undef I_ 242eda14cbcSMatt Macy #undef O_ 243eda14cbcSMatt Macy 244e92ffd9bSMartin Matuska static const uint8_t u8_valid_min_2nd_byte[0x100] = { 245eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 246eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 247eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 248eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 249eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 250eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 251eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 252eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 253eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 254eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 255eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 256eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 257eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 258eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 259eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 260eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 261eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 262eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 263eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 264eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 265eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 266eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 267eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 268eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 269eda14cbcSMatt Macy /* C0 C1 C2 C3 C4 C5 C6 C7 */ 270eda14cbcSMatt Macy 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 271eda14cbcSMatt Macy /* C8 C9 CA CB CC CD CE CF */ 272eda14cbcSMatt Macy 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 273eda14cbcSMatt Macy /* D0 D1 D2 D3 D4 D5 D6 D7 */ 274eda14cbcSMatt Macy 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 275eda14cbcSMatt Macy /* D8 D9 DA DB DC DD DE DF */ 276eda14cbcSMatt Macy 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 277eda14cbcSMatt Macy /* E0 E1 E2 E3 E4 E5 E6 E7 */ 278eda14cbcSMatt Macy 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 279eda14cbcSMatt Macy /* E8 E9 EA EB EC ED EE EF */ 280eda14cbcSMatt Macy 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 281eda14cbcSMatt Macy /* F0 F1 F2 F3 F4 F5 F6 F7 */ 282eda14cbcSMatt Macy 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, 283eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 284eda14cbcSMatt Macy }; 285eda14cbcSMatt Macy 286e92ffd9bSMartin Matuska static const uint8_t u8_valid_max_2nd_byte[0x100] = { 287eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 288eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 289eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 290eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 291eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 292eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 293eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 294eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 295eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 296eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 297eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 298eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 299eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 300eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 301eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 302eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 303eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 304eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 305eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 306eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 307eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 308eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 309eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 310eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 311eda14cbcSMatt Macy /* C0 C1 C2 C3 C4 C5 C6 C7 */ 312eda14cbcSMatt Macy 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 313eda14cbcSMatt Macy /* C8 C9 CA CB CC CD CE CF */ 314eda14cbcSMatt Macy 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 315eda14cbcSMatt Macy /* D0 D1 D2 D3 D4 D5 D6 D7 */ 316eda14cbcSMatt Macy 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 317eda14cbcSMatt Macy /* D8 D9 DA DB DC DD DE DF */ 318eda14cbcSMatt Macy 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 319eda14cbcSMatt Macy /* E0 E1 E2 E3 E4 E5 E6 E7 */ 320eda14cbcSMatt Macy 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 321eda14cbcSMatt Macy /* E8 E9 EA EB EC ED EE EF */ 322eda14cbcSMatt Macy 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, 323eda14cbcSMatt Macy /* F0 F1 F2 F3 F4 F5 F6 F7 */ 324eda14cbcSMatt Macy 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, 325eda14cbcSMatt Macy 0, 0, 0, 0, 0, 0, 0, 0, 326eda14cbcSMatt Macy }; 327eda14cbcSMatt Macy 328eda14cbcSMatt Macy 329eda14cbcSMatt Macy /* 330eda14cbcSMatt Macy * The u8_validate() validates on the given UTF-8 character string and 331eda14cbcSMatt Macy * calculate the byte length. It is quite similar to mblen(3C) except that 332eda14cbcSMatt Macy * this will validate against the list of characters if required and 333eda14cbcSMatt Macy * specific to UTF-8 and Unicode. 334eda14cbcSMatt Macy */ 335eda14cbcSMatt Macy int 336180f8225SMatt Macy u8_validate(const char *u8str, size_t n, char **list, int flag, int *errnum) 337eda14cbcSMatt Macy { 338eda14cbcSMatt Macy uchar_t *ib; 339eda14cbcSMatt Macy uchar_t *ibtail; 340eda14cbcSMatt Macy uchar_t **p; 341eda14cbcSMatt Macy uchar_t *s1; 342eda14cbcSMatt Macy uchar_t *s2; 343eda14cbcSMatt Macy uchar_t f; 344eda14cbcSMatt Macy int sz; 345eda14cbcSMatt Macy size_t i; 346eda14cbcSMatt Macy int ret_val; 347eda14cbcSMatt Macy boolean_t second; 348eda14cbcSMatt Macy boolean_t no_need_to_validate_entire; 349eda14cbcSMatt Macy boolean_t check_additional; 350eda14cbcSMatt Macy boolean_t validate_ucs2_range_only; 351eda14cbcSMatt Macy 352eda14cbcSMatt Macy if (! u8str) 353eda14cbcSMatt Macy return (0); 354eda14cbcSMatt Macy 355eda14cbcSMatt Macy ib = (uchar_t *)u8str; 356eda14cbcSMatt Macy ibtail = ib + n; 357eda14cbcSMatt Macy 358eda14cbcSMatt Macy ret_val = 0; 359eda14cbcSMatt Macy 360eda14cbcSMatt Macy no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE); 361eda14cbcSMatt Macy check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL; 362eda14cbcSMatt Macy validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE; 363eda14cbcSMatt Macy 364eda14cbcSMatt Macy while (ib < ibtail) { 365eda14cbcSMatt Macy /* 366eda14cbcSMatt Macy * The first byte of a UTF-8 character tells how many 367eda14cbcSMatt Macy * bytes will follow for the character. If the first byte 368eda14cbcSMatt Macy * is an illegal byte value or out of range value, we just 369eda14cbcSMatt Macy * return -1 with an appropriate error number. 370eda14cbcSMatt Macy */ 371eda14cbcSMatt Macy sz = u8_number_of_bytes[*ib]; 372eda14cbcSMatt Macy if (sz == U8_ILLEGAL_CHAR) { 373eda14cbcSMatt Macy *errnum = EILSEQ; 374eda14cbcSMatt Macy return (-1); 375eda14cbcSMatt Macy } 376eda14cbcSMatt Macy 377eda14cbcSMatt Macy if (sz == U8_OUT_OF_RANGE_CHAR || 378eda14cbcSMatt Macy (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) { 379eda14cbcSMatt Macy *errnum = ERANGE; 380eda14cbcSMatt Macy return (-1); 381eda14cbcSMatt Macy } 382eda14cbcSMatt Macy 383eda14cbcSMatt Macy /* 384eda14cbcSMatt Macy * If we don't have enough bytes to check on, that's also 385eda14cbcSMatt Macy * an error. As you can see, we give illegal byte sequence 386eda14cbcSMatt Macy * checking higher priority then EINVAL cases. 387eda14cbcSMatt Macy */ 388eda14cbcSMatt Macy if ((ibtail - ib) < sz) { 389eda14cbcSMatt Macy *errnum = EINVAL; 390eda14cbcSMatt Macy return (-1); 391eda14cbcSMatt Macy } 392eda14cbcSMatt Macy 393eda14cbcSMatt Macy if (sz == 1) { 394eda14cbcSMatt Macy ib++; 395eda14cbcSMatt Macy ret_val++; 396eda14cbcSMatt Macy } else { 397eda14cbcSMatt Macy /* 398eda14cbcSMatt Macy * Check on the multi-byte UTF-8 character. For more 399eda14cbcSMatt Macy * details on this, see comment added for the used 400eda14cbcSMatt Macy * data structures at the beginning of the file. 401eda14cbcSMatt Macy */ 402eda14cbcSMatt Macy f = *ib++; 403eda14cbcSMatt Macy ret_val++; 404eda14cbcSMatt Macy second = B_TRUE; 405eda14cbcSMatt Macy for (i = 1; i < sz; i++) { 406eda14cbcSMatt Macy if (second) { 407eda14cbcSMatt Macy if (*ib < u8_valid_min_2nd_byte[f] || 408eda14cbcSMatt Macy *ib > u8_valid_max_2nd_byte[f]) { 409eda14cbcSMatt Macy *errnum = EILSEQ; 410eda14cbcSMatt Macy return (-1); 411eda14cbcSMatt Macy } 412eda14cbcSMatt Macy second = B_FALSE; 413eda14cbcSMatt Macy } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) { 414eda14cbcSMatt Macy *errnum = EILSEQ; 415eda14cbcSMatt Macy return (-1); 416eda14cbcSMatt Macy } 417eda14cbcSMatt Macy ib++; 418eda14cbcSMatt Macy ret_val++; 419eda14cbcSMatt Macy } 420eda14cbcSMatt Macy } 421eda14cbcSMatt Macy 422eda14cbcSMatt Macy if (check_additional) { 423eda14cbcSMatt Macy for (p = (uchar_t **)list, i = 0; p[i]; i++) { 424eda14cbcSMatt Macy s1 = ib - sz; 425eda14cbcSMatt Macy s2 = p[i]; 426eda14cbcSMatt Macy while (s1 < ib) { 427eda14cbcSMatt Macy if (*s1 != *s2 || *s2 == '\0') 428eda14cbcSMatt Macy break; 429eda14cbcSMatt Macy s1++; 430eda14cbcSMatt Macy s2++; 431eda14cbcSMatt Macy } 432eda14cbcSMatt Macy 433eda14cbcSMatt Macy if (s1 >= ib && *s2 == '\0') { 434eda14cbcSMatt Macy *errnum = EBADF; 435eda14cbcSMatt Macy return (-1); 436eda14cbcSMatt Macy } 437eda14cbcSMatt Macy } 438eda14cbcSMatt Macy } 439eda14cbcSMatt Macy 440eda14cbcSMatt Macy if (no_need_to_validate_entire) 441eda14cbcSMatt Macy break; 442eda14cbcSMatt Macy } 443eda14cbcSMatt Macy 444eda14cbcSMatt Macy return (ret_val); 445eda14cbcSMatt Macy } 446eda14cbcSMatt Macy 447eda14cbcSMatt Macy /* 448eda14cbcSMatt Macy * The do_case_conv() looks at the mapping tables and returns found 449eda14cbcSMatt Macy * bytes if any. If not found, the input bytes are returned. The function 450eda14cbcSMatt Macy * always terminate the return bytes with a null character assuming that 451eda14cbcSMatt Macy * there are plenty of room to do so. 452eda14cbcSMatt Macy * 453eda14cbcSMatt Macy * The case conversions are simple case conversions mapping a character to 454eda14cbcSMatt Macy * another character as specified in the Unicode data. The byte size of 455eda14cbcSMatt Macy * the mapped character could be different from that of the input character. 456eda14cbcSMatt Macy * 457eda14cbcSMatt Macy * The return value is the byte length of the returned character excluding 458eda14cbcSMatt Macy * the terminating null byte. 459eda14cbcSMatt Macy */ 460eda14cbcSMatt Macy static size_t 461eda14cbcSMatt Macy do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper) 462eda14cbcSMatt Macy { 463eda14cbcSMatt Macy size_t i; 464eda14cbcSMatt Macy uint16_t b1 = 0; 465eda14cbcSMatt Macy uint16_t b2 = 0; 466eda14cbcSMatt Macy uint16_t b3 = 0; 467eda14cbcSMatt Macy uint16_t b3_tbl; 468eda14cbcSMatt Macy uint16_t b3_base; 469eda14cbcSMatt Macy uint16_t b4 = 0; 470eda14cbcSMatt Macy size_t start_id; 471eda14cbcSMatt Macy size_t end_id; 472eda14cbcSMatt Macy 473eda14cbcSMatt Macy /* 474eda14cbcSMatt Macy * At this point, the only possible values for sz are 2, 3, and 4. 475eda14cbcSMatt Macy * The u8s should point to a vector that is well beyond the size of 476eda14cbcSMatt Macy * 5 bytes. 477eda14cbcSMatt Macy */ 478eda14cbcSMatt Macy if (sz == 2) { 479eda14cbcSMatt Macy b3 = u8s[0] = s[0]; 480eda14cbcSMatt Macy b4 = u8s[1] = s[1]; 481eda14cbcSMatt Macy } else if (sz == 3) { 482eda14cbcSMatt Macy b2 = u8s[0] = s[0]; 483eda14cbcSMatt Macy b3 = u8s[1] = s[1]; 484eda14cbcSMatt Macy b4 = u8s[2] = s[2]; 485eda14cbcSMatt Macy } else if (sz == 4) { 486eda14cbcSMatt Macy b1 = u8s[0] = s[0]; 487eda14cbcSMatt Macy b2 = u8s[1] = s[1]; 488eda14cbcSMatt Macy b3 = u8s[2] = s[2]; 489eda14cbcSMatt Macy b4 = u8s[3] = s[3]; 490eda14cbcSMatt Macy } else { 491eda14cbcSMatt Macy /* This is not possible but just in case as a fallback. */ 492eda14cbcSMatt Macy if (is_it_toupper) 493eda14cbcSMatt Macy *u8s = U8_ASCII_TOUPPER(*s); 494eda14cbcSMatt Macy else 495eda14cbcSMatt Macy *u8s = U8_ASCII_TOLOWER(*s); 496eda14cbcSMatt Macy u8s[1] = '\0'; 497eda14cbcSMatt Macy 498eda14cbcSMatt Macy return (1); 499eda14cbcSMatt Macy } 500eda14cbcSMatt Macy u8s[sz] = '\0'; 501eda14cbcSMatt Macy 502eda14cbcSMatt Macy /* 503eda14cbcSMatt Macy * Let's find out if we have a corresponding character. 504eda14cbcSMatt Macy */ 505eda14cbcSMatt Macy b1 = u8_common_b1_tbl[uv][b1]; 506eda14cbcSMatt Macy if (b1 == U8_TBL_ELEMENT_NOT_DEF) 507eda14cbcSMatt Macy return ((size_t)sz); 508eda14cbcSMatt Macy 509eda14cbcSMatt Macy b2 = u8_case_common_b2_tbl[uv][b1][b2]; 510eda14cbcSMatt Macy if (b2 == U8_TBL_ELEMENT_NOT_DEF) 511eda14cbcSMatt Macy return ((size_t)sz); 512eda14cbcSMatt Macy 513eda14cbcSMatt Macy if (is_it_toupper) { 514eda14cbcSMatt Macy b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id; 515eda14cbcSMatt Macy if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 516eda14cbcSMatt Macy return ((size_t)sz); 517eda14cbcSMatt Macy 518eda14cbcSMatt Macy start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4]; 519eda14cbcSMatt Macy end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1]; 520eda14cbcSMatt Macy 521eda14cbcSMatt Macy /* Either there is no match or an error at the table. */ 522eda14cbcSMatt Macy if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) 523eda14cbcSMatt Macy return ((size_t)sz); 524eda14cbcSMatt Macy 525eda14cbcSMatt Macy b3_base = u8_toupper_b3_tbl[uv][b2][b3].base; 526eda14cbcSMatt Macy 527eda14cbcSMatt Macy for (i = 0; start_id < end_id; start_id++) 528eda14cbcSMatt Macy u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id]; 529eda14cbcSMatt Macy } else { 530*5c65a0a9SMartin Matuska #ifdef U8_STRCMP_CI_LOWER 531eda14cbcSMatt Macy b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id; 532eda14cbcSMatt Macy if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 533eda14cbcSMatt Macy return ((size_t)sz); 534eda14cbcSMatt Macy 535eda14cbcSMatt Macy start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4]; 536eda14cbcSMatt Macy end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1]; 537eda14cbcSMatt Macy 538eda14cbcSMatt Macy if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) 539eda14cbcSMatt Macy return ((size_t)sz); 540eda14cbcSMatt Macy 541eda14cbcSMatt Macy b3_base = u8_tolower_b3_tbl[uv][b2][b3].base; 542eda14cbcSMatt Macy 543eda14cbcSMatt Macy for (i = 0; start_id < end_id; start_id++) 544eda14cbcSMatt Macy u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id]; 545*5c65a0a9SMartin Matuska #else 546*5c65a0a9SMartin Matuska __builtin_unreachable(); 547*5c65a0a9SMartin Matuska #endif 548eda14cbcSMatt Macy } 549eda14cbcSMatt Macy 550eda14cbcSMatt Macy /* 551eda14cbcSMatt Macy * If i is still zero, that means there is no corresponding character. 552eda14cbcSMatt Macy */ 553eda14cbcSMatt Macy if (i == 0) 554eda14cbcSMatt Macy return ((size_t)sz); 555eda14cbcSMatt Macy 556eda14cbcSMatt Macy u8s[i] = '\0'; 557eda14cbcSMatt Macy 558eda14cbcSMatt Macy return (i); 559eda14cbcSMatt Macy } 560eda14cbcSMatt Macy 561eda14cbcSMatt Macy /* 562eda14cbcSMatt Macy * The do_case_compare() function compares the two input strings, s1 and s2, 563eda14cbcSMatt Macy * one character at a time doing case conversions if applicable and return 564eda14cbcSMatt Macy * the comparison result as like strcmp(). 565eda14cbcSMatt Macy * 566eda14cbcSMatt Macy * Since, in empirical sense, most of text data are 7-bit ASCII characters, 567eda14cbcSMatt Macy * we treat the 7-bit ASCII characters as a special case trying to yield 568eda14cbcSMatt Macy * faster processing time. 569eda14cbcSMatt Macy */ 570eda14cbcSMatt Macy static int 571eda14cbcSMatt Macy do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, 572eda14cbcSMatt Macy size_t n2, boolean_t is_it_toupper, int *errnum) 573eda14cbcSMatt Macy { 574eda14cbcSMatt Macy int f; 575eda14cbcSMatt Macy int sz1; 576eda14cbcSMatt Macy int sz2; 577eda14cbcSMatt Macy size_t j; 578eda14cbcSMatt Macy size_t i1; 579eda14cbcSMatt Macy size_t i2; 580eda14cbcSMatt Macy uchar_t u8s1[U8_MB_CUR_MAX + 1]; 581eda14cbcSMatt Macy uchar_t u8s2[U8_MB_CUR_MAX + 1]; 582eda14cbcSMatt Macy 583eda14cbcSMatt Macy i1 = i2 = 0; 584eda14cbcSMatt Macy while (i1 < n1 && i2 < n2) { 585eda14cbcSMatt Macy /* 586eda14cbcSMatt Macy * Find out what would be the byte length for this UTF-8 587eda14cbcSMatt Macy * character at string s1 and also find out if this is 588eda14cbcSMatt Macy * an illegal start byte or not and if so, issue a proper 589eda14cbcSMatt Macy * error number and yet treat this byte as a character. 590eda14cbcSMatt Macy */ 591eda14cbcSMatt Macy sz1 = u8_number_of_bytes[*s1]; 592eda14cbcSMatt Macy if (sz1 < 0) { 593eda14cbcSMatt Macy *errnum = EILSEQ; 594eda14cbcSMatt Macy sz1 = 1; 595eda14cbcSMatt Macy } 596eda14cbcSMatt Macy 597eda14cbcSMatt Macy /* 598eda14cbcSMatt Macy * For 7-bit ASCII characters mainly, we do a quick case 599eda14cbcSMatt Macy * conversion right at here. 600eda14cbcSMatt Macy * 601eda14cbcSMatt Macy * If we don't have enough bytes for this character, issue 602eda14cbcSMatt Macy * an EINVAL error and use what are available. 603eda14cbcSMatt Macy * 604eda14cbcSMatt Macy * If we have enough bytes, find out if there is 605eda14cbcSMatt Macy * a corresponding uppercase character and if so, copy over 606eda14cbcSMatt Macy * the bytes for a comparison later. If there is no 607eda14cbcSMatt Macy * corresponding uppercase character, then, use what we have 608eda14cbcSMatt Macy * for the comparison. 609eda14cbcSMatt Macy */ 610eda14cbcSMatt Macy if (sz1 == 1) { 611eda14cbcSMatt Macy if (is_it_toupper) 612eda14cbcSMatt Macy u8s1[0] = U8_ASCII_TOUPPER(*s1); 613eda14cbcSMatt Macy else 614eda14cbcSMatt Macy u8s1[0] = U8_ASCII_TOLOWER(*s1); 615eda14cbcSMatt Macy s1++; 616eda14cbcSMatt Macy u8s1[1] = '\0'; 617eda14cbcSMatt Macy } else if ((i1 + sz1) > n1) { 618eda14cbcSMatt Macy *errnum = EINVAL; 619eda14cbcSMatt Macy for (j = 0; (i1 + j) < n1; ) 620eda14cbcSMatt Macy u8s1[j++] = *s1++; 621eda14cbcSMatt Macy u8s1[j] = '\0'; 622eda14cbcSMatt Macy } else { 623eda14cbcSMatt Macy (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper); 624eda14cbcSMatt Macy s1 += sz1; 625eda14cbcSMatt Macy } 626eda14cbcSMatt Macy 627eda14cbcSMatt Macy /* Do the same for the string s2. */ 628eda14cbcSMatt Macy sz2 = u8_number_of_bytes[*s2]; 629eda14cbcSMatt Macy if (sz2 < 0) { 630eda14cbcSMatt Macy *errnum = EILSEQ; 631eda14cbcSMatt Macy sz2 = 1; 632eda14cbcSMatt Macy } 633eda14cbcSMatt Macy 634eda14cbcSMatt Macy if (sz2 == 1) { 635eda14cbcSMatt Macy if (is_it_toupper) 636eda14cbcSMatt Macy u8s2[0] = U8_ASCII_TOUPPER(*s2); 637eda14cbcSMatt Macy else 638eda14cbcSMatt Macy u8s2[0] = U8_ASCII_TOLOWER(*s2); 639eda14cbcSMatt Macy s2++; 640eda14cbcSMatt Macy u8s2[1] = '\0'; 641eda14cbcSMatt Macy } else if ((i2 + sz2) > n2) { 642eda14cbcSMatt Macy *errnum = EINVAL; 643eda14cbcSMatt Macy for (j = 0; (i2 + j) < n2; ) 644eda14cbcSMatt Macy u8s2[j++] = *s2++; 645eda14cbcSMatt Macy u8s2[j] = '\0'; 646eda14cbcSMatt Macy } else { 647eda14cbcSMatt Macy (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper); 648eda14cbcSMatt Macy s2 += sz2; 649eda14cbcSMatt Macy } 650eda14cbcSMatt Macy 651eda14cbcSMatt Macy /* Now compare the two characters. */ 652eda14cbcSMatt Macy if (sz1 == 1 && sz2 == 1) { 653eda14cbcSMatt Macy if (*u8s1 > *u8s2) 654eda14cbcSMatt Macy return (1); 655eda14cbcSMatt Macy if (*u8s1 < *u8s2) 656eda14cbcSMatt Macy return (-1); 657eda14cbcSMatt Macy } else { 658eda14cbcSMatt Macy f = strcmp((const char *)u8s1, (const char *)u8s2); 659eda14cbcSMatt Macy if (f != 0) 660eda14cbcSMatt Macy return (f); 661eda14cbcSMatt Macy } 662eda14cbcSMatt Macy 663eda14cbcSMatt Macy /* 664eda14cbcSMatt Macy * They were the same. Let's move on to the next 665eda14cbcSMatt Macy * characters then. 666eda14cbcSMatt Macy */ 667eda14cbcSMatt Macy i1 += sz1; 668eda14cbcSMatt Macy i2 += sz2; 669eda14cbcSMatt Macy } 670eda14cbcSMatt Macy 671eda14cbcSMatt Macy /* 672eda14cbcSMatt Macy * We compared until the end of either or both strings. 673eda14cbcSMatt Macy * 674eda14cbcSMatt Macy * If we reached to or went over the ends for the both, that means 675eda14cbcSMatt Macy * they are the same. 676eda14cbcSMatt Macy * 677eda14cbcSMatt Macy * If we reached only one of the two ends, that means the other string 678eda14cbcSMatt Macy * has something which then the fact can be used to determine 679eda14cbcSMatt Macy * the return value. 680eda14cbcSMatt Macy */ 681eda14cbcSMatt Macy if (i1 >= n1) { 682eda14cbcSMatt Macy if (i2 >= n2) 683eda14cbcSMatt Macy return (0); 684eda14cbcSMatt Macy return (-1); 685eda14cbcSMatt Macy } 686eda14cbcSMatt Macy return (1); 687eda14cbcSMatt Macy } 688eda14cbcSMatt Macy 689eda14cbcSMatt Macy /* 690eda14cbcSMatt Macy * The combining_class() function checks on the given bytes and find out 691eda14cbcSMatt Macy * the corresponding Unicode combining class value. The return value 0 means 692eda14cbcSMatt Macy * it is a Starter. Any illegal UTF-8 character will also be treated as 693eda14cbcSMatt Macy * a Starter. 694eda14cbcSMatt Macy */ 695eda14cbcSMatt Macy static uchar_t 696eda14cbcSMatt Macy combining_class(size_t uv, uchar_t *s, size_t sz) 697eda14cbcSMatt Macy { 698eda14cbcSMatt Macy uint16_t b1 = 0; 699eda14cbcSMatt Macy uint16_t b2 = 0; 700eda14cbcSMatt Macy uint16_t b3 = 0; 701eda14cbcSMatt Macy uint16_t b4 = 0; 702eda14cbcSMatt Macy 703eda14cbcSMatt Macy if (sz == 1 || sz > 4) 704eda14cbcSMatt Macy return (0); 705eda14cbcSMatt Macy 706eda14cbcSMatt Macy if (sz == 2) { 707eda14cbcSMatt Macy b3 = s[0]; 708eda14cbcSMatt Macy b4 = s[1]; 709eda14cbcSMatt Macy } else if (sz == 3) { 710eda14cbcSMatt Macy b2 = s[0]; 711eda14cbcSMatt Macy b3 = s[1]; 712eda14cbcSMatt Macy b4 = s[2]; 713eda14cbcSMatt Macy } else if (sz == 4) { 714eda14cbcSMatt Macy b1 = s[0]; 715eda14cbcSMatt Macy b2 = s[1]; 716eda14cbcSMatt Macy b3 = s[2]; 717eda14cbcSMatt Macy b4 = s[3]; 718eda14cbcSMatt Macy } 719eda14cbcSMatt Macy 720eda14cbcSMatt Macy b1 = u8_common_b1_tbl[uv][b1]; 721eda14cbcSMatt Macy if (b1 == U8_TBL_ELEMENT_NOT_DEF) 722eda14cbcSMatt Macy return (0); 723eda14cbcSMatt Macy 724eda14cbcSMatt Macy b2 = u8_combining_class_b2_tbl[uv][b1][b2]; 725eda14cbcSMatt Macy if (b2 == U8_TBL_ELEMENT_NOT_DEF) 726eda14cbcSMatt Macy return (0); 727eda14cbcSMatt Macy 728eda14cbcSMatt Macy b3 = u8_combining_class_b3_tbl[uv][b2][b3]; 729eda14cbcSMatt Macy if (b3 == U8_TBL_ELEMENT_NOT_DEF) 730eda14cbcSMatt Macy return (0); 731eda14cbcSMatt Macy 732eda14cbcSMatt Macy return (u8_combining_class_b4_tbl[uv][b3][b4]); 733eda14cbcSMatt Macy } 734eda14cbcSMatt Macy 735eda14cbcSMatt Macy /* 736eda14cbcSMatt Macy * The do_decomp() function finds out a matching decomposition if any 737eda14cbcSMatt Macy * and return. If there is no match, the input bytes are copied and returned. 738eda14cbcSMatt Macy * The function also checks if there is a Hangul, decomposes it if necessary 739eda14cbcSMatt Macy * and returns. 740eda14cbcSMatt Macy * 741eda14cbcSMatt Macy * To save time, a single byte 7-bit ASCII character should be handled by 742eda14cbcSMatt Macy * the caller. 743eda14cbcSMatt Macy * 744eda14cbcSMatt Macy * The function returns the number of bytes returned sans always terminating 745eda14cbcSMatt Macy * the null byte. It will also return a state that will tell if there was 746eda14cbcSMatt Macy * a Hangul character decomposed which then will be used by the caller. 747eda14cbcSMatt Macy */ 748eda14cbcSMatt Macy static size_t 749eda14cbcSMatt Macy do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, 750eda14cbcSMatt Macy boolean_t canonical_decomposition, u8_normalization_states_t *state) 751eda14cbcSMatt Macy { 752eda14cbcSMatt Macy uint16_t b1 = 0; 753eda14cbcSMatt Macy uint16_t b2 = 0; 754eda14cbcSMatt Macy uint16_t b3 = 0; 755eda14cbcSMatt Macy uint16_t b3_tbl; 756eda14cbcSMatt Macy uint16_t b3_base; 757eda14cbcSMatt Macy uint16_t b4 = 0; 758eda14cbcSMatt Macy size_t start_id; 759eda14cbcSMatt Macy size_t end_id; 760eda14cbcSMatt Macy size_t i; 761eda14cbcSMatt Macy uint32_t u1; 762eda14cbcSMatt Macy 763eda14cbcSMatt Macy if (sz == 2) { 764eda14cbcSMatt Macy b3 = u8s[0] = s[0]; 765eda14cbcSMatt Macy b4 = u8s[1] = s[1]; 766eda14cbcSMatt Macy u8s[2] = '\0'; 767eda14cbcSMatt Macy } else if (sz == 3) { 768eda14cbcSMatt Macy /* Convert it to a Unicode scalar value. */ 769eda14cbcSMatt Macy U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]); 770eda14cbcSMatt Macy 771eda14cbcSMatt Macy /* 772eda14cbcSMatt Macy * If this is a Hangul syllable, we decompose it into 773eda14cbcSMatt Macy * a leading consonant, a vowel, and an optional trailing 774eda14cbcSMatt Macy * consonant and then return. 775eda14cbcSMatt Macy */ 776eda14cbcSMatt Macy if (U8_HANGUL_SYLLABLE(u1)) { 777eda14cbcSMatt Macy u1 -= U8_HANGUL_SYL_FIRST; 778eda14cbcSMatt Macy 779eda14cbcSMatt Macy b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT; 780eda14cbcSMatt Macy b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT) 781eda14cbcSMatt Macy / U8_HANGUL_T_COUNT; 782eda14cbcSMatt Macy b3 = u1 % U8_HANGUL_T_COUNT; 783eda14cbcSMatt Macy 784eda14cbcSMatt Macy U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1); 785eda14cbcSMatt Macy U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2); 786eda14cbcSMatt Macy if (b3) { 787eda14cbcSMatt Macy b3 += U8_HANGUL_JAMO_T_FIRST; 788eda14cbcSMatt Macy U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3); 789eda14cbcSMatt Macy 790eda14cbcSMatt Macy u8s[9] = '\0'; 791eda14cbcSMatt Macy *state = U8_STATE_HANGUL_LVT; 792eda14cbcSMatt Macy return (9); 793eda14cbcSMatt Macy } 794eda14cbcSMatt Macy 795eda14cbcSMatt Macy u8s[6] = '\0'; 796eda14cbcSMatt Macy *state = U8_STATE_HANGUL_LV; 797eda14cbcSMatt Macy return (6); 798eda14cbcSMatt Macy } 799eda14cbcSMatt Macy 800eda14cbcSMatt Macy b2 = u8s[0] = s[0]; 801eda14cbcSMatt Macy b3 = u8s[1] = s[1]; 802eda14cbcSMatt Macy b4 = u8s[2] = s[2]; 803eda14cbcSMatt Macy u8s[3] = '\0'; 804eda14cbcSMatt Macy 805eda14cbcSMatt Macy /* 806eda14cbcSMatt Macy * If this is a Hangul Jamo, we know there is nothing 807eda14cbcSMatt Macy * further that we can decompose. 808eda14cbcSMatt Macy */ 809eda14cbcSMatt Macy if (U8_HANGUL_JAMO_L(u1)) { 810eda14cbcSMatt Macy *state = U8_STATE_HANGUL_L; 811eda14cbcSMatt Macy return (3); 812eda14cbcSMatt Macy } 813eda14cbcSMatt Macy 814eda14cbcSMatt Macy if (U8_HANGUL_JAMO_V(u1)) { 815eda14cbcSMatt Macy if (*state == U8_STATE_HANGUL_L) 816eda14cbcSMatt Macy *state = U8_STATE_HANGUL_LV; 817eda14cbcSMatt Macy else 818eda14cbcSMatt Macy *state = U8_STATE_HANGUL_V; 819eda14cbcSMatt Macy return (3); 820eda14cbcSMatt Macy } 821eda14cbcSMatt Macy 822eda14cbcSMatt Macy if (U8_HANGUL_JAMO_T(u1)) { 823eda14cbcSMatt Macy if (*state == U8_STATE_HANGUL_LV) 824eda14cbcSMatt Macy *state = U8_STATE_HANGUL_LVT; 825eda14cbcSMatt Macy else 826eda14cbcSMatt Macy *state = U8_STATE_HANGUL_T; 827eda14cbcSMatt Macy return (3); 828eda14cbcSMatt Macy } 829eda14cbcSMatt Macy } else if (sz == 4) { 830eda14cbcSMatt Macy b1 = u8s[0] = s[0]; 831eda14cbcSMatt Macy b2 = u8s[1] = s[1]; 832eda14cbcSMatt Macy b3 = u8s[2] = s[2]; 833eda14cbcSMatt Macy b4 = u8s[3] = s[3]; 834eda14cbcSMatt Macy u8s[4] = '\0'; 835eda14cbcSMatt Macy } else { 836eda14cbcSMatt Macy /* 837eda14cbcSMatt Macy * This is a fallback and should not happen if the function 838eda14cbcSMatt Macy * was called properly. 839eda14cbcSMatt Macy */ 840eda14cbcSMatt Macy u8s[0] = s[0]; 841eda14cbcSMatt Macy u8s[1] = '\0'; 842eda14cbcSMatt Macy *state = U8_STATE_START; 843eda14cbcSMatt Macy return (1); 844eda14cbcSMatt Macy } 845eda14cbcSMatt Macy 846eda14cbcSMatt Macy /* 847eda14cbcSMatt Macy * At this point, this routine does not know what it would get. 848eda14cbcSMatt Macy * The caller should sort it out if the state isn't a Hangul one. 849eda14cbcSMatt Macy */ 850eda14cbcSMatt Macy *state = U8_STATE_START; 851eda14cbcSMatt Macy 852eda14cbcSMatt Macy /* Try to find matching decomposition mapping byte sequence. */ 853eda14cbcSMatt Macy b1 = u8_common_b1_tbl[uv][b1]; 854eda14cbcSMatt Macy if (b1 == U8_TBL_ELEMENT_NOT_DEF) 855eda14cbcSMatt Macy return ((size_t)sz); 856eda14cbcSMatt Macy 857eda14cbcSMatt Macy b2 = u8_decomp_b2_tbl[uv][b1][b2]; 858eda14cbcSMatt Macy if (b2 == U8_TBL_ELEMENT_NOT_DEF) 859eda14cbcSMatt Macy return ((size_t)sz); 860eda14cbcSMatt Macy 861eda14cbcSMatt Macy b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id; 862eda14cbcSMatt Macy if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 863eda14cbcSMatt Macy return ((size_t)sz); 864eda14cbcSMatt Macy 865eda14cbcSMatt Macy /* 866eda14cbcSMatt Macy * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR 867eda14cbcSMatt Macy * which is 0x8000, this means we couldn't fit the mappings into 868eda14cbcSMatt Macy * the cardinality of a unsigned byte. 869eda14cbcSMatt Macy */ 870eda14cbcSMatt Macy if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { 871eda14cbcSMatt Macy b3_tbl -= U8_16BIT_TABLE_INDICATOR; 872eda14cbcSMatt Macy start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; 873eda14cbcSMatt Macy end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; 874eda14cbcSMatt Macy } else { 87581b22a98SMartin Matuska // cppcheck-suppress arrayIndexOutOfBoundsCond 876eda14cbcSMatt Macy start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; 87781b22a98SMartin Matuska // cppcheck-suppress arrayIndexOutOfBoundsCond 878eda14cbcSMatt Macy end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; 879eda14cbcSMatt Macy } 880eda14cbcSMatt Macy 881eda14cbcSMatt Macy /* This also means there wasn't any matching decomposition. */ 882eda14cbcSMatt Macy if (start_id >= end_id) 883eda14cbcSMatt Macy return ((size_t)sz); 884eda14cbcSMatt Macy 885eda14cbcSMatt Macy /* 886eda14cbcSMatt Macy * The final table for decomposition mappings has three types of 887eda14cbcSMatt Macy * byte sequences depending on whether a mapping is for compatibility 888eda14cbcSMatt Macy * decomposition, canonical decomposition, or both like the following: 889eda14cbcSMatt Macy * 890eda14cbcSMatt Macy * (1) Compatibility decomposition mappings: 891eda14cbcSMatt Macy * 892eda14cbcSMatt Macy * +---+---+-...-+---+ 893eda14cbcSMatt Macy * | B0| B1| ... | Bm| 894eda14cbcSMatt Macy * +---+---+-...-+---+ 895eda14cbcSMatt Macy * 89616038816SMartin Matuska * The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH). 897eda14cbcSMatt Macy * 898eda14cbcSMatt Macy * (2) Canonical decomposition mappings: 899eda14cbcSMatt Macy * 900eda14cbcSMatt Macy * +---+---+---+-...-+---+ 901eda14cbcSMatt Macy * | T | b0| b1| ... | bn| 902eda14cbcSMatt Macy * +---+---+---+-...-+---+ 903eda14cbcSMatt Macy * 904eda14cbcSMatt Macy * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL). 905eda14cbcSMatt Macy * 906eda14cbcSMatt Macy * (3) Both mappings: 907eda14cbcSMatt Macy * 908eda14cbcSMatt Macy * +---+---+---+---+-...-+---+---+---+-...-+---+ 909eda14cbcSMatt Macy * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm| 910eda14cbcSMatt Macy * +---+---+---+---+-...-+---+---+---+-...-+---+ 911eda14cbcSMatt Macy * 912eda14cbcSMatt Macy * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement 913eda14cbcSMatt Macy * byte, b0 to bn are canonical mapping bytes and B0 to Bm are 914eda14cbcSMatt Macy * compatibility mapping bytes. 915eda14cbcSMatt Macy * 916eda14cbcSMatt Macy * Note that compatibility decomposition means doing recursive 917eda14cbcSMatt Macy * decompositions using both compatibility decomposition mappings and 918eda14cbcSMatt Macy * canonical decomposition mappings. On the other hand, canonical 919eda14cbcSMatt Macy * decomposition means doing recursive decompositions using only 920eda14cbcSMatt Macy * canonical decomposition mappings. Since the table we have has gone 921eda14cbcSMatt Macy * through the recursions already, we do not need to do so during 922eda14cbcSMatt Macy * runtime, i.e., the table has been completely flattened out 923eda14cbcSMatt Macy * already. 924eda14cbcSMatt Macy */ 925eda14cbcSMatt Macy 926eda14cbcSMatt Macy b3_base = u8_decomp_b3_tbl[uv][b2][b3].base; 927eda14cbcSMatt Macy 928eda14cbcSMatt Macy /* Get the type, T, of the byte sequence. */ 929eda14cbcSMatt Macy b1 = u8_decomp_final_tbl[uv][b3_base + start_id]; 930eda14cbcSMatt Macy 931eda14cbcSMatt Macy /* 932eda14cbcSMatt Macy * If necessary, adjust start_id, end_id, or both. Note that if 933eda14cbcSMatt Macy * this is compatibility decomposition mapping, there is no 934eda14cbcSMatt Macy * adjustment. 935eda14cbcSMatt Macy */ 936eda14cbcSMatt Macy if (canonical_decomposition) { 937eda14cbcSMatt Macy /* Is the mapping only for compatibility decomposition? */ 938eda14cbcSMatt Macy if (b1 < U8_DECOMP_BOTH) 939eda14cbcSMatt Macy return ((size_t)sz); 940eda14cbcSMatt Macy 941eda14cbcSMatt Macy start_id++; 942eda14cbcSMatt Macy 943eda14cbcSMatt Macy if (b1 == U8_DECOMP_BOTH) { 944eda14cbcSMatt Macy end_id = start_id + 945eda14cbcSMatt Macy u8_decomp_final_tbl[uv][b3_base + start_id]; 946eda14cbcSMatt Macy start_id++; 947eda14cbcSMatt Macy } 948eda14cbcSMatt Macy } else { 949eda14cbcSMatt Macy /* 950eda14cbcSMatt Macy * Unless this is a compatibility decomposition mapping, 951eda14cbcSMatt Macy * we adjust the start_id. 952eda14cbcSMatt Macy */ 953eda14cbcSMatt Macy if (b1 == U8_DECOMP_BOTH) { 954eda14cbcSMatt Macy start_id++; 955eda14cbcSMatt Macy start_id += u8_decomp_final_tbl[uv][b3_base + start_id]; 956eda14cbcSMatt Macy } else if (b1 == U8_DECOMP_CANONICAL) { 957eda14cbcSMatt Macy start_id++; 958eda14cbcSMatt Macy } 959eda14cbcSMatt Macy } 960eda14cbcSMatt Macy 961eda14cbcSMatt Macy for (i = 0; start_id < end_id; start_id++) 962eda14cbcSMatt Macy u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id]; 963eda14cbcSMatt Macy u8s[i] = '\0'; 964eda14cbcSMatt Macy 965eda14cbcSMatt Macy return (i); 966eda14cbcSMatt Macy } 967eda14cbcSMatt Macy 968eda14cbcSMatt Macy /* 969eda14cbcSMatt Macy * The find_composition_start() function uses the character bytes given and 970eda14cbcSMatt Macy * find out the matching composition mappings if any and return the address 971eda14cbcSMatt Macy * to the composition mappings as explained in the do_composition(). 972eda14cbcSMatt Macy */ 973eda14cbcSMatt Macy static uchar_t * 974eda14cbcSMatt Macy find_composition_start(size_t uv, uchar_t *s, size_t sz) 975eda14cbcSMatt Macy { 976eda14cbcSMatt Macy uint16_t b1 = 0; 977eda14cbcSMatt Macy uint16_t b2 = 0; 978eda14cbcSMatt Macy uint16_t b3 = 0; 979eda14cbcSMatt Macy uint16_t b3_tbl; 980eda14cbcSMatt Macy uint16_t b3_base; 981eda14cbcSMatt Macy uint16_t b4 = 0; 982eda14cbcSMatt Macy size_t start_id; 983eda14cbcSMatt Macy size_t end_id; 984eda14cbcSMatt Macy 985eda14cbcSMatt Macy if (sz == 1) { 986eda14cbcSMatt Macy b4 = s[0]; 987eda14cbcSMatt Macy } else if (sz == 2) { 988eda14cbcSMatt Macy b3 = s[0]; 989eda14cbcSMatt Macy b4 = s[1]; 990eda14cbcSMatt Macy } else if (sz == 3) { 991eda14cbcSMatt Macy b2 = s[0]; 992eda14cbcSMatt Macy b3 = s[1]; 993eda14cbcSMatt Macy b4 = s[2]; 994eda14cbcSMatt Macy } else if (sz == 4) { 995eda14cbcSMatt Macy b1 = s[0]; 996eda14cbcSMatt Macy b2 = s[1]; 997eda14cbcSMatt Macy b3 = s[2]; 998eda14cbcSMatt Macy b4 = s[3]; 999eda14cbcSMatt Macy } else { 1000eda14cbcSMatt Macy /* 1001eda14cbcSMatt Macy * This is a fallback and should not happen if the function 1002eda14cbcSMatt Macy * was called properly. 1003eda14cbcSMatt Macy */ 1004eda14cbcSMatt Macy return (NULL); 1005eda14cbcSMatt Macy } 1006eda14cbcSMatt Macy 1007eda14cbcSMatt Macy b1 = u8_composition_b1_tbl[uv][b1]; 1008eda14cbcSMatt Macy if (b1 == U8_TBL_ELEMENT_NOT_DEF) 1009eda14cbcSMatt Macy return (NULL); 1010eda14cbcSMatt Macy 1011eda14cbcSMatt Macy b2 = u8_composition_b2_tbl[uv][b1][b2]; 1012eda14cbcSMatt Macy if (b2 == U8_TBL_ELEMENT_NOT_DEF) 1013eda14cbcSMatt Macy return (NULL); 1014eda14cbcSMatt Macy 1015eda14cbcSMatt Macy b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id; 1016eda14cbcSMatt Macy if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 1017eda14cbcSMatt Macy return (NULL); 1018eda14cbcSMatt Macy 1019eda14cbcSMatt Macy if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { 1020eda14cbcSMatt Macy b3_tbl -= U8_16BIT_TABLE_INDICATOR; 1021eda14cbcSMatt Macy start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; 1022eda14cbcSMatt Macy end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; 1023eda14cbcSMatt Macy } else { 102481b22a98SMartin Matuska // cppcheck-suppress arrayIndexOutOfBoundsCond 1025eda14cbcSMatt Macy start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; 102681b22a98SMartin Matuska // cppcheck-suppress arrayIndexOutOfBoundsCond 1027eda14cbcSMatt Macy end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; 1028eda14cbcSMatt Macy } 1029eda14cbcSMatt Macy 1030eda14cbcSMatt Macy if (start_id >= end_id) 1031eda14cbcSMatt Macy return (NULL); 1032eda14cbcSMatt Macy 1033eda14cbcSMatt Macy b3_base = u8_composition_b3_tbl[uv][b2][b3].base; 1034eda14cbcSMatt Macy 1035eda14cbcSMatt Macy return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id])); 1036eda14cbcSMatt Macy } 1037eda14cbcSMatt Macy 1038eda14cbcSMatt Macy /* 1039eda14cbcSMatt Macy * The blocked() function checks on the combining class values of previous 1040eda14cbcSMatt Macy * characters in this sequence and return whether it is blocked or not. 1041eda14cbcSMatt Macy */ 1042eda14cbcSMatt Macy static boolean_t 1043eda14cbcSMatt Macy blocked(uchar_t *comb_class, size_t last) 1044eda14cbcSMatt Macy { 1045eda14cbcSMatt Macy uchar_t my_comb_class; 1046eda14cbcSMatt Macy size_t i; 1047eda14cbcSMatt Macy 1048eda14cbcSMatt Macy my_comb_class = comb_class[last]; 1049eda14cbcSMatt Macy for (i = 1; i < last; i++) 1050eda14cbcSMatt Macy if (comb_class[i] >= my_comb_class || 1051eda14cbcSMatt Macy comb_class[i] == U8_COMBINING_CLASS_STARTER) 1052eda14cbcSMatt Macy return (B_TRUE); 1053eda14cbcSMatt Macy 1054eda14cbcSMatt Macy return (B_FALSE); 1055eda14cbcSMatt Macy } 1056eda14cbcSMatt Macy 1057eda14cbcSMatt Macy /* 1058eda14cbcSMatt Macy * The do_composition() reads the character string pointed by 's' and 1059eda14cbcSMatt Macy * do necessary canonical composition and then copy over the result back to 1060eda14cbcSMatt Macy * the 's'. 1061eda14cbcSMatt Macy * 1062eda14cbcSMatt Macy * The input argument 's' cannot contain more than 32 characters. 1063eda14cbcSMatt Macy */ 1064eda14cbcSMatt Macy static size_t 1065eda14cbcSMatt Macy do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start, 1066eda14cbcSMatt Macy uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) 1067eda14cbcSMatt Macy { 1068eda14cbcSMatt Macy uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1]; 1069eda14cbcSMatt Macy uchar_t tc[U8_MB_CUR_MAX] = { '\0' }; 1070eda14cbcSMatt Macy uint8_t saved_marks[U8_MAX_CHARS_A_SEQ]; 1071eda14cbcSMatt Macy size_t saved_marks_count; 1072eda14cbcSMatt Macy uchar_t *p; 1073eda14cbcSMatt Macy uchar_t *saved_p; 1074eda14cbcSMatt Macy uchar_t *q; 1075eda14cbcSMatt Macy size_t i; 1076eda14cbcSMatt Macy size_t saved_i; 1077eda14cbcSMatt Macy size_t j; 1078eda14cbcSMatt Macy size_t k; 1079eda14cbcSMatt Macy size_t l; 1080eda14cbcSMatt Macy size_t C; 1081eda14cbcSMatt Macy size_t saved_l; 1082eda14cbcSMatt Macy size_t size; 1083eda14cbcSMatt Macy uint32_t u1; 1084eda14cbcSMatt Macy uint32_t u2; 1085eda14cbcSMatt Macy boolean_t match_not_found = B_TRUE; 1086eda14cbcSMatt Macy 1087eda14cbcSMatt Macy /* 1088eda14cbcSMatt Macy * This should never happen unless the callers are doing some strange 1089eda14cbcSMatt Macy * and unexpected things. 1090eda14cbcSMatt Macy * 1091eda14cbcSMatt Macy * The "last" is the index pointing to the last character not last + 1. 1092eda14cbcSMatt Macy */ 1093eda14cbcSMatt Macy if (last >= U8_MAX_CHARS_A_SEQ) 1094eda14cbcSMatt Macy last = U8_UPPER_LIMIT_IN_A_SEQ; 1095eda14cbcSMatt Macy 1096eda14cbcSMatt Macy for (i = l = 0; i <= last; i++) { 1097eda14cbcSMatt Macy /* 1098eda14cbcSMatt Macy * The last or any non-Starters at the beginning, we don't 1099eda14cbcSMatt Macy * have any chance to do composition and so we just copy them 1100eda14cbcSMatt Macy * to the temporary buffer. 1101eda14cbcSMatt Macy */ 1102eda14cbcSMatt Macy if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) { 1103eda14cbcSMatt Macy SAVE_THE_CHAR: 1104eda14cbcSMatt Macy p = s + start[i]; 1105eda14cbcSMatt Macy size = disp[i]; 1106eda14cbcSMatt Macy for (k = 0; k < size; k++) 1107eda14cbcSMatt Macy t[l++] = *p++; 1108eda14cbcSMatt Macy continue; 1109eda14cbcSMatt Macy } 1110eda14cbcSMatt Macy 1111eda14cbcSMatt Macy /* 1112eda14cbcSMatt Macy * If this could be a start of Hangul Jamos, then, we try to 1113eda14cbcSMatt Macy * conjoin them. 1114eda14cbcSMatt Macy */ 1115eda14cbcSMatt Macy if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) { 1116eda14cbcSMatt Macy U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]], 1117eda14cbcSMatt Macy s[start[i] + 1], s[start[i] + 2]); 1118eda14cbcSMatt Macy U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3], 1119eda14cbcSMatt Macy s[start[i] + 4], s[start[i] + 5]); 1120eda14cbcSMatt Macy 1121eda14cbcSMatt Macy if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) { 1122eda14cbcSMatt Macy u1 -= U8_HANGUL_JAMO_L_FIRST; 1123eda14cbcSMatt Macy u2 -= U8_HANGUL_JAMO_V_FIRST; 1124eda14cbcSMatt Macy u1 = U8_HANGUL_SYL_FIRST + 1125eda14cbcSMatt Macy (u1 * U8_HANGUL_V_COUNT + u2) * 1126eda14cbcSMatt Macy U8_HANGUL_T_COUNT; 1127eda14cbcSMatt Macy 1128eda14cbcSMatt Macy i += 2; 1129eda14cbcSMatt Macy if (i <= last) { 1130eda14cbcSMatt Macy U8_PUT_3BYTES_INTO_UTF32(u2, 1131eda14cbcSMatt Macy s[start[i]], s[start[i] + 1], 1132eda14cbcSMatt Macy s[start[i] + 2]); 1133eda14cbcSMatt Macy 1134eda14cbcSMatt Macy if (U8_HANGUL_JAMO_T(u2)) { 1135eda14cbcSMatt Macy u1 += u2 - 1136eda14cbcSMatt Macy U8_HANGUL_JAMO_T_FIRST; 1137eda14cbcSMatt Macy i++; 1138eda14cbcSMatt Macy } 1139eda14cbcSMatt Macy } 1140eda14cbcSMatt Macy 1141eda14cbcSMatt Macy U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1); 1142eda14cbcSMatt Macy i--; 1143eda14cbcSMatt Macy l += 3; 1144eda14cbcSMatt Macy continue; 1145eda14cbcSMatt Macy } 1146eda14cbcSMatt Macy } 1147eda14cbcSMatt Macy 1148eda14cbcSMatt Macy /* 1149eda14cbcSMatt Macy * Let's then find out if this Starter has composition 1150eda14cbcSMatt Macy * mapping. 1151eda14cbcSMatt Macy */ 1152eda14cbcSMatt Macy p = find_composition_start(uv, s + start[i], disp[i]); 1153eda14cbcSMatt Macy if (p == NULL) 1154eda14cbcSMatt Macy goto SAVE_THE_CHAR; 1155eda14cbcSMatt Macy 1156eda14cbcSMatt Macy /* 1157eda14cbcSMatt Macy * We have a Starter with composition mapping and the next 1158eda14cbcSMatt Macy * character is a non-Starter. Let's try to find out if 1159eda14cbcSMatt Macy * we can do composition. 1160eda14cbcSMatt Macy */ 1161eda14cbcSMatt Macy 1162eda14cbcSMatt Macy saved_p = p; 1163eda14cbcSMatt Macy saved_i = i; 1164eda14cbcSMatt Macy saved_l = l; 1165eda14cbcSMatt Macy saved_marks_count = 0; 1166eda14cbcSMatt Macy 1167eda14cbcSMatt Macy TRY_THE_NEXT_MARK: 1168eda14cbcSMatt Macy q = s + start[++i]; 1169eda14cbcSMatt Macy size = disp[i]; 1170eda14cbcSMatt Macy 1171eda14cbcSMatt Macy /* 1172eda14cbcSMatt Macy * The next for() loop compares the non-Starter pointed by 1173eda14cbcSMatt Macy * 'q' with the possible (joinable) characters pointed by 'p'. 1174eda14cbcSMatt Macy * 1175eda14cbcSMatt Macy * The composition final table entry pointed by the 'p' 1176eda14cbcSMatt Macy * looks like the following: 1177eda14cbcSMatt Macy * 1178eda14cbcSMatt Macy * +---+---+---+-...-+---+---+---+---+-...-+---+---+ 1179eda14cbcSMatt Macy * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F | 1180eda14cbcSMatt Macy * +---+---+---+-...-+---+---+---+---+-...-+---+---+ 1181eda14cbcSMatt Macy * 1182eda14cbcSMatt Macy * where C is the count byte indicating the number of 1183eda14cbcSMatt Macy * mapping pairs where each pair would be look like 1184eda14cbcSMatt Macy * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second 1185eda14cbcSMatt Macy * character of a canonical decomposition and the B0-Bm are 1186eda14cbcSMatt Macy * the bytes of a matching composite character. The F is 1187eda14cbcSMatt Macy * a filler byte after each character as the separator. 1188eda14cbcSMatt Macy */ 1189eda14cbcSMatt Macy 1190eda14cbcSMatt Macy match_not_found = B_TRUE; 1191eda14cbcSMatt Macy 1192eda14cbcSMatt Macy for (C = *p++; C > 0; C--) { 1193eda14cbcSMatt Macy for (k = 0; k < size; p++, k++) 1194eda14cbcSMatt Macy if (*p != q[k]) 1195eda14cbcSMatt Macy break; 1196eda14cbcSMatt Macy 1197eda14cbcSMatt Macy /* Have we found it? */ 1198eda14cbcSMatt Macy if (k >= size && *p == U8_TBL_ELEMENT_FILLER) { 1199eda14cbcSMatt Macy match_not_found = B_FALSE; 1200eda14cbcSMatt Macy 1201eda14cbcSMatt Macy l = saved_l; 1202eda14cbcSMatt Macy 1203eda14cbcSMatt Macy while (*++p != U8_TBL_ELEMENT_FILLER) 1204eda14cbcSMatt Macy t[l++] = *p; 1205eda14cbcSMatt Macy 1206eda14cbcSMatt Macy break; 1207eda14cbcSMatt Macy } 1208eda14cbcSMatt Macy 1209eda14cbcSMatt Macy /* We didn't find; skip to the next pair. */ 1210eda14cbcSMatt Macy if (*p != U8_TBL_ELEMENT_FILLER) 1211eda14cbcSMatt Macy while (*++p != U8_TBL_ELEMENT_FILLER) 1212eda14cbcSMatt Macy ; 1213eda14cbcSMatt Macy while (*++p != U8_TBL_ELEMENT_FILLER) 1214eda14cbcSMatt Macy ; 1215eda14cbcSMatt Macy p++; 1216eda14cbcSMatt Macy } 1217eda14cbcSMatt Macy 1218eda14cbcSMatt Macy /* 1219eda14cbcSMatt Macy * If there was no match, we will need to save the combining 1220eda14cbcSMatt Macy * mark for later appending. After that, if the next one 1221eda14cbcSMatt Macy * is a non-Starter and not blocked, then, we try once 1222eda14cbcSMatt Macy * again to do composition with the next non-Starter. 1223eda14cbcSMatt Macy * 1224eda14cbcSMatt Macy * If there was no match and this was a Starter, then, 1225eda14cbcSMatt Macy * this is a new start. 1226eda14cbcSMatt Macy * 1227eda14cbcSMatt Macy * If there was a match and a composition done and we have 1228eda14cbcSMatt Macy * more to check on, then, we retrieve a new composition final 1229eda14cbcSMatt Macy * table entry for the composite and then try to do the 1230eda14cbcSMatt Macy * composition again. 1231eda14cbcSMatt Macy */ 1232eda14cbcSMatt Macy 1233eda14cbcSMatt Macy if (match_not_found) { 1234eda14cbcSMatt Macy if (comb_class[i] == U8_COMBINING_CLASS_STARTER) { 1235eda14cbcSMatt Macy i--; 1236eda14cbcSMatt Macy goto SAVE_THE_CHAR; 1237eda14cbcSMatt Macy } 1238eda14cbcSMatt Macy 1239eda14cbcSMatt Macy saved_marks[saved_marks_count++] = i; 1240eda14cbcSMatt Macy } 1241eda14cbcSMatt Macy 1242eda14cbcSMatt Macy if (saved_l == l) { 1243eda14cbcSMatt Macy while (i < last) { 1244eda14cbcSMatt Macy if (blocked(comb_class, i + 1)) 1245eda14cbcSMatt Macy saved_marks[saved_marks_count++] = ++i; 1246eda14cbcSMatt Macy else 1247eda14cbcSMatt Macy break; 1248eda14cbcSMatt Macy } 1249eda14cbcSMatt Macy if (i < last) { 1250eda14cbcSMatt Macy p = saved_p; 1251eda14cbcSMatt Macy goto TRY_THE_NEXT_MARK; 1252eda14cbcSMatt Macy } 1253eda14cbcSMatt Macy } else if (i < last) { 1254eda14cbcSMatt Macy p = find_composition_start(uv, t + saved_l, 1255eda14cbcSMatt Macy l - saved_l); 1256eda14cbcSMatt Macy if (p != NULL) { 1257eda14cbcSMatt Macy saved_p = p; 1258eda14cbcSMatt Macy goto TRY_THE_NEXT_MARK; 1259eda14cbcSMatt Macy } 1260eda14cbcSMatt Macy } 1261eda14cbcSMatt Macy 1262eda14cbcSMatt Macy /* 1263eda14cbcSMatt Macy * There is no more composition possible. 1264eda14cbcSMatt Macy * 1265eda14cbcSMatt Macy * If there was no composition what so ever then we copy 1266eda14cbcSMatt Macy * over the original Starter and then append any non-Starters 1267eda14cbcSMatt Macy * remaining at the target string sequentially after that. 1268eda14cbcSMatt Macy */ 1269eda14cbcSMatt Macy 1270eda14cbcSMatt Macy if (saved_l == l) { 1271eda14cbcSMatt Macy p = s + start[saved_i]; 1272eda14cbcSMatt Macy size = disp[saved_i]; 1273eda14cbcSMatt Macy for (j = 0; j < size; j++) 1274eda14cbcSMatt Macy t[l++] = *p++; 1275eda14cbcSMatt Macy } 1276eda14cbcSMatt Macy 1277eda14cbcSMatt Macy for (k = 0; k < saved_marks_count; k++) { 1278eda14cbcSMatt Macy p = s + start[saved_marks[k]]; 1279eda14cbcSMatt Macy size = disp[saved_marks[k]]; 1280eda14cbcSMatt Macy for (j = 0; j < size; j++) 1281eda14cbcSMatt Macy t[l++] = *p++; 1282eda14cbcSMatt Macy } 1283eda14cbcSMatt Macy } 1284eda14cbcSMatt Macy 1285eda14cbcSMatt Macy /* 1286eda14cbcSMatt Macy * If the last character is a Starter and if we have a character 1287eda14cbcSMatt Macy * (possibly another Starter) that can be turned into a composite, 1288eda14cbcSMatt Macy * we do so and we do so until there is no more of composition 1289eda14cbcSMatt Macy * possible. 1290eda14cbcSMatt Macy */ 1291eda14cbcSMatt Macy if (comb_class[last] == U8_COMBINING_CLASS_STARTER) { 1292eda14cbcSMatt Macy p = *os; 1293eda14cbcSMatt Macy saved_l = l - disp[last]; 1294eda14cbcSMatt Macy 1295eda14cbcSMatt Macy while (p < oslast) { 129615f0b8c3SMartin Matuska int8_t number_of_bytes = u8_number_of_bytes[*p]; 129715f0b8c3SMartin Matuska 129815f0b8c3SMartin Matuska if (number_of_bytes <= 1) 129915f0b8c3SMartin Matuska break; 130015f0b8c3SMartin Matuska size = number_of_bytes; 130115f0b8c3SMartin Matuska if ((p + size) > oslast) 1302eda14cbcSMatt Macy break; 1303eda14cbcSMatt Macy 1304eda14cbcSMatt Macy saved_p = p; 1305eda14cbcSMatt Macy 1306eda14cbcSMatt Macy for (i = 0; i < size; i++) 1307eda14cbcSMatt Macy tc[i] = *p++; 1308eda14cbcSMatt Macy 1309eda14cbcSMatt Macy q = find_composition_start(uv, t + saved_l, 1310eda14cbcSMatt Macy l - saved_l); 1311eda14cbcSMatt Macy if (q == NULL) { 1312eda14cbcSMatt Macy p = saved_p; 1313eda14cbcSMatt Macy break; 1314eda14cbcSMatt Macy } 1315eda14cbcSMatt Macy 1316eda14cbcSMatt Macy match_not_found = B_TRUE; 1317eda14cbcSMatt Macy 1318eda14cbcSMatt Macy for (C = *q++; C > 0; C--) { 1319eda14cbcSMatt Macy for (k = 0; k < size; q++, k++) 1320eda14cbcSMatt Macy if (*q != tc[k]) 1321eda14cbcSMatt Macy break; 1322eda14cbcSMatt Macy 1323eda14cbcSMatt Macy if (k >= size && *q == U8_TBL_ELEMENT_FILLER) { 1324eda14cbcSMatt Macy match_not_found = B_FALSE; 1325eda14cbcSMatt Macy 1326eda14cbcSMatt Macy l = saved_l; 1327eda14cbcSMatt Macy 1328eda14cbcSMatt Macy while (*++q != U8_TBL_ELEMENT_FILLER) { 1329eda14cbcSMatt Macy /* 1330eda14cbcSMatt Macy * This is practically 1331eda14cbcSMatt Macy * impossible but we don't 1332eda14cbcSMatt Macy * want to take any chances. 1333eda14cbcSMatt Macy */ 1334eda14cbcSMatt Macy if (l >= 1335eda14cbcSMatt Macy U8_STREAM_SAFE_TEXT_MAX) { 1336eda14cbcSMatt Macy p = saved_p; 1337eda14cbcSMatt Macy goto SAFE_RETURN; 1338eda14cbcSMatt Macy } 1339eda14cbcSMatt Macy t[l++] = *q; 1340eda14cbcSMatt Macy } 1341eda14cbcSMatt Macy 1342eda14cbcSMatt Macy break; 1343eda14cbcSMatt Macy } 1344eda14cbcSMatt Macy 1345eda14cbcSMatt Macy if (*q != U8_TBL_ELEMENT_FILLER) 1346eda14cbcSMatt Macy while (*++q != U8_TBL_ELEMENT_FILLER) 1347eda14cbcSMatt Macy ; 1348eda14cbcSMatt Macy while (*++q != U8_TBL_ELEMENT_FILLER) 1349eda14cbcSMatt Macy ; 1350eda14cbcSMatt Macy q++; 1351eda14cbcSMatt Macy } 1352eda14cbcSMatt Macy 1353eda14cbcSMatt Macy if (match_not_found) { 1354eda14cbcSMatt Macy p = saved_p; 1355eda14cbcSMatt Macy break; 1356eda14cbcSMatt Macy } 1357eda14cbcSMatt Macy } 1358eda14cbcSMatt Macy SAFE_RETURN: 1359eda14cbcSMatt Macy *os = p; 1360eda14cbcSMatt Macy } 1361eda14cbcSMatt Macy 1362eda14cbcSMatt Macy /* 1363eda14cbcSMatt Macy * Now we copy over the temporary string to the target string. 1364eda14cbcSMatt Macy * Since composition always reduces the number of characters or 1365eda14cbcSMatt Macy * the number of characters stay, we don't need to worry about 1366eda14cbcSMatt Macy * the buffer overflow here. 1367eda14cbcSMatt Macy */ 1368eda14cbcSMatt Macy for (i = 0; i < l; i++) 1369eda14cbcSMatt Macy s[i] = t[i]; 1370eda14cbcSMatt Macy s[l] = '\0'; 1371eda14cbcSMatt Macy 1372eda14cbcSMatt Macy return (l); 1373eda14cbcSMatt Macy } 1374eda14cbcSMatt Macy 1375eda14cbcSMatt Macy /* 1376eda14cbcSMatt Macy * The collect_a_seq() function checks on the given string s, collect 1377eda14cbcSMatt Macy * a sequence of characters at u8s, and return the sequence. While it collects 1378eda14cbcSMatt Macy * a sequence, it also applies case conversion, canonical or compatibility 1379eda14cbcSMatt Macy * decomposition, canonical decomposition, or some or all of them and 1380eda14cbcSMatt Macy * in that order. 1381eda14cbcSMatt Macy * 1382eda14cbcSMatt Macy * The collected sequence cannot be bigger than 32 characters since if 1383eda14cbcSMatt Macy * it is having more than 31 characters, the sequence will be terminated 1384eda14cbcSMatt Macy * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into 1385eda14cbcSMatt Macy * a Stream-Safe Text. The collected sequence is always terminated with 1386eda14cbcSMatt Macy * a null byte and the return value is the byte length of the sequence 1387eda14cbcSMatt Macy * including 0. The return value does not include the terminating 1388eda14cbcSMatt Macy * null byte. 1389eda14cbcSMatt Macy */ 1390eda14cbcSMatt Macy static size_t 1391eda14cbcSMatt Macy collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast, 139215f0b8c3SMartin Matuska boolean_t is_it_toupper, 139315f0b8c3SMartin Matuska boolean_t is_it_tolower, 139415f0b8c3SMartin Matuska boolean_t canonical_decomposition, 139515f0b8c3SMartin Matuska boolean_t compatibility_decomposition, 1396eda14cbcSMatt Macy boolean_t canonical_composition, 1397eda14cbcSMatt Macy int *errnum, u8_normalization_states_t *state) 1398eda14cbcSMatt Macy { 1399eda14cbcSMatt Macy uchar_t *s; 1400eda14cbcSMatt Macy int sz; 1401eda14cbcSMatt Macy int saved_sz; 1402eda14cbcSMatt Macy size_t i; 1403eda14cbcSMatt Macy size_t j; 1404eda14cbcSMatt Macy size_t k; 1405eda14cbcSMatt Macy size_t l; 1406eda14cbcSMatt Macy uchar_t comb_class[U8_MAX_CHARS_A_SEQ]; 1407eda14cbcSMatt Macy uchar_t disp[U8_MAX_CHARS_A_SEQ]; 1408eda14cbcSMatt Macy uchar_t start[U8_MAX_CHARS_A_SEQ]; 1409eda14cbcSMatt Macy uchar_t u8t[U8_MB_CUR_MAX] = { '\0' }; 1410eda14cbcSMatt Macy uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1]; 1411eda14cbcSMatt Macy uchar_t tc; 1412eda14cbcSMatt Macy size_t last; 1413eda14cbcSMatt Macy size_t saved_last; 1414eda14cbcSMatt Macy uint32_t u1; 1415eda14cbcSMatt Macy 1416eda14cbcSMatt Macy /* 1417eda14cbcSMatt Macy * Save the source string pointer which we will return a changed 1418eda14cbcSMatt Macy * pointer if we do processing. 1419eda14cbcSMatt Macy */ 1420eda14cbcSMatt Macy s = *source; 1421eda14cbcSMatt Macy 1422eda14cbcSMatt Macy /* 1423eda14cbcSMatt Macy * The following is a fallback for just in case callers are not 1424eda14cbcSMatt Macy * checking the string boundaries before the calling. 1425eda14cbcSMatt Macy */ 1426eda14cbcSMatt Macy if (s >= slast) { 1427eda14cbcSMatt Macy u8s[0] = '\0'; 1428eda14cbcSMatt Macy 1429eda14cbcSMatt Macy return (0); 1430eda14cbcSMatt Macy } 1431eda14cbcSMatt Macy 1432eda14cbcSMatt Macy /* 1433eda14cbcSMatt Macy * As the first thing, let's collect a character and do case 1434eda14cbcSMatt Macy * conversion if necessary. 1435eda14cbcSMatt Macy */ 1436eda14cbcSMatt Macy 1437eda14cbcSMatt Macy sz = u8_number_of_bytes[*s]; 1438eda14cbcSMatt Macy 1439eda14cbcSMatt Macy if (sz < 0) { 1440eda14cbcSMatt Macy *errnum = EILSEQ; 1441eda14cbcSMatt Macy 1442eda14cbcSMatt Macy u8s[0] = *s++; 1443eda14cbcSMatt Macy u8s[1] = '\0'; 1444eda14cbcSMatt Macy 1445eda14cbcSMatt Macy *source = s; 1446eda14cbcSMatt Macy 1447eda14cbcSMatt Macy return (1); 1448eda14cbcSMatt Macy } 1449eda14cbcSMatt Macy 1450eda14cbcSMatt Macy if (sz == 1) { 1451eda14cbcSMatt Macy if (is_it_toupper) 1452eda14cbcSMatt Macy u8s[0] = U8_ASCII_TOUPPER(*s); 1453eda14cbcSMatt Macy else if (is_it_tolower) 1454eda14cbcSMatt Macy u8s[0] = U8_ASCII_TOLOWER(*s); 1455eda14cbcSMatt Macy else 1456eda14cbcSMatt Macy u8s[0] = *s; 1457eda14cbcSMatt Macy s++; 1458eda14cbcSMatt Macy u8s[1] = '\0'; 1459eda14cbcSMatt Macy } else if ((s + sz) > slast) { 1460eda14cbcSMatt Macy *errnum = EINVAL; 1461eda14cbcSMatt Macy 1462eda14cbcSMatt Macy for (i = 0; s < slast; ) 1463eda14cbcSMatt Macy u8s[i++] = *s++; 1464eda14cbcSMatt Macy u8s[i] = '\0'; 1465eda14cbcSMatt Macy 1466eda14cbcSMatt Macy *source = s; 1467eda14cbcSMatt Macy 1468eda14cbcSMatt Macy return (i); 1469eda14cbcSMatt Macy } else { 1470eda14cbcSMatt Macy if (is_it_toupper || is_it_tolower) { 1471eda14cbcSMatt Macy i = do_case_conv(uv, u8s, s, sz, is_it_toupper); 1472eda14cbcSMatt Macy s += sz; 1473eda14cbcSMatt Macy sz = i; 1474eda14cbcSMatt Macy } else { 1475eda14cbcSMatt Macy for (i = 0; i < sz; ) 1476eda14cbcSMatt Macy u8s[i++] = *s++; 1477eda14cbcSMatt Macy u8s[i] = '\0'; 1478eda14cbcSMatt Macy } 1479eda14cbcSMatt Macy } 1480eda14cbcSMatt Macy 1481eda14cbcSMatt Macy /* 1482eda14cbcSMatt Macy * And then canonical/compatibility decomposition followed by 1483eda14cbcSMatt Macy * an optional canonical composition. Please be noted that 1484eda14cbcSMatt Macy * canonical composition is done only when a decomposition is 1485eda14cbcSMatt Macy * done. 1486eda14cbcSMatt Macy */ 1487eda14cbcSMatt Macy if (canonical_decomposition || compatibility_decomposition) { 1488eda14cbcSMatt Macy if (sz == 1) { 1489eda14cbcSMatt Macy *state = U8_STATE_START; 1490eda14cbcSMatt Macy 1491eda14cbcSMatt Macy saved_sz = 1; 1492eda14cbcSMatt Macy 1493eda14cbcSMatt Macy comb_class[0] = 0; 1494eda14cbcSMatt Macy start[0] = 0; 1495eda14cbcSMatt Macy disp[0] = 1; 1496eda14cbcSMatt Macy 1497eda14cbcSMatt Macy last = 1; 1498eda14cbcSMatt Macy } else { 1499eda14cbcSMatt Macy saved_sz = do_decomp(uv, u8s, u8s, sz, 1500eda14cbcSMatt Macy canonical_decomposition, state); 1501eda14cbcSMatt Macy 1502eda14cbcSMatt Macy last = 0; 1503eda14cbcSMatt Macy 1504eda14cbcSMatt Macy for (i = 0; i < saved_sz; ) { 1505eda14cbcSMatt Macy sz = u8_number_of_bytes[u8s[i]]; 1506eda14cbcSMatt Macy 1507eda14cbcSMatt Macy comb_class[last] = combining_class(uv, 1508eda14cbcSMatt Macy u8s + i, sz); 1509eda14cbcSMatt Macy start[last] = i; 1510eda14cbcSMatt Macy disp[last] = sz; 1511eda14cbcSMatt Macy 1512eda14cbcSMatt Macy last++; 1513eda14cbcSMatt Macy i += sz; 1514eda14cbcSMatt Macy } 1515eda14cbcSMatt Macy 1516eda14cbcSMatt Macy /* 1517eda14cbcSMatt Macy * Decomposition yields various Hangul related 1518eda14cbcSMatt Macy * states but not on combining marks. We need to 1519eda14cbcSMatt Macy * find out at here by checking on the last 1520eda14cbcSMatt Macy * character. 1521eda14cbcSMatt Macy */ 1522eda14cbcSMatt Macy if (*state == U8_STATE_START) { 1523eda14cbcSMatt Macy if (comb_class[last - 1]) 1524eda14cbcSMatt Macy *state = U8_STATE_COMBINING_MARK; 1525eda14cbcSMatt Macy } 1526eda14cbcSMatt Macy } 1527eda14cbcSMatt Macy 1528eda14cbcSMatt Macy saved_last = last; 1529eda14cbcSMatt Macy 1530eda14cbcSMatt Macy while (s < slast) { 1531eda14cbcSMatt Macy sz = u8_number_of_bytes[*s]; 1532eda14cbcSMatt Macy 1533eda14cbcSMatt Macy /* 1534eda14cbcSMatt Macy * If this is an illegal character, an incomplete 1535eda14cbcSMatt Macy * character, or an 7-bit ASCII Starter character, 1536eda14cbcSMatt Macy * then we have collected a sequence; break and let 1537eda14cbcSMatt Macy * the next call deal with the two cases. 1538eda14cbcSMatt Macy * 1539eda14cbcSMatt Macy * Note that this is okay only if you are using this 1540eda14cbcSMatt Macy * function with a fixed length string, not on 1541eda14cbcSMatt Macy * a buffer with multiple calls of one chunk at a time. 1542eda14cbcSMatt Macy */ 1543eda14cbcSMatt Macy if (sz <= 1) { 1544eda14cbcSMatt Macy break; 1545eda14cbcSMatt Macy } else if ((s + sz) > slast) { 1546eda14cbcSMatt Macy break; 1547eda14cbcSMatt Macy } else { 1548eda14cbcSMatt Macy /* 1549eda14cbcSMatt Macy * If the previous character was a Hangul Jamo 1550eda14cbcSMatt Macy * and this character is a Hangul Jamo that 1551eda14cbcSMatt Macy * can be conjoined, we collect the Jamo. 1552eda14cbcSMatt Macy */ 1553eda14cbcSMatt Macy if (*s == U8_HANGUL_JAMO_1ST_BYTE) { 1554eda14cbcSMatt Macy U8_PUT_3BYTES_INTO_UTF32(u1, 1555eda14cbcSMatt Macy *s, *(s + 1), *(s + 2)); 1556eda14cbcSMatt Macy 1557eda14cbcSMatt Macy if (U8_HANGUL_COMPOSABLE_L_V(*state, 1558eda14cbcSMatt Macy u1)) { 1559eda14cbcSMatt Macy i = 0; 1560eda14cbcSMatt Macy *state = U8_STATE_HANGUL_LV; 1561eda14cbcSMatt Macy goto COLLECT_A_HANGUL; 1562eda14cbcSMatt Macy } 1563eda14cbcSMatt Macy 1564eda14cbcSMatt Macy if (U8_HANGUL_COMPOSABLE_LV_T(*state, 1565eda14cbcSMatt Macy u1)) { 1566eda14cbcSMatt Macy i = 0; 1567eda14cbcSMatt Macy *state = U8_STATE_HANGUL_LVT; 1568eda14cbcSMatt Macy goto COLLECT_A_HANGUL; 1569eda14cbcSMatt Macy } 1570eda14cbcSMatt Macy } 1571eda14cbcSMatt Macy 1572eda14cbcSMatt Macy /* 1573eda14cbcSMatt Macy * Regardless of whatever it was, if this is 1574eda14cbcSMatt Macy * a Starter, we don't collect the character 1575eda14cbcSMatt Macy * since that's a new start and we will deal 1576eda14cbcSMatt Macy * with it at the next time. 1577eda14cbcSMatt Macy */ 1578eda14cbcSMatt Macy i = combining_class(uv, s, sz); 1579eda14cbcSMatt Macy if (i == U8_COMBINING_CLASS_STARTER) 1580eda14cbcSMatt Macy break; 1581eda14cbcSMatt Macy 1582eda14cbcSMatt Macy /* 1583eda14cbcSMatt Macy * We know the current character is a combining 1584eda14cbcSMatt Macy * mark. If the previous character wasn't 1585eda14cbcSMatt Macy * a Starter (not Hangul) or a combining mark, 1586eda14cbcSMatt Macy * then, we don't collect this combining mark. 1587eda14cbcSMatt Macy */ 1588eda14cbcSMatt Macy if (*state != U8_STATE_START && 1589eda14cbcSMatt Macy *state != U8_STATE_COMBINING_MARK) 1590eda14cbcSMatt Macy break; 1591eda14cbcSMatt Macy 1592eda14cbcSMatt Macy *state = U8_STATE_COMBINING_MARK; 1593eda14cbcSMatt Macy COLLECT_A_HANGUL: 1594eda14cbcSMatt Macy /* 1595eda14cbcSMatt Macy * If we collected a Starter and combining 1596eda14cbcSMatt Macy * marks up to 30, i.e., total 31 characters, 1597eda14cbcSMatt Macy * then, we terminate this degenerately long 1598eda14cbcSMatt Macy * combining sequence with a U+034F COMBINING 1599eda14cbcSMatt Macy * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in 1600eda14cbcSMatt Macy * UTF-8 and turn this into a Stream-Safe 1601eda14cbcSMatt Macy * Text. This will be extremely rare but 1602eda14cbcSMatt Macy * possible. 1603eda14cbcSMatt Macy * 1604eda14cbcSMatt Macy * The following will also guarantee that 1605eda14cbcSMatt Macy * we are not writing more than 32 characters 1606eda14cbcSMatt Macy * plus a NULL at u8s[]. 1607eda14cbcSMatt Macy */ 1608eda14cbcSMatt Macy if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { 1609eda14cbcSMatt Macy TURN_STREAM_SAFE: 1610eda14cbcSMatt Macy *state = U8_STATE_START; 1611eda14cbcSMatt Macy comb_class[last] = 0; 1612eda14cbcSMatt Macy start[last] = saved_sz; 1613eda14cbcSMatt Macy disp[last] = 2; 1614eda14cbcSMatt Macy last++; 1615eda14cbcSMatt Macy 1616eda14cbcSMatt Macy u8s[saved_sz++] = 0xCD; 1617eda14cbcSMatt Macy u8s[saved_sz++] = 0x8F; 1618eda14cbcSMatt Macy 1619eda14cbcSMatt Macy break; 1620eda14cbcSMatt Macy } 1621eda14cbcSMatt Macy 1622eda14cbcSMatt Macy /* 1623eda14cbcSMatt Macy * Some combining marks also do decompose into 1624eda14cbcSMatt Macy * another combining mark or marks. 1625eda14cbcSMatt Macy */ 1626eda14cbcSMatt Macy if (*state == U8_STATE_COMBINING_MARK) { 1627eda14cbcSMatt Macy k = last; 1628eda14cbcSMatt Macy l = sz; 1629eda14cbcSMatt Macy i = do_decomp(uv, uts, s, sz, 1630eda14cbcSMatt Macy canonical_decomposition, state); 1631eda14cbcSMatt Macy for (j = 0; j < i; ) { 1632eda14cbcSMatt Macy sz = u8_number_of_bytes[uts[j]]; 1633eda14cbcSMatt Macy 1634eda14cbcSMatt Macy comb_class[last] = 1635eda14cbcSMatt Macy combining_class(uv, 1636eda14cbcSMatt Macy uts + j, sz); 1637eda14cbcSMatt Macy start[last] = saved_sz + j; 1638eda14cbcSMatt Macy disp[last] = sz; 1639eda14cbcSMatt Macy 1640eda14cbcSMatt Macy last++; 1641eda14cbcSMatt Macy if (last >= 1642eda14cbcSMatt Macy U8_UPPER_LIMIT_IN_A_SEQ) { 1643eda14cbcSMatt Macy last = k; 1644eda14cbcSMatt Macy goto TURN_STREAM_SAFE; 1645eda14cbcSMatt Macy } 1646eda14cbcSMatt Macy j += sz; 1647eda14cbcSMatt Macy } 1648eda14cbcSMatt Macy 1649eda14cbcSMatt Macy *state = U8_STATE_COMBINING_MARK; 1650eda14cbcSMatt Macy sz = i; 1651eda14cbcSMatt Macy s += l; 1652eda14cbcSMatt Macy 1653eda14cbcSMatt Macy for (i = 0; i < sz; i++) 1654eda14cbcSMatt Macy u8s[saved_sz++] = uts[i]; 1655eda14cbcSMatt Macy } else { 1656eda14cbcSMatt Macy comb_class[last] = i; 1657eda14cbcSMatt Macy start[last] = saved_sz; 1658eda14cbcSMatt Macy disp[last] = sz; 1659eda14cbcSMatt Macy last++; 1660eda14cbcSMatt Macy 1661eda14cbcSMatt Macy for (i = 0; i < sz; i++) 1662eda14cbcSMatt Macy u8s[saved_sz++] = *s++; 1663eda14cbcSMatt Macy } 1664eda14cbcSMatt Macy 1665eda14cbcSMatt Macy /* 1666eda14cbcSMatt Macy * If this is U+0345 COMBINING GREEK 1667eda14cbcSMatt Macy * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a., 1668eda14cbcSMatt Macy * iota subscript, and need to be converted to 1669eda14cbcSMatt Macy * uppercase letter, convert it to U+0399 GREEK 1670eda14cbcSMatt Macy * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8), 1671eda14cbcSMatt Macy * i.e., convert to capital adscript form as 1672eda14cbcSMatt Macy * specified in the Unicode standard. 1673eda14cbcSMatt Macy * 1674eda14cbcSMatt Macy * This is the only special case of (ambiguous) 1675eda14cbcSMatt Macy * case conversion at combining marks and 1676eda14cbcSMatt Macy * probably the standard will never have 1677eda14cbcSMatt Macy * anything similar like this in future. 1678eda14cbcSMatt Macy */ 1679eda14cbcSMatt Macy if (is_it_toupper && sz >= 2 && 1680eda14cbcSMatt Macy u8s[saved_sz - 2] == 0xCD && 1681eda14cbcSMatt Macy u8s[saved_sz - 1] == 0x85) { 1682eda14cbcSMatt Macy u8s[saved_sz - 2] = 0xCE; 1683eda14cbcSMatt Macy u8s[saved_sz - 1] = 0x99; 1684eda14cbcSMatt Macy } 1685eda14cbcSMatt Macy } 1686eda14cbcSMatt Macy } 1687eda14cbcSMatt Macy 1688eda14cbcSMatt Macy /* 1689eda14cbcSMatt Macy * Let's try to ensure a canonical ordering for the collected 1690eda14cbcSMatt Macy * combining marks. We do this only if we have collected 1691eda14cbcSMatt Macy * at least one more non-Starter. (The decomposition mapping 1692eda14cbcSMatt Macy * data tables have fully (and recursively) expanded and 1693eda14cbcSMatt Macy * canonically ordered decompositions.) 1694eda14cbcSMatt Macy * 1695eda14cbcSMatt Macy * The U8_SWAP_COMB_MARKS() convenience macro has some 1696eda14cbcSMatt Macy * assumptions and we are meeting the assumptions. 1697eda14cbcSMatt Macy */ 1698eda14cbcSMatt Macy last--; 1699eda14cbcSMatt Macy if (last >= saved_last) { 1700eda14cbcSMatt Macy for (i = 0; i < last; i++) 1701eda14cbcSMatt Macy for (j = last; j > i; j--) 1702eda14cbcSMatt Macy if (comb_class[j] && 1703eda14cbcSMatt Macy comb_class[j - 1] > comb_class[j]) { 1704eda14cbcSMatt Macy U8_SWAP_COMB_MARKS(j - 1, j); 1705eda14cbcSMatt Macy } 1706eda14cbcSMatt Macy } 1707eda14cbcSMatt Macy 1708eda14cbcSMatt Macy *source = s; 1709eda14cbcSMatt Macy 1710eda14cbcSMatt Macy if (! canonical_composition) { 1711eda14cbcSMatt Macy u8s[saved_sz] = '\0'; 1712eda14cbcSMatt Macy return (saved_sz); 1713eda14cbcSMatt Macy } 1714eda14cbcSMatt Macy 1715eda14cbcSMatt Macy /* 1716eda14cbcSMatt Macy * Now do the canonical composition. Note that we do this 1717eda14cbcSMatt Macy * only after a canonical or compatibility decomposition to 1718eda14cbcSMatt Macy * finish up NFC or NFKC. 1719eda14cbcSMatt Macy */ 1720eda14cbcSMatt Macy sz = do_composition(uv, u8s, comb_class, start, disp, last, 1721eda14cbcSMatt Macy &s, slast); 1722eda14cbcSMatt Macy } 1723eda14cbcSMatt Macy 1724eda14cbcSMatt Macy *source = s; 1725eda14cbcSMatt Macy 1726eda14cbcSMatt Macy return ((size_t)sz); 1727eda14cbcSMatt Macy } 1728eda14cbcSMatt Macy 1729eda14cbcSMatt Macy /* 1730eda14cbcSMatt Macy * The do_norm_compare() function does string comparison based on Unicode 1731eda14cbcSMatt Macy * simple case mappings and Unicode Normalization definitions. 1732eda14cbcSMatt Macy * 1733eda14cbcSMatt Macy * It does so by collecting a sequence of character at a time and comparing 1734eda14cbcSMatt Macy * the collected sequences from the strings. 1735eda14cbcSMatt Macy * 1736eda14cbcSMatt Macy * The meanings on the return values are the same as the usual strcmp(). 1737eda14cbcSMatt Macy */ 1738eda14cbcSMatt Macy static int 1739eda14cbcSMatt Macy do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2, 1740eda14cbcSMatt Macy int flag, int *errnum) 1741eda14cbcSMatt Macy { 1742eda14cbcSMatt Macy int result; 1743eda14cbcSMatt Macy size_t sz1; 1744eda14cbcSMatt Macy size_t sz2; 1745eda14cbcSMatt Macy uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1]; 1746eda14cbcSMatt Macy uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1]; 1747eda14cbcSMatt Macy uchar_t *s1last; 1748eda14cbcSMatt Macy uchar_t *s2last; 1749eda14cbcSMatt Macy boolean_t is_it_toupper; 1750eda14cbcSMatt Macy boolean_t is_it_tolower; 1751eda14cbcSMatt Macy boolean_t canonical_decomposition; 1752eda14cbcSMatt Macy boolean_t compatibility_decomposition; 1753eda14cbcSMatt Macy boolean_t canonical_composition; 1754eda14cbcSMatt Macy u8_normalization_states_t state; 1755eda14cbcSMatt Macy 1756eda14cbcSMatt Macy s1last = s1 + n1; 1757eda14cbcSMatt Macy s2last = s2 + n2; 1758eda14cbcSMatt Macy 1759eda14cbcSMatt Macy is_it_toupper = flag & U8_TEXTPREP_TOUPPER; 1760*5c65a0a9SMartin Matuska #ifdef U8_STRCMP_CI_LOWER 1761eda14cbcSMatt Macy is_it_tolower = flag & U8_TEXTPREP_TOLOWER; 1762*5c65a0a9SMartin Matuska #else 1763*5c65a0a9SMartin Matuska is_it_tolower = 0; 1764*5c65a0a9SMartin Matuska #endif 1765eda14cbcSMatt Macy canonical_decomposition = flag & U8_CANON_DECOMP; 1766eda14cbcSMatt Macy compatibility_decomposition = flag & U8_COMPAT_DECOMP; 1767eda14cbcSMatt Macy canonical_composition = flag & U8_CANON_COMP; 1768eda14cbcSMatt Macy 1769eda14cbcSMatt Macy while (s1 < s1last && s2 < s2last) { 1770eda14cbcSMatt Macy /* 1771eda14cbcSMatt Macy * If the current character is a 7-bit ASCII and the last 1772eda14cbcSMatt Macy * character, or, if the current character and the next 1773eda14cbcSMatt Macy * character are both some 7-bit ASCII characters then 1774eda14cbcSMatt Macy * we treat the current character as a sequence. 1775eda14cbcSMatt Macy * 1776eda14cbcSMatt Macy * In any other cases, we need to call collect_a_seq(). 1777eda14cbcSMatt Macy */ 1778eda14cbcSMatt Macy 1779eda14cbcSMatt Macy if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last || 1780eda14cbcSMatt Macy ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) { 1781eda14cbcSMatt Macy if (is_it_toupper) 1782eda14cbcSMatt Macy u8s1[0] = U8_ASCII_TOUPPER(*s1); 1783eda14cbcSMatt Macy else if (is_it_tolower) 1784eda14cbcSMatt Macy u8s1[0] = U8_ASCII_TOLOWER(*s1); 1785eda14cbcSMatt Macy else 1786eda14cbcSMatt Macy u8s1[0] = *s1; 1787eda14cbcSMatt Macy u8s1[1] = '\0'; 1788eda14cbcSMatt Macy sz1 = 1; 1789eda14cbcSMatt Macy s1++; 1790eda14cbcSMatt Macy } else { 1791eda14cbcSMatt Macy state = U8_STATE_START; 1792eda14cbcSMatt Macy sz1 = collect_a_seq(uv, u8s1, &s1, s1last, 1793eda14cbcSMatt Macy is_it_toupper, is_it_tolower, 1794eda14cbcSMatt Macy canonical_decomposition, 1795eda14cbcSMatt Macy compatibility_decomposition, 1796eda14cbcSMatt Macy canonical_composition, errnum, &state); 1797eda14cbcSMatt Macy } 1798eda14cbcSMatt Macy 1799eda14cbcSMatt Macy if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last || 1800eda14cbcSMatt Macy ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) { 1801eda14cbcSMatt Macy if (is_it_toupper) 1802eda14cbcSMatt Macy u8s2[0] = U8_ASCII_TOUPPER(*s2); 1803eda14cbcSMatt Macy else if (is_it_tolower) 1804eda14cbcSMatt Macy u8s2[0] = U8_ASCII_TOLOWER(*s2); 1805eda14cbcSMatt Macy else 1806eda14cbcSMatt Macy u8s2[0] = *s2; 1807eda14cbcSMatt Macy u8s2[1] = '\0'; 1808eda14cbcSMatt Macy sz2 = 1; 1809eda14cbcSMatt Macy s2++; 1810eda14cbcSMatt Macy } else { 1811eda14cbcSMatt Macy state = U8_STATE_START; 1812eda14cbcSMatt Macy sz2 = collect_a_seq(uv, u8s2, &s2, s2last, 1813eda14cbcSMatt Macy is_it_toupper, is_it_tolower, 1814eda14cbcSMatt Macy canonical_decomposition, 1815eda14cbcSMatt Macy compatibility_decomposition, 1816eda14cbcSMatt Macy canonical_composition, errnum, &state); 1817eda14cbcSMatt Macy } 1818eda14cbcSMatt Macy 1819eda14cbcSMatt Macy /* 1820eda14cbcSMatt Macy * Now compare the two characters. If they are the same, 1821eda14cbcSMatt Macy * we move on to the next character sequences. 1822eda14cbcSMatt Macy */ 1823eda14cbcSMatt Macy if (sz1 == 1 && sz2 == 1) { 1824eda14cbcSMatt Macy if (*u8s1 > *u8s2) 1825eda14cbcSMatt Macy return (1); 1826eda14cbcSMatt Macy if (*u8s1 < *u8s2) 1827eda14cbcSMatt Macy return (-1); 1828eda14cbcSMatt Macy } else { 1829eda14cbcSMatt Macy result = strcmp((const char *)u8s1, (const char *)u8s2); 1830eda14cbcSMatt Macy if (result != 0) 1831eda14cbcSMatt Macy return (result); 1832eda14cbcSMatt Macy } 1833eda14cbcSMatt Macy } 1834eda14cbcSMatt Macy 1835eda14cbcSMatt Macy /* 1836eda14cbcSMatt Macy * We compared until the end of either or both strings. 1837eda14cbcSMatt Macy * 1838eda14cbcSMatt Macy * If we reached to or went over the ends for the both, that means 1839eda14cbcSMatt Macy * they are the same. 1840eda14cbcSMatt Macy * 1841eda14cbcSMatt Macy * If we reached only one end, that means the other string has 1842eda14cbcSMatt Macy * something which then can be used to determine the return value. 1843eda14cbcSMatt Macy */ 1844eda14cbcSMatt Macy if (s1 >= s1last) { 1845eda14cbcSMatt Macy if (s2 >= s2last) 1846eda14cbcSMatt Macy return (0); 1847eda14cbcSMatt Macy return (-1); 1848eda14cbcSMatt Macy } 1849eda14cbcSMatt Macy return (1); 1850eda14cbcSMatt Macy } 1851eda14cbcSMatt Macy 1852eda14cbcSMatt Macy /* 1853eda14cbcSMatt Macy * The u8_strcmp() function compares two UTF-8 strings quite similar to 1854eda14cbcSMatt Macy * the strcmp(). For the comparison, however, Unicode Normalization specific 1855eda14cbcSMatt Macy * equivalency and Unicode simple case conversion mappings based equivalency 1856eda14cbcSMatt Macy * can be requested and checked against. 1857eda14cbcSMatt Macy */ 1858eda14cbcSMatt Macy int 1859eda14cbcSMatt Macy u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv, 1860eda14cbcSMatt Macy int *errnum) 1861eda14cbcSMatt Macy { 1862eda14cbcSMatt Macy int f; 1863eda14cbcSMatt Macy size_t n1; 1864eda14cbcSMatt Macy size_t n2; 1865eda14cbcSMatt Macy 1866eda14cbcSMatt Macy *errnum = 0; 1867eda14cbcSMatt Macy 1868eda14cbcSMatt Macy /* 1869eda14cbcSMatt Macy * Check on the requested Unicode version, case conversion, and 1870eda14cbcSMatt Macy * normalization flag values. 1871eda14cbcSMatt Macy */ 1872eda14cbcSMatt Macy 1873eda14cbcSMatt Macy if (uv > U8_UNICODE_LATEST) { 1874eda14cbcSMatt Macy *errnum = ERANGE; 1875eda14cbcSMatt Macy uv = U8_UNICODE_LATEST; 1876eda14cbcSMatt Macy } 1877eda14cbcSMatt Macy 1878eda14cbcSMatt Macy if (flag == 0) { 1879eda14cbcSMatt Macy flag = U8_STRCMP_CS; 1880eda14cbcSMatt Macy } else { 1881*5c65a0a9SMartin Matuska #ifdef U8_STRCMP_CI_LOWER 1882*5c65a0a9SMartin Matuska f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER 1883*5c65a0a9SMartin Matuska | U8_STRCMP_CI_LOWER); 1884*5c65a0a9SMartin Matuska #else 1885*5c65a0a9SMartin Matuska f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER); 1886*5c65a0a9SMartin Matuska #endif 1887eda14cbcSMatt Macy if (f == 0) { 1888eda14cbcSMatt Macy flag |= U8_STRCMP_CS; 1889*5c65a0a9SMartin Matuska } 1890*5c65a0a9SMartin Matuska #ifdef U8_STRCMP_CI_LOWER 1891*5c65a0a9SMartin Matuska else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER && 1892*5c65a0a9SMartin Matuska f != U8_STRCMP_CI_LOWER) 1893*5c65a0a9SMartin Matuska #else 1894*5c65a0a9SMartin Matuska else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER) 1895*5c65a0a9SMartin Matuska #endif 1896*5c65a0a9SMartin Matuska { 1897eda14cbcSMatt Macy *errnum = EBADF; 1898eda14cbcSMatt Macy flag = U8_STRCMP_CS; 1899eda14cbcSMatt Macy } 1900eda14cbcSMatt Macy 1901eda14cbcSMatt Macy f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); 1902eda14cbcSMatt Macy if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC && 1903eda14cbcSMatt Macy f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) { 1904eda14cbcSMatt Macy *errnum = EBADF; 1905eda14cbcSMatt Macy flag = U8_STRCMP_CS; 1906eda14cbcSMatt Macy } 1907eda14cbcSMatt Macy } 1908eda14cbcSMatt Macy 1909eda14cbcSMatt Macy if (flag == U8_STRCMP_CS) { 1910eda14cbcSMatt Macy return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n)); 1911eda14cbcSMatt Macy } 1912eda14cbcSMatt Macy 1913eda14cbcSMatt Macy n1 = strlen(s1); 1914eda14cbcSMatt Macy n2 = strlen(s2); 1915eda14cbcSMatt Macy if (n != 0) { 1916eda14cbcSMatt Macy if (n < n1) 1917eda14cbcSMatt Macy n1 = n; 1918eda14cbcSMatt Macy if (n < n2) 1919eda14cbcSMatt Macy n2 = n; 1920eda14cbcSMatt Macy } 1921eda14cbcSMatt Macy 1922eda14cbcSMatt Macy /* 1923eda14cbcSMatt Macy * Simple case conversion can be done much faster and so we do 1924eda14cbcSMatt Macy * them separately here. 1925eda14cbcSMatt Macy */ 1926eda14cbcSMatt Macy if (flag == U8_STRCMP_CI_UPPER) { 1927eda14cbcSMatt Macy return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, 1928eda14cbcSMatt Macy n1, n2, B_TRUE, errnum)); 1929*5c65a0a9SMartin Matuska } 1930*5c65a0a9SMartin Matuska #ifdef U8_STRCMP_CI_LOWER 1931*5c65a0a9SMartin Matuska else if (flag == U8_STRCMP_CI_LOWER) { 1932eda14cbcSMatt Macy return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, 1933eda14cbcSMatt Macy n1, n2, B_FALSE, errnum)); 1934eda14cbcSMatt Macy } 1935*5c65a0a9SMartin Matuska #endif 1936eda14cbcSMatt Macy 1937eda14cbcSMatt Macy return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, 1938eda14cbcSMatt Macy flag, errnum)); 1939eda14cbcSMatt Macy } 1940eda14cbcSMatt Macy 1941eda14cbcSMatt Macy size_t 1942eda14cbcSMatt Macy u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen, 1943eda14cbcSMatt Macy int flag, size_t unicode_version, int *errnum) 1944eda14cbcSMatt Macy { 1945eda14cbcSMatt Macy int f; 1946eda14cbcSMatt Macy int sz; 1947eda14cbcSMatt Macy uchar_t *ib; 1948eda14cbcSMatt Macy uchar_t *ibtail; 1949eda14cbcSMatt Macy uchar_t *ob; 1950eda14cbcSMatt Macy uchar_t *obtail; 1951eda14cbcSMatt Macy boolean_t do_not_ignore_null; 1952eda14cbcSMatt Macy boolean_t do_not_ignore_invalid; 1953eda14cbcSMatt Macy boolean_t is_it_toupper; 1954eda14cbcSMatt Macy boolean_t is_it_tolower; 1955eda14cbcSMatt Macy boolean_t canonical_decomposition; 1956eda14cbcSMatt Macy boolean_t compatibility_decomposition; 1957eda14cbcSMatt Macy boolean_t canonical_composition; 1958eda14cbcSMatt Macy size_t ret_val; 1959eda14cbcSMatt Macy size_t i; 1960eda14cbcSMatt Macy size_t j; 1961eda14cbcSMatt Macy uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1]; 1962eda14cbcSMatt Macy u8_normalization_states_t state; 1963eda14cbcSMatt Macy 1964eda14cbcSMatt Macy if (unicode_version > U8_UNICODE_LATEST) { 1965eda14cbcSMatt Macy *errnum = ERANGE; 1966eda14cbcSMatt Macy return ((size_t)-1); 1967eda14cbcSMatt Macy } 1968eda14cbcSMatt Macy 1969*5c65a0a9SMartin Matuska #ifdef U8_TEXTPREP_TOLOWER 1970eda14cbcSMatt Macy f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER); 1971eda14cbcSMatt Macy if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) { 1972eda14cbcSMatt Macy *errnum = EBADF; 1973eda14cbcSMatt Macy return ((size_t)-1); 1974eda14cbcSMatt Macy } 1975*5c65a0a9SMartin Matuska #endif 1976eda14cbcSMatt Macy 1977eda14cbcSMatt Macy f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); 1978eda14cbcSMatt Macy if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC && 1979eda14cbcSMatt Macy f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) { 1980eda14cbcSMatt Macy *errnum = EBADF; 1981eda14cbcSMatt Macy return ((size_t)-1); 1982eda14cbcSMatt Macy } 1983eda14cbcSMatt Macy 1984eda14cbcSMatt Macy if (inarray == NULL || *inlen == 0) 1985eda14cbcSMatt Macy return (0); 1986eda14cbcSMatt Macy 1987eda14cbcSMatt Macy if (outarray == NULL) { 1988eda14cbcSMatt Macy *errnum = E2BIG; 1989eda14cbcSMatt Macy return ((size_t)-1); 1990eda14cbcSMatt Macy } 1991eda14cbcSMatt Macy 1992eda14cbcSMatt Macy ib = (uchar_t *)inarray; 1993eda14cbcSMatt Macy ob = (uchar_t *)outarray; 1994eda14cbcSMatt Macy ibtail = ib + *inlen; 1995eda14cbcSMatt Macy obtail = ob + *outlen; 1996eda14cbcSMatt Macy 1997eda14cbcSMatt Macy do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL); 1998eda14cbcSMatt Macy do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID); 1999eda14cbcSMatt Macy is_it_toupper = flag & U8_TEXTPREP_TOUPPER; 2000*5c65a0a9SMartin Matuska #ifdef U8_TEXTPREP_TOLOWER 2001eda14cbcSMatt Macy is_it_tolower = flag & U8_TEXTPREP_TOLOWER; 2002*5c65a0a9SMartin Matuska #else 2003*5c65a0a9SMartin Matuska is_it_tolower = 0; 2004*5c65a0a9SMartin Matuska #endif 2005eda14cbcSMatt Macy 2006eda14cbcSMatt Macy ret_val = 0; 2007eda14cbcSMatt Macy 2008eda14cbcSMatt Macy /* 2009eda14cbcSMatt Macy * If we don't have a normalization flag set, we do the simple case 2010eda14cbcSMatt Macy * conversion based text preparation separately below. Text 2011eda14cbcSMatt Macy * preparation involving Normalization will be done in the false task 2012eda14cbcSMatt Macy * block, again, separately since it will take much more time and 2013eda14cbcSMatt Macy * resource than doing simple case conversions. 2014eda14cbcSMatt Macy */ 2015eda14cbcSMatt Macy if (f == 0) { 2016eda14cbcSMatt Macy while (ib < ibtail) { 2017eda14cbcSMatt Macy if (*ib == '\0' && do_not_ignore_null) 2018eda14cbcSMatt Macy break; 2019eda14cbcSMatt Macy 2020eda14cbcSMatt Macy sz = u8_number_of_bytes[*ib]; 2021eda14cbcSMatt Macy 2022eda14cbcSMatt Macy if (sz < 0) { 2023eda14cbcSMatt Macy if (do_not_ignore_invalid) { 2024eda14cbcSMatt Macy *errnum = EILSEQ; 2025eda14cbcSMatt Macy ret_val = (size_t)-1; 2026eda14cbcSMatt Macy break; 2027eda14cbcSMatt Macy } 2028eda14cbcSMatt Macy 2029eda14cbcSMatt Macy sz = 1; 2030eda14cbcSMatt Macy ret_val++; 2031eda14cbcSMatt Macy } 2032eda14cbcSMatt Macy 2033eda14cbcSMatt Macy if (sz == 1) { 2034eda14cbcSMatt Macy if (ob >= obtail) { 2035eda14cbcSMatt Macy *errnum = E2BIG; 2036eda14cbcSMatt Macy ret_val = (size_t)-1; 2037eda14cbcSMatt Macy break; 2038eda14cbcSMatt Macy } 2039eda14cbcSMatt Macy 2040eda14cbcSMatt Macy if (is_it_toupper) 2041eda14cbcSMatt Macy *ob = U8_ASCII_TOUPPER(*ib); 2042eda14cbcSMatt Macy else if (is_it_tolower) 2043eda14cbcSMatt Macy *ob = U8_ASCII_TOLOWER(*ib); 2044eda14cbcSMatt Macy else 2045eda14cbcSMatt Macy *ob = *ib; 2046eda14cbcSMatt Macy ib++; 2047eda14cbcSMatt Macy ob++; 2048eda14cbcSMatt Macy } else if ((ib + sz) > ibtail) { 2049eda14cbcSMatt Macy if (do_not_ignore_invalid) { 2050eda14cbcSMatt Macy *errnum = EINVAL; 2051eda14cbcSMatt Macy ret_val = (size_t)-1; 2052eda14cbcSMatt Macy break; 2053eda14cbcSMatt Macy } 2054eda14cbcSMatt Macy 2055eda14cbcSMatt Macy if ((obtail - ob) < (ibtail - ib)) { 2056eda14cbcSMatt Macy *errnum = E2BIG; 2057eda14cbcSMatt Macy ret_val = (size_t)-1; 2058eda14cbcSMatt Macy break; 2059eda14cbcSMatt Macy } 2060eda14cbcSMatt Macy 2061eda14cbcSMatt Macy /* 2062eda14cbcSMatt Macy * We treat the remaining incomplete character 2063eda14cbcSMatt Macy * bytes as a character. 2064eda14cbcSMatt Macy */ 2065eda14cbcSMatt Macy ret_val++; 2066eda14cbcSMatt Macy 2067eda14cbcSMatt Macy while (ib < ibtail) 2068eda14cbcSMatt Macy *ob++ = *ib++; 2069eda14cbcSMatt Macy } else { 2070eda14cbcSMatt Macy if (is_it_toupper || is_it_tolower) { 2071eda14cbcSMatt Macy i = do_case_conv(unicode_version, u8s, 2072eda14cbcSMatt Macy ib, sz, is_it_toupper); 2073eda14cbcSMatt Macy 2074eda14cbcSMatt Macy if ((obtail - ob) < i) { 2075eda14cbcSMatt Macy *errnum = E2BIG; 2076eda14cbcSMatt Macy ret_val = (size_t)-1; 2077eda14cbcSMatt Macy break; 2078eda14cbcSMatt Macy } 2079eda14cbcSMatt Macy 2080eda14cbcSMatt Macy ib += sz; 2081eda14cbcSMatt Macy 2082eda14cbcSMatt Macy for (sz = 0; sz < i; sz++) 2083eda14cbcSMatt Macy *ob++ = u8s[sz]; 2084eda14cbcSMatt Macy } else { 2085eda14cbcSMatt Macy if ((obtail - ob) < sz) { 2086eda14cbcSMatt Macy *errnum = E2BIG; 2087eda14cbcSMatt Macy ret_val = (size_t)-1; 2088eda14cbcSMatt Macy break; 2089eda14cbcSMatt Macy } 2090eda14cbcSMatt Macy 2091eda14cbcSMatt Macy for (i = 0; i < sz; i++) 2092eda14cbcSMatt Macy *ob++ = *ib++; 2093eda14cbcSMatt Macy } 2094eda14cbcSMatt Macy } 2095eda14cbcSMatt Macy } 2096eda14cbcSMatt Macy } else { 2097eda14cbcSMatt Macy canonical_decomposition = flag & U8_CANON_DECOMP; 2098eda14cbcSMatt Macy compatibility_decomposition = flag & U8_COMPAT_DECOMP; 2099eda14cbcSMatt Macy canonical_composition = flag & U8_CANON_COMP; 2100eda14cbcSMatt Macy 2101eda14cbcSMatt Macy while (ib < ibtail) { 2102eda14cbcSMatt Macy if (*ib == '\0' && do_not_ignore_null) 2103eda14cbcSMatt Macy break; 2104eda14cbcSMatt Macy 2105eda14cbcSMatt Macy /* 2106eda14cbcSMatt Macy * If the current character is a 7-bit ASCII 2107eda14cbcSMatt Macy * character and it is the last character, or, 2108eda14cbcSMatt Macy * if the current character is a 7-bit ASCII 2109eda14cbcSMatt Macy * character and the next character is also a 7-bit 2110eda14cbcSMatt Macy * ASCII character, then, we copy over this 2111eda14cbcSMatt Macy * character without going through collect_a_seq(). 2112eda14cbcSMatt Macy * 2113eda14cbcSMatt Macy * In any other cases, we need to look further with 2114eda14cbcSMatt Macy * the collect_a_seq() function. 2115eda14cbcSMatt Macy */ 2116eda14cbcSMatt Macy if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail || 2117eda14cbcSMatt Macy ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) { 2118eda14cbcSMatt Macy if (ob >= obtail) { 2119eda14cbcSMatt Macy *errnum = E2BIG; 2120eda14cbcSMatt Macy ret_val = (size_t)-1; 2121eda14cbcSMatt Macy break; 2122eda14cbcSMatt Macy } 2123eda14cbcSMatt Macy 2124eda14cbcSMatt Macy if (is_it_toupper) 2125eda14cbcSMatt Macy *ob = U8_ASCII_TOUPPER(*ib); 2126eda14cbcSMatt Macy else if (is_it_tolower) 2127eda14cbcSMatt Macy *ob = U8_ASCII_TOLOWER(*ib); 2128eda14cbcSMatt Macy else 2129eda14cbcSMatt Macy *ob = *ib; 2130eda14cbcSMatt Macy ib++; 2131eda14cbcSMatt Macy ob++; 2132eda14cbcSMatt Macy } else { 2133eda14cbcSMatt Macy *errnum = 0; 2134eda14cbcSMatt Macy state = U8_STATE_START; 2135eda14cbcSMatt Macy 2136eda14cbcSMatt Macy j = collect_a_seq(unicode_version, u8s, 2137eda14cbcSMatt Macy &ib, ibtail, 2138eda14cbcSMatt Macy is_it_toupper, 2139eda14cbcSMatt Macy is_it_tolower, 2140eda14cbcSMatt Macy canonical_decomposition, 2141eda14cbcSMatt Macy compatibility_decomposition, 2142eda14cbcSMatt Macy canonical_composition, 2143eda14cbcSMatt Macy errnum, &state); 2144eda14cbcSMatt Macy 2145eda14cbcSMatt Macy if (*errnum && do_not_ignore_invalid) { 2146eda14cbcSMatt Macy ret_val = (size_t)-1; 2147eda14cbcSMatt Macy break; 2148eda14cbcSMatt Macy } 2149eda14cbcSMatt Macy 2150eda14cbcSMatt Macy if ((obtail - ob) < j) { 2151eda14cbcSMatt Macy *errnum = E2BIG; 2152eda14cbcSMatt Macy ret_val = (size_t)-1; 2153eda14cbcSMatt Macy break; 2154eda14cbcSMatt Macy } 2155eda14cbcSMatt Macy 2156eda14cbcSMatt Macy for (i = 0; i < j; i++) 2157eda14cbcSMatt Macy *ob++ = u8s[i]; 2158eda14cbcSMatt Macy } 2159eda14cbcSMatt Macy } 2160eda14cbcSMatt Macy } 2161eda14cbcSMatt Macy 2162eda14cbcSMatt Macy *inlen = ibtail - ib; 2163eda14cbcSMatt Macy *outlen = obtail - ob; 2164eda14cbcSMatt Macy 2165eda14cbcSMatt Macy return (ret_val); 2166eda14cbcSMatt Macy } 2167eda14cbcSMatt Macy 2168eda14cbcSMatt Macy EXPORT_SYMBOL(u8_validate); 2169eda14cbcSMatt Macy EXPORT_SYMBOL(u8_strcmp); 2170eda14cbcSMatt Macy EXPORT_SYMBOL(u8_textprep_str); 2171