common/unicode/u8_textprep.c

5049Sis/*
5049Sis * CDDL HEADER START
5049Sis *
5049Sis * The contents of this file are subject to the terms of the
5049Sis * Common Development and Distribution License (the "License").
5049Sis * You may not use this file except in compliance with the License.
5049Sis *
5049Sis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
5049Sis * or http://www.opensolaris.org/os/licensing.
5049Sis * See the License for the specific language governing permissions
5049Sis * and limitations under the License.
5049Sis *
5049Sis * When distributing Covered Code, include this CDDL HEADER in each
5049Sis * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
5049Sis * If applicable, add the following below this CDDL HEADER, with the
5049Sis * fields enclosed by brackets "[]" replaced with your own identifying
5049Sis * information: Portions Copyright [yyyy] [name of copyright owner]
5049Sis *
5049Sis * CDDL HEADER END
5049Sis */
5049Sis/*
*7012Sis * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
5049Sis * Use is subject to license terms.
5049Sis */
5049Sis
5049Sis#pragma ident	"%Z%%M%	%I%	%E% SMI"
5049Sis
5049Sis
5049Sis/*
5049Sis * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
5049Sis *
5049Sis * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
5049Sis * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
5049Sis * the section 3C man pages.
5049Sis * Interface stability: Committed.
5049Sis */
5049Sis
5049Sis#include <sys/types.h>
5049Sis#ifdef	_KERNEL
5049Sis#include <sys/param.h>
5049Sis#include <sys/sysmacros.h>
5049Sis#include <sys/systm.h>
5049Sis#include <sys/debug.h>
5049Sis#include <sys/kmem.h>
5049Sis#include <sys/ddi.h>
5049Sis#include <sys/sunddi.h>
5049Sis#else
5049Sis#include <sys/u8_textprep.h>
5049Sis#include <strings.h>
5049Sis#endif	/* _KERNEL */
5049Sis#include <sys/byteorder.h>
5049Sis#include <sys/errno.h>
5049Sis#include <sys/u8_textprep_data.h>
5049Sis
5049Sis
/* Longest UTF-8 encoding of a single character, in bytes. */
#define	U8_MB_CUR_MAX			(4)

/*
 * Longest UTF-8 encoding, in bytes, of any character in U+0000 - U+FFFF,
 * i.e., the coding space of the now deprecated UCS-2.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* Upper bound, in bytes, on the length of a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * Capacity, in characters, of a combining/conjoining sequence, and the
 * upper limit that is actually enforced on such a sequence (one less than
 * the capacity).
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(U8_MAX_CHARS_A_SEQ - 1)

/* Combining class value of a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)

/*
 * Hangul related constants.
 *
 * The *_FIRST and *_LAST values are the Unicode scalar values bounding
 * the Hangul syllables and the Hangul Jamo leading consonants (L),
 * vowels (V), and optional trailing consonants (T).
 *
 * Note that U8_HANGUL_JAMO_T_FIRST is 0x11A7 and not the actual first
 * trailing consonant, U+11A8. Because the trailing consonant is optional,
 * the value is stored with one already subtracted so that a trailing
 * index of zero can mean "no trailing consonant".
 *
 * Hangul has 21 modern vowels and 27 modern trailing consonants; counting
 * the "no trailing consonant" case gives 28, so each of the 19 modern
 * leading consonants begins 21 x 28 = 588 syllables.
 *
 * U8_HANGUL_JAMO_1ST_BYTE is a quick pre-check only: a character whose
 * first UTF-8 byte has this value is likely, but not guaranteed, to be
 * a Hangul Jamo.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_T_COUNT		(28)
#define	U8_HANGUL_VT_COUNT		(U8_HANGUL_V_COUNT * U8_HANGUL_T_COUNT)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)
5049Sis
/*
 * Store the scalar value b, which must fit in 16 bits, as a three byte
 * UTF-8 sequence at s[i], s[j], and s[k].
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement; invoke it with a trailing semicolon. Without the
 * wrapper, only the first store would be governed by an unbraced if/else.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	do { \
		(s)[(i)] = (uchar_t)(0xE0U | \
		    (((uint32_t)(b) & 0xF000U) >> 12)); \
		(s)[(j)] = (uchar_t)(0x80U | \
		    (((uint32_t)(b) & 0x0FC0U) >> 6)); \
		(s)[(k)] = (uchar_t)(0x80U | \
		    ((uint32_t)(b) & 0x003FU)); \
	} while (0)
5049Sis
/*
 * Range predicates on a Unicode scalar value u. Each one evaluates its
 * argument more than once, so do not pass an expression with side effects.
 *
 * U8_HANGUL_JAMO_T() excludes U8_HANGUL_JAMO_T_FIRST itself because that
 * constant is one less than the first real trailing consonant.
 */
#define	U8_HANGUL_JAMO_L(u) \
	(U8_HANGUL_JAMO_L_FIRST <= (u) && (u) <= U8_HANGUL_JAMO_L_LAST)

#define	U8_HANGUL_JAMO_V(u) \
	(U8_HANGUL_JAMO_V_FIRST <= (u) && (u) <= U8_HANGUL_JAMO_V_LAST)

#define	U8_HANGUL_JAMO_T(u) \
	(U8_HANGUL_JAMO_T_FIRST < (u) && (u) <= U8_HANGUL_JAMO_T_LAST)

/* Anything from the first leading to the last trailing consonant. */
#define	U8_HANGUL_JAMO(u) \
	(U8_HANGUL_JAMO_L_FIRST <= (u) && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_SYLLABLE(u) \
	(U8_HANGUL_SYL_FIRST <= (u) && (u) <= U8_HANGUL_SYL_LAST)

/*
 * Composition checks: given the current normalization state s, can the
 * scalar value u be attached as the vowel after a leading consonant, or
 * as the trailing consonant after a leading consonant and vowel pair?
 */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	(U8_STATE_HANGUL_L == (s) && U8_HANGUL_JAMO_V(u))

#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	(U8_STATE_HANGUL_LV == (s) && U8_HANGUL_JAMO_T(u))

/*
 * Type tags found at the head of a decomposition mapping: the mapping
 * holds both canonical and compatibility forms, or the canonical form only.
 */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/* Table ids at or above this value select the 16-bit variant of a table. */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)
5049Sis
/* The following are some convenience macros. */

/*
 * Assemble the Unicode scalar value encoded by the three UTF-8 bytes
 * b1, b2, and b3 and assign it to the lvalue u.
 *
 * The macro is a single parenthesized assignment expression with no
 * trailing semicolon of its own, so it is safe as the body of an unbraced
 * if/else; invoke it with a trailing semicolon.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	((u) = (((uint32_t)(b1) & 0x0F) << 12) | \
	    (((uint32_t)(b2) & 0x3F) << 6) | \
	    ((uint32_t)(b3) & 0x3F))
5049Sis
/*
 * Exchange the values of the lvalues a and b using t as scratch storage.
 *
 * Wrapped in do { } while (0) so that the three assignments always act as
 * one statement, e.g. under an unbraced if; invoke it with a trailing
 * semicolon.
 */
#define	U8_SIMPLE_SWAP(a, b, t) \
	do { \
		(t) = (a); \
		(a) = (b); \
		(b) = (t); \
	} while (0)
5049Sis
/*
 * ASCII-only case mapping: letters in 'a'-'z' (or 'A'-'Z') are shifted to
 * the other case and every other value is passed through unchanged. The
 * argument is evaluated more than once.
 */
#define	U8_ASCII_TOUPPER(c) \
	(('a' <= (c) && (c) <= 'z') ? (c) - ('a' - 'A') : (c))

#define	U8_ASCII_TOLOWER(c) \
	(('A' <= (c) && (c) <= 'Z') ? (c) + ('a' - 'A') : (c))

/* True when c, taken as a single byte, has its top bit clear. */
#define	U8_ISASCII(c)			((((uchar_t)(c)) & 0x80U) == 0)
/*
 * Swap, in place, the bytes of two combining marks stored in u8s and
 * exchange their combining class and byte length entries.
 *
 * The macro assumes that the two characters are adjacent to each other
 * and that 'a' comes before 'b'. If the assumptions are not met, the macro
 * will fail.
 *
 * It is not self-contained: the expansion site must provide the index k,
 * the byte buffers u8s and u8t, the per-character arrays start, disp, and
 * comb_class, and the scratch variable tc.
 *
 * Wrapped in do { } while (0) so that the whole swap is one statement,
 * e.g. under an unbraced if; invoke it with a trailing semicolon.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	do { \
		/* Save the bytes of 'a'. */ \
		for (k = 0; k < disp[(a)]; k++) \
			u8t[k] = u8s[start[(a)] + k]; \
		/* Slide the bytes of 'b' down to where 'a' started. */ \
		for (k = 0; k < disp[(b)]; k++) \
			u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
		/* The second slot now begins right after the moved 'b'. */ \
		start[(b)] = start[(a)] + disp[(b)]; \
		for (k = 0; k < disp[(a)]; k++) \
			u8s[start[(b)] + k] = u8t[k]; \
		U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
		U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); \
	} while (0)
5049Sis
/*
 * States tracked during normalization. The enumerators rely on the
 * implicit numbering, which runs from 0 (U8_STATE_START) through 6
 * (U8_STATE_COMBINING_MARK).
 */
typedef enum {
	U8_STATE_START,		/* nothing special about the last character */
	U8_STATE_HANGUL_L,	/* Hangul leading consonant */
	U8_STATE_HANGUL_LV,	/* leading consonant followed by a vowel */
	U8_STATE_HANGUL_LVT,	/* leading, vowel, and trailing consonant */
	U8_STATE_HANGUL_V,	/* vowel not preceded by a leading consonant */
	U8_STATE_HANGUL_T,	/* trailing consonant not preceded by an LV */
	U8_STATE_COMBINING_MARK	/* presumably a combining mark; set elsewhere */
} u8_normalization_states_t;
5049Sis
/*
 * The three vectors below are used to check that the bytes of a given UTF-8
 * character are valid and do not contain any malformed byte values.
 *
 * UTF-8 used to have a quite relaxed binary representation, but that led to
 * some security related issues, so the Unicode Consortium defined and
 * announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it once
 * more at Unicode 3.2. The following three tables are based on that.
 */
5049Sis
5049Sis#define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)
5049Sis
5049Sis#define	I_				U8_ILLEGAL_CHAR
5049Sis#define	O_				U8_OUT_OF_RANGE_CHAR
5049Sis
5049Sisconst int8_t u8_number_of_bytes[0x100] = {
5049Sis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
5049Sis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
5049Sis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
5049Sis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
5049Sis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
5049Sis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
5049Sis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
5049Sis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
5049Sis
5049Sis/*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
5049Sis	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
5049Sis
5049Sis/*  	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
5049Sis	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
5049Sis
5049Sis/*  	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
5049Sis	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
5049Sis
5049Sis/*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
5049Sis	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
5049Sis
5049Sis/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
5049Sis	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
5049Sis
5049Sis/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
5049Sis	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
5049Sis
5049Sis/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
5049Sis	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
5049Sis
5049Sis/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
5049Sis	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
5049Sis};
5049Sis
5049Sis#undef	I_
5049Sis#undef	O_
5049Sis
/*
 * Smallest value allowed for the second byte of a multi-byte UTF-8
 * character, indexed by the value of the first byte. Entries for byte
 * values that do not start a valid multi-byte character are zero.
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
	/* 00 - BF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 00 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 10 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 20 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 30 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 40 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 50 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 60 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 70 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 80 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 90 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* A0 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* B0 */

	/* C0 - DF: two byte characters; C0 and C1 are never valid */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* C0 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* C8 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* D0 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* D8 */

	/* E0 - EF: three byte characters; E0 has a raised minimum */
	0xA0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* E0 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* E8 */

	/* F0 - F4: four byte characters; F0 has a raised minimum */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,	/* F0 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* F8 */
};
5049Sis
/*
 * Largest value allowed for the second byte of a multi-byte UTF-8
 * character, indexed by the value of the first byte. Entries for byte
 * values that do not start a valid multi-byte character are zero.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
	/* 00 - BF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 00 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 10 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 20 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 30 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 40 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 50 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 60 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 70 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 80 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 90 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* A0 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* B0 */

	/* C0 - DF: two byte characters; C0 and C1 are never valid */
	0,    0,    0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* C0 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* C8 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* D0 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* D8 */

	/* E0 - EF: three byte characters; ED has a lowered maximum */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* E0 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0xBF,	/* E8 */

	/* F0 - F4: four byte characters; F4 has a lowered maximum */
	0xBF, 0xBF, 0xBF, 0xBF, 0x8F, 0,    0,    0,	/* F0 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* F8 */
};
5049Sis
5049Sis
5049Sis/*
5049Sis * The u8_validate() validates on the given UTF-8 character string and
5049Sis * calculate the byte length. It is quite similar to mblen(3C) except that
5049Sis * this will validate against the list of characters if required and
5049Sis * specific to UTF-8 and Unicode.
5049Sis */
5049Sisint
*7012Sisu8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
5049Sis{
5049Sis	uchar_t *ib;
5049Sis	uchar_t *ibtail;
5049Sis	uchar_t **p;
5049Sis	uchar_t *s1;
5049Sis	uchar_t *s2;
5049Sis	uchar_t f;
5049Sis	int sz;
5049Sis	size_t i;
5049Sis	int ret_val;
5049Sis	boolean_t second;
5049Sis	boolean_t no_need_to_validate_entire;
5049Sis	boolean_t check_additional;
5049Sis	boolean_t validate_ucs2_range_only;
5049Sis
5049Sis	if (! u8str)
5049Sis		return (0);
5049Sis
5049Sis	ib = (uchar_t *)u8str;
5049Sis	ibtail = ib + n;
5049Sis
5049Sis	ret_val = 0;
5049Sis
5049Sis	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
5049Sis	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
5049Sis	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
5049Sis
5049Sis	while (ib < ibtail) {
5049Sis		/*
5049Sis		 * The first byte of a UTF-8 character tells how many
5049Sis		 * bytes will follow for the character. If the first byte
5049Sis		 * is an illegal byte value or out of range value, we just
5049Sis		 * return -1 with an appropriate error number.
5049Sis		 */
5049Sis		sz = u8_number_of_bytes[*ib];
5049Sis		if (sz == U8_ILLEGAL_CHAR) {
*7012Sis			*errnum = EILSEQ;
5049Sis			return (-1);
5049Sis		}
5049Sis
5049Sis		if (sz == U8_OUT_OF_RANGE_CHAR ||
5049Sis		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
*7012Sis			*errnum = ERANGE;
5049Sis			return (-1);
5049Sis		}
5049Sis
5049Sis		/*
5049Sis		 * If we don't have enough bytes to check on, that's also
5049Sis		 * an error. As you can see, we give illegal byte sequence
5049Sis		 * checking higher priority then EINVAL cases.
5049Sis		 */
5049Sis		if ((ibtail - ib) < sz) {
*7012Sis			*errnum = EINVAL;
5049Sis			return (-1);
5049Sis		}
5049Sis
5049Sis		if (sz == 1) {
5049Sis			ib++;
5049Sis			ret_val++;
5049Sis		} else {
5049Sis			/*
5049Sis			 * Check on the multi-byte UTF-8 character. For more
5049Sis			 * details on this, see comment added for the used
5049Sis			 * data structures at the beginning of the file.
5049Sis			 */
5049Sis			f = *ib++;
5049Sis			ret_val++;
5049Sis			second = B_TRUE;
5049Sis			for (i = 1; i < sz; i++) {
5049Sis				if (second) {
5049Sis					if (*ib < u8_valid_min_2nd_byte[f] ||
5049Sis					    *ib > u8_valid_max_2nd_byte[f]) {
*7012Sis						*errnum = EILSEQ;
5049Sis						return (-1);
5049Sis					}
5049Sis					second = B_FALSE;
5049Sis				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
*7012Sis					*errnum = EILSEQ;
5049Sis					return (-1);
5049Sis				}
5049Sis				ib++;
5049Sis				ret_val++;
5049Sis			}
5049Sis		}
5049Sis
5049Sis		if (check_additional) {
5049Sis			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
5049Sis				s1 = ib - sz;
5049Sis				s2 = p[i];
5049Sis				while (s1 < ib) {
5049Sis					if (*s1 != *s2 || *s2 == '\0')
5049Sis						break;
5049Sis					s1++;
5049Sis					s2++;
5049Sis				}
5049Sis
5049Sis				if (s1 >= ib && *s2 == '\0') {
*7012Sis					*errnum = EBADF;
5049Sis					return (-1);
5049Sis				}
5049Sis			}
5049Sis		}
5049Sis
5049Sis		if (no_need_to_validate_entire)
5049Sis			break;
5049Sis	}
5049Sis
5049Sis	return (ret_val);
5049Sis}
5049Sis
5049Sis/*
5049Sis * The do_case_conv() looks at the mapping tables and returns found
5049Sis * bytes if any. If not found, the input bytes are returned. The function
5049Sis * always terminate the return bytes with a null character assuming that
5049Sis * there are plenty of room to do so.
5049Sis *
5049Sis * The case conversions are simple case conversions mapping a character to
5049Sis * another character as specified in the Unicode data. The byte size of
5049Sis * the mapped character could be different from that of the input character.
5049Sis *
5049Sis * The return value is the byte length of the returned character excluding
5049Sis * the terminating null byte.
5049Sis */
5049Sisstatic size_t
5049Sisdo_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
5049Sis{
5049Sis	size_t i;
5049Sis	uint16_t b1 = 0;
5049Sis	uint16_t b2 = 0;
5049Sis	uint16_t b3 = 0;
5049Sis	uint16_t b3_tbl;
5049Sis	uint16_t b3_base;
5049Sis	uint16_t b4 = 0;
5049Sis	size_t start_id;
5049Sis	size_t end_id;
5049Sis
5049Sis	/*
5049Sis	 * At this point, the only possible values for sz are 2, 3, and 4.
5049Sis	 * The u8s should point to a vector that is well beyond the size of
5049Sis	 * 5 bytes.
5049Sis	 */
5049Sis	if (sz == 2) {
5049Sis		b3 = u8s[0] = s[0];
5049Sis		b4 = u8s[1] = s[1];
5049Sis	} else if (sz == 3) {
5049Sis		b2 = u8s[0] = s[0];
5049Sis		b3 = u8s[1] = s[1];
5049Sis		b4 = u8s[2] = s[2];
5049Sis	} else if (sz == 4) {
5049Sis		b1 = u8s[0] = s[0];
5049Sis		b2 = u8s[1] = s[1];
5049Sis		b3 = u8s[2] = s[2];
5049Sis		b4 = u8s[3] = s[3];
5049Sis	} else {
5049Sis		/* This is not possible but just in case as a fallback. */
5049Sis		if (is_it_toupper)
5049Sis			*u8s = U8_ASCII_TOUPPER(*s);
5049Sis		else
5049Sis			*u8s = U8_ASCII_TOLOWER(*s);
5049Sis		u8s[1] = '\0';
5049Sis
5049Sis		return (1);
5049Sis	}
5049Sis	u8s[sz] = '\0';
5049Sis
5049Sis	/*
5049Sis	 * Let's find out if we have a corresponding character.
5049Sis	 */
5049Sis	b1 = u8_common_b1_tbl[uv][b1];
5049Sis	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
5049Sis		return ((size_t)sz);
5049Sis
5049Sis	b2 = u8_case_common_b2_tbl[uv][b1][b2];
5049Sis	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
5049Sis		return ((size_t)sz);
5049Sis
5049Sis	if (is_it_toupper) {
5049Sis		b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
5049Sis		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5049Sis			return ((size_t)sz);
5049Sis
5049Sis		start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
5049Sis		end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
5049Sis
5049Sis		/* Either there is no match or an error at the table. */
5049Sis		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5049Sis			return ((size_t)sz);
5049Sis
5049Sis		b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
5049Sis
5049Sis		for (i = 0; start_id < end_id; start_id++)
5049Sis			u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
5049Sis	} else {
5049Sis		b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
5049Sis		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5049Sis			return ((size_t)sz);
5049Sis
5049Sis		start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
5049Sis		end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
5049Sis
5049Sis		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5049Sis			return ((size_t)sz);
5049Sis
5049Sis		b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
5049Sis
5049Sis		for (i = 0; start_id < end_id; start_id++)
5049Sis			u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
5049Sis	}
5049Sis
5049Sis	/*
5049Sis	 * If i is still zero, that means there is no corresponding character.
5049Sis	 */
5049Sis	if (i == 0)
5049Sis		return ((size_t)sz);
5049Sis
5049Sis	u8s[i] = '\0';
5049Sis
5049Sis	return (i);
5049Sis}
5049Sis
5049Sis/*
5049Sis * The do_case_compare() function compares the two input strings, s1 and s2,
5049Sis * one character at a time doing case conversions if applicable and return
5049Sis * the comparison result as like strcmp().
5049Sis *
5049Sis * Since, in empirical sense, most of text data are 7-bit ASCII characters,
5049Sis * we treat the 7-bit ASCII characters as a special case trying to yield
5049Sis * faster processing time.
5049Sis */
5049Sisstatic int
5049Sisdo_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
*7012Sis	size_t n2, boolean_t is_it_toupper, int *errnum)
5049Sis{
5049Sis	int f;
5049Sis	int sz1;
5049Sis	int sz2;
5049Sis	size_t j;
5049Sis	size_t i1;
5049Sis	size_t i2;
5049Sis	uchar_t u8s1[U8_MB_CUR_MAX + 1];
5049Sis	uchar_t u8s2[U8_MB_CUR_MAX + 1];
5049Sis
5049Sis	i1 = i2 = 0;
5049Sis	while (i1 < n1 && i2 < n2) {
5049Sis		/*
5049Sis		 * Find out what would be the byte length for this UTF-8
5049Sis		 * character at string s1 and also find out if this is
5049Sis		 * an illegal start byte or not and if so, issue a proper
*7012Sis		 * error number and yet treat this byte as a character.
5049Sis		 */
5049Sis		sz1 = u8_number_of_bytes[*s1];
5049Sis		if (sz1 < 0) {
*7012Sis			*errnum = EILSEQ;
5049Sis			sz1 = 1;
5049Sis		}
5049Sis
5049Sis		/*
5049Sis		 * For 7-bit ASCII characters mainly, we do a quick case
5049Sis		 * conversion right at here.
5049Sis		 *
5049Sis		 * If we don't have enough bytes for this character, issue
5049Sis		 * an EINVAL error and use what are available.
5049Sis		 *
5049Sis		 * If we have enough bytes, find out if there is
5049Sis		 * a corresponding uppercase character and if so, copy over
5049Sis		 * the bytes for a comparison later. If there is no
5049Sis		 * corresponding uppercase character, then, use what we have
5049Sis		 * for the comparison.
5049Sis		 */
5049Sis		if (sz1 == 1) {
5049Sis			if (is_it_toupper)
5049Sis				u8s1[0] = U8_ASCII_TOUPPER(*s1);
5049Sis			else
5049Sis				u8s1[0] = U8_ASCII_TOLOWER(*s1);
5049Sis			s1++;
5049Sis			u8s1[1] = '\0';
5049Sis		} else if ((i1 + sz1) > n1) {
*7012Sis			*errnum = EINVAL;
5049Sis			for (j = 0; (i1 + j) < n1; )
5049Sis				u8s1[j++] = *s1++;
5049Sis			u8s1[j] = '\0';
5049Sis		} else {
5049Sis			(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
5049Sis			s1 += sz1;
5049Sis		}
5049Sis
5049Sis		/* Do the same for the string s2. */
5049Sis		sz2 = u8_number_of_bytes[*s2];
5049Sis		if (sz2 < 0) {
*7012Sis			*errnum = EILSEQ;
5049Sis			sz2 = 1;
5049Sis		}
5049Sis
5049Sis		if (sz2 == 1) {
5049Sis			if (is_it_toupper)
5049Sis				u8s2[0] = U8_ASCII_TOUPPER(*s2);
5049Sis			else
5049Sis				u8s2[0] = U8_ASCII_TOLOWER(*s2);
5049Sis			s2++;
5049Sis			u8s2[1] = '\0';
5049Sis		} else if ((i2 + sz2) > n2) {
*7012Sis			*errnum = EINVAL;
5049Sis			for (j = 0; (i2 + j) < n2; )
5049Sis				u8s2[j++] = *s2++;
5049Sis			u8s2[j] = '\0';
5049Sis		} else {
5049Sis			(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
5049Sis			s2 += sz2;
5049Sis		}
5049Sis
5049Sis		/* Now compare the two characters. */
5049Sis		if (sz1 == 1 && sz2 == 1) {
5049Sis			if (*u8s1 > *u8s2)
5049Sis				return (1);
5049Sis			if (*u8s1 < *u8s2)
5049Sis				return (-1);
5049Sis		} else {
5049Sis			f = strcmp((const char *)u8s1, (const char *)u8s2);
5049Sis			if (f != 0)
5049Sis				return (f);
5049Sis		}
5049Sis
5049Sis		/*
5049Sis		 * They were the same. Let's move on to the next
5049Sis		 * characters then.
5049Sis		 */
5049Sis		i1 += sz1;
5049Sis		i2 += sz2;
5049Sis	}
5049Sis
5049Sis	/*
5049Sis	 * We compared until the end of either or both strings.
5049Sis	 *
5049Sis	 * If we reached to or went over the ends for the both, that means
5049Sis	 * they are the same.
5049Sis	 *
5049Sis	 * If we reached only one of the two ends, that means the other string
5049Sis	 * has something which then the fact can be used to determine
5049Sis	 * the return value.
5049Sis	 */
5049Sis	if (i1 >= n1) {
5049Sis		if (i2 >= n2)
5049Sis			return (0);
5049Sis		return (-1);
5049Sis	}
5049Sis	return (1);
5049Sis}
5049Sis
5049Sis/*
5049Sis * The combining_class() function checks on the given bytes and find out
5049Sis * the corresponding Unicode combining class value. The return value 0 means
5049Sis * it is a Starter. Any illegal UTF-8 character will also be treated as
5049Sis * a Starter.
5049Sis */
5049Sisstatic uchar_t
5049Siscombining_class(size_t uv, uchar_t *s, size_t sz)
5049Sis{
5049Sis	uint16_t b1 = 0;
5049Sis	uint16_t b2 = 0;
5049Sis	uint16_t b3 = 0;
5049Sis	uint16_t b4 = 0;
5049Sis
5049Sis	if (sz == 1 || sz > 4)
5049Sis		return (0);
5049Sis
5049Sis	if (sz == 2) {
5049Sis		b3 = s[0];
5049Sis		b4 = s[1];
5049Sis	} else if (sz == 3) {
5049Sis		b2 = s[0];
5049Sis		b3 = s[1];
5049Sis		b4 = s[2];
5049Sis	} else if (sz == 4) {
5049Sis		b1 = s[0];
5049Sis		b2 = s[1];
5049Sis		b3 = s[2];
5049Sis		b4 = s[3];
5049Sis	}
5049Sis
5049Sis	b1 = u8_common_b1_tbl[uv][b1];
5049Sis	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
5049Sis		return (0);
5049Sis
5049Sis	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
5049Sis	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
5049Sis		return (0);
5049Sis
5049Sis	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
5049Sis	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
5049Sis		return (0);
5049Sis
5049Sis	return (u8_combining_class_b4_tbl[uv][b3][b4]);
5049Sis}
5049Sis
5049Sis/*
5049Sis * The do_decomp() function finds out a matching decomposition if any
5049Sis * and return. If there is no match, the input bytes are copied and returned.
5049Sis * The function also checks if there is a Hangul, decomposes it if necessary
5049Sis * and returns.
5049Sis *
5049Sis * To save time, a single byte 7-bit ASCII character should be handled by
5049Sis * the caller.
5049Sis *
5049Sis * The function returns the number of bytes returned sans always terminating
5049Sis * the null byte. It will also return a state that will tell if there was
5049Sis * a Hangul character decomposed which then will be used by the caller.
5049Sis */
5049Sisstatic size_t
5049Sisdo_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
5049Sis	boolean_t canonical_decomposition, u8_normalization_states_t *state)
5049Sis{
5049Sis	uint16_t b1 = 0;
5049Sis	uint16_t b2 = 0;
5049Sis	uint16_t b3 = 0;
5049Sis	uint16_t b3_tbl;
5049Sis	uint16_t b3_base;
5049Sis	uint16_t b4 = 0;
5049Sis	size_t start_id;
5049Sis	size_t end_id;
5049Sis	size_t i;
5049Sis	uint32_t u1;
5049Sis
5049Sis	if (sz == 2) {
5049Sis		b3 = u8s[0] = s[0];
5049Sis		b4 = u8s[1] = s[1];
5049Sis		u8s[2] = '\0';
5049Sis	} else if (sz == 3) {
5049Sis		/* Convert it to a Unicode scalar value. */
5049Sis		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
5049Sis
5049Sis		/*
5049Sis		 * If this is a Hangul syllable, we decompose it into
5049Sis		 * a leading consonant, a vowel, and an optional trailing
5049Sis		 * consonant and then return.
5049Sis		 */
5049Sis		if (U8_HANGUL_SYLLABLE(u1)) {
5049Sis			u1 -= U8_HANGUL_SYL_FIRST;
5049Sis
5049Sis			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
5049Sis			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
5049Sis			    / U8_HANGUL_T_COUNT;
5049Sis			b3 = u1 % U8_HANGUL_T_COUNT;
5049Sis
5049Sis			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
5049Sis			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
5049Sis			if (b3) {
5049Sis				b3 += U8_HANGUL_JAMO_T_FIRST;
5049Sis				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
5049Sis
5049Sis				u8s[9] = '\0';
5049Sis				*state = U8_STATE_HANGUL_LVT;
5049Sis				return (9);
5049Sis			}
5049Sis
5049Sis			u8s[6] = '\0';
5049Sis			*state = U8_STATE_HANGUL_LV;
5049Sis			return (6);
5049Sis		}
5049Sis
5049Sis		b2 = u8s[0] = s[0];
5049Sis		b3 = u8s[1] = s[1];
5049Sis		b4 = u8s[2] = s[2];
5049Sis		u8s[3] = '\0';
5049Sis
5049Sis		/*
5049Sis		 * If this is a Hangul Jamo, we know there is nothing
5049Sis		 * further that we can decompose.
5049Sis		 */
5049Sis		if (U8_HANGUL_JAMO_L(u1)) {
5049Sis			*state = U8_STATE_HANGUL_L;
5049Sis			return (3);
5049Sis		}
5049Sis
5049Sis		if (U8_HANGUL_JAMO_V(u1)) {
5049Sis			if (*state == U8_STATE_HANGUL_L)
5049Sis				*state = U8_STATE_HANGUL_LV;
5049Sis			else
5049Sis				*state = U8_STATE_HANGUL_V;
5049Sis			return (3);
5049Sis		}
5049Sis
5049Sis		if (U8_HANGUL_JAMO_T(u1)) {
5049Sis			if (*state == U8_STATE_HANGUL_LV)
5049Sis				*state = U8_STATE_HANGUL_LVT;
5049Sis			else
5049Sis				*state = U8_STATE_HANGUL_T;
5049Sis			return (3);
5049Sis		}
5049Sis	} else if (sz == 4) {
5049Sis		b1 = u8s[0] = s[0];
5049Sis		b2 = u8s[1] = s[1];
5049Sis		b3 = u8s[2] = s[2];
5049Sis		b4 = u8s[3] = s[3];
5049Sis		u8s[4] = '\0';
5049Sis	} else {
5049Sis		/*
5049Sis		 * This is a fallback and should not happen if the function
5049Sis		 * was called properly.
5049Sis		 */
5049Sis		u8s[0] = s[0];
5049Sis		u8s[1] = '\0';
5049Sis		*state = U8_STATE_START;
5049Sis		return (1);
5049Sis	}
5049Sis
5049Sis	/*
5049Sis	 * At this point, this rountine does not know what it would get.
5049Sis	 * The caller should sort it out if the state isn't a Hangul one.
5049Sis	 */
5049Sis	*state = U8_STATE_START;
5049Sis
5049Sis	/* Try to find matching decomposition mapping byte sequence. */
5049Sis	b1 = u8_common_b1_tbl[uv][b1];
5049Sis	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
5049Sis		return ((size_t)sz);
5049Sis
5049Sis	b2 = u8_decomp_b2_tbl[uv][b1][b2];
5049Sis	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
5049Sis		return ((size_t)sz);
5049Sis
5049Sis	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
5049Sis	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5049Sis		return ((size_t)sz);
5049Sis
5049Sis	/*
5049Sis	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
5049Sis	 * which is 0x8000, this means we couldn't fit the mappings into
5049Sis	 * the cardinality of a unsigned byte.
5049Sis	 */
5049Sis	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
5049Sis		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
5049Sis		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
5049Sis		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
5049Sis	} else {
5049Sis		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
5049Sis		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
5049Sis	}
5049Sis
5049Sis	/* This also means there wasn't any matching decomposition. */
5049Sis	if (start_id >= end_id)
5049Sis		return ((size_t)sz);
5049Sis
5049Sis	/*
5049Sis	 * The final table for decomposition mappings has three types of
5049Sis	 * byte sequences depending on whether a mapping is for compatibility
5049Sis	 * decomposition, canonical decomposition, or both like the following:
5049Sis	 *
5049Sis	 * (1) Compatibility decomposition mappings:
5049Sis	 *
5049Sis	 *	+---+---+-...-+---+
5049Sis	 *	| B0| B1| ... | Bm|
5049Sis	 *	+---+---+-...-+---+
5049Sis	 *
5049Sis	 *	The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
5049Sis	 *
5049Sis	 * (2) Canonical decomposition mappings:
5049Sis	 *
5049Sis	 *	+---+---+---+-...-+---+
5049Sis	 *	| T | b0| b1| ... | bn|
5049Sis	 *	+---+---+---+-...-+---+
5049Sis	 *
5049Sis	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
5049Sis	 *
5049Sis	 * (3) Both mappings:
5049Sis	 *
5049Sis	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
5049Sis	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
5049Sis	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
5049Sis	 *
5049Sis	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
5049Sis	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
5049Sis	 *	compatibility mapping bytes.
5049Sis	 *
5049Sis	 * Note that compatibility decomposition means doing recursive
5049Sis	 * decompositions using both compatibility decomposition mappings and
5049Sis	 * canonical decomposition mappings. On the other hand, canonical
5049Sis	 * decomposition means doing recursive decompositions using only
5049Sis	 * canonical decomposition mappings. Since the table we have has gone
5049Sis	 * through the recursions already, we do not need to do so during
5049Sis	 * runtime, i.e., the table has been completely flattened out
5049Sis	 * already.
5049Sis	 */
5049Sis
5049Sis	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
5049Sis
5049Sis	/* Get the type, T, of the byte sequence. */
5049Sis	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
5049Sis
5049Sis	/*
5049Sis	 * If necessary, adjust start_id, end_id, or both. Note that if
5049Sis	 * this is compatibility decomposition mapping, there is no
5049Sis	 * adjustment.
5049Sis	 */
5049Sis	if (canonical_decomposition) {
5049Sis		/* Is the mapping only for compatibility decomposition? */
5049Sis		if (b1 < U8_DECOMP_BOTH)
5049Sis			return ((size_t)sz);
5049Sis
5049Sis		start_id++;
5049Sis
5049Sis		if (b1 == U8_DECOMP_BOTH) {
5049Sis			end_id = start_id +
5049Sis			    u8_decomp_final_tbl[uv][b3_base + start_id];
5049Sis			start_id++;
5049Sis		}
5049Sis	} else {
5049Sis		/*
5049Sis		 * Unless this is a compatibility decomposition mapping,
5049Sis		 * we adjust the start_id.
5049Sis		 */
5049Sis		if (b1 == U8_DECOMP_BOTH) {
5049Sis			start_id++;
5049Sis			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
5049Sis		} else if (b1 == U8_DECOMP_CANONICAL) {
5049Sis			start_id++;
5049Sis		}
5049Sis	}
5049Sis
5049Sis	for (i = 0; start_id < end_id; start_id++)
5049Sis		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
5049Sis	u8s[i] = '\0';
5049Sis
5049Sis	return (i);
5049Sis}
5049Sis
5049Sis/*
5049Sis * The find_composition_start() function uses the character bytes given and
5049Sis * find out the matching composition mappings if any and return the address
5049Sis * to the composition mappings as explained in the do_composition().
5049Sis */
5049Sisstatic uchar_t *
5049Sisfind_composition_start(size_t uv, uchar_t *s, size_t sz)
5049Sis{
5049Sis	uint16_t b1 = 0;
5049Sis	uint16_t b2 = 0;
5049Sis	uint16_t b3 = 0;
5049Sis	uint16_t b3_tbl;
5049Sis	uint16_t b3_base;
5049Sis	uint16_t b4 = 0;
5049Sis	size_t start_id;
5049Sis	size_t end_id;
5049Sis
5049Sis	if (sz == 1) {
5049Sis		b4 = s[0];
5049Sis	} else if (sz == 2) {
5049Sis		b3 = s[0];
5049Sis		b4 = s[1];
5049Sis	} else if (sz == 3) {
5049Sis		b2 = s[0];
5049Sis		b3 = s[1];
5049Sis		b4 = s[2];
5049Sis	} else if (sz == 4) {
5049Sis		b1 = s[0];
5049Sis		b2 = s[1];
5049Sis		b3 = s[2];
5049Sis		b4 = s[3];
5049Sis	} else {
5049Sis		/*
5049Sis		 * This is a fallback and should not happen if the function
5049Sis		 * was called properly.
5049Sis		 */
5049Sis		return (NULL);
5049Sis	}
5049Sis
5049Sis	b1 = u8_composition_b1_tbl[uv][b1];
5049Sis	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
5049Sis		return (NULL);
5049Sis
5049Sis	b2 = u8_composition_b2_tbl[uv][b1][b2];
5049Sis	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
5049Sis		return (NULL);
5049Sis
5049Sis	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
5049Sis	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5049Sis		return (NULL);
5049Sis
5049Sis	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
5049Sis		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
5049Sis		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
5049Sis		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
5049Sis	} else {
5049Sis		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
5049Sis		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
5049Sis	}
5049Sis
5049Sis	if (start_id >= end_id)
5049Sis		return (NULL);
5049Sis
5049Sis	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
5049Sis
5049Sis	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
5049Sis}
5049Sis
5049Sis/*
5049Sis * The blocked() function checks on the combining class values of previous
5049Sis * characters in this sequence and return whether it is blocked or not.
5049Sis */
5049Sisstatic boolean_t
5049Sisblocked(uchar_t *comb_class, size_t last)
5049Sis{
5049Sis	uchar_t my_comb_class;
5049Sis	size_t i;
5049Sis
5049Sis	my_comb_class = comb_class[last];
5049Sis	for (i = 1; i < last; i++)
5049Sis		if (comb_class[i] >= my_comb_class ||
5049Sis		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
5049Sis			return (B_TRUE);
5049Sis
5049Sis	return (B_FALSE);
5049Sis}
5049Sis
5049Sis/*
5049Sis * The do_composition() reads the character string pointed by 's' and
5049Sis * do necessary canonical composition and then copy over the result back to
5049Sis * the 's'.
5049Sis *
5049Sis * The input argument 's' cannot contain more than 32 characters.
5049Sis */
/*
 * Arguments, as used by the code below:
 *
 *	uv		Unicode version index handed to
 *			find_composition_start() to select the tables.
 *	s		The collected sequence; it is overwritten with the
 *			composed result and terminated with a null byte.
 *	comb_class[i]	Combining class of the i-th character in 's'.
 *	start[i]	Byte offset of the i-th character within 's'.
 *	disp[i]		Byte length of the i-th character.
 *	last		Index of the last character; a value of
 *			U8_MAX_CHARS_A_SEQ or more is replaced with
 *			U8_UPPER_LIMIT_IN_A_SEQ.
 *	os, oslast	*os is the current read position in the caller's
 *			source string and oslast is its end. When the
 *			sequence ends with a Starter, following source
 *			characters that compose with it are consumed and
 *			*os is moved past them; otherwise *os is not written.
 *
 * The return value is the byte length of the result stored in 's', not
 * counting the terminating null byte.
 */
5049Sisstatic size_t
5049Sisdo_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
5049Sis	uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
5049Sis{
5049Sis	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
5049Sis	uchar_t tc[U8_MB_CUR_MAX];
5049Sis	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
5049Sis	size_t saved_marks_count;
5049Sis	uchar_t *p;
5049Sis	uchar_t *saved_p;
5049Sis	uchar_t *q;
5049Sis	size_t i;
5049Sis	size_t saved_i;
5049Sis	size_t j;
5049Sis	size_t k;
5049Sis	size_t l;
5049Sis	size_t C;
5049Sis	size_t saved_l;
5049Sis	size_t size;
5049Sis	uint32_t u1;
5049Sis	uint32_t u2;
5049Sis	boolean_t match_not_found = B_TRUE;
5049Sis
5049Sis	/*
5049Sis	 * This should never happen unless the callers are doing some strange
5049Sis	 * and unexpected things.
5049Sis	 *
5049Sis	 * The "last" is the index pointing to the last character not last + 1.
5049Sis	 */
5049Sis	if (last >= U8_MAX_CHARS_A_SEQ)
5049Sis		last = U8_UPPER_LIMIT_IN_A_SEQ;
5049Sis
	/*
	 * 'i' walks the characters of 's' while 'l' is the number of bytes
	 * written to the temporary buffer t[] so far.
	 */
5049Sis	for (i = l = 0; i <= last; i++) {
5049Sis		/*
5049Sis		 * The last or any non-Starters at the beginning, we don't
5049Sis		 * have any chance to do composition and so we just copy them
5049Sis		 * to the temporary buffer.
5049Sis		 */
5049Sis		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
5049SisSAVE_THE_CHAR:
5049Sis			p = s + start[i];
5049Sis			size = disp[i];
5049Sis			for (k = 0; k < size; k++)
5049Sis				t[l++] = *p++;
5049Sis			continue;
5049Sis		}
5049Sis
5049Sis		/*
5049Sis		 * If this could be a start of Hangul Jamos, then, we try to
5049Sis		 * conjoin them.
5049Sis		 */
5049Sis		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
			/*
			 * u1 is built from the three bytes of this character
			 * and u2 from the three bytes that follow it in 's'.
			 */
5049Sis			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
5049Sis			    s[start[i] + 1], s[start[i] + 2]);
5049Sis			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
5049Sis			    s[start[i] + 4], s[start[i] + 5]);
5049Sis
5049Sis			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
5049Sis				u1 -= U8_HANGUL_JAMO_L_FIRST;
5049Sis				u2 -= U8_HANGUL_JAMO_V_FIRST;
5049Sis				u1 = U8_HANGUL_SYL_FIRST +
5049Sis				    (u1 * U8_HANGUL_V_COUNT + u2) *
5049Sis				    U8_HANGUL_T_COUNT;
5049Sis
				/*
				 * Step to where a T Jamo would be. The i--
				 * below leaves 'i' on the last Jamo consumed
				 * so that the i++ of the for() loop moves
				 * past it.
				 */
5049Sis				i += 2;
5049Sis				if (i <= last) {
5049Sis					U8_PUT_3BYTES_INTO_UTF32(u2,
5049Sis					    s[start[i]], s[start[i] + 1],
5049Sis					    s[start[i] + 2]);
5049Sis
5049Sis					if (U8_HANGUL_JAMO_T(u2)) {
5049Sis						u1 += u2 -
5049Sis						    U8_HANGUL_JAMO_T_FIRST;
5049Sis						i++;
5049Sis					}
5049Sis				}
5049Sis
5049Sis				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
5049Sis				i--;
5049Sis				l += 3;
5049Sis				continue;
5049Sis			}
5049Sis		}
5049Sis
5049Sis		/*
5049Sis		 * Let's then find out if this Starter has composition
5049Sis		 * mapping.
5049Sis		 */
5049Sis		p = find_composition_start(uv, s + start[i], disp[i]);
5049Sis		if (p == NULL)
5049Sis			goto SAVE_THE_CHAR;
5049Sis
5049Sis		/*
5049Sis		 * We have a Starter with composition mapping and the next
5049Sis		 * character is a non-Starter. Let's try to find out if
5049Sis		 * we can do composition.
5049Sis		 */
5049Sis
		/*
		 * Remember the table entry, the Starter index, and the
		 * output position so that later attempts can restart from
		 * them.
		 */
5049Sis		saved_p = p;
5049Sis		saved_i = i;
5049Sis		saved_l = l;
5049Sis		saved_marks_count = 0;
5049Sis
5049SisTRY_THE_NEXT_MARK:
5049Sis		q = s + start[++i];
5049Sis		size = disp[i];
5049Sis
5049Sis		/*
5049Sis		 * The next for() loop compares the non-Starter pointed by
5049Sis		 * 'q' with the possible (joinable) characters pointed by 'p'.
5049Sis		 *
5049Sis		 * The composition final table entry pointed by the 'p'
5049Sis		 * looks like the following:
5049Sis		 *
5049Sis		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
5049Sis		 * | C | b0| b1| ... | bn| F | B0| B1| ... | Bm| F |
5049Sis		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
5049Sis		 *
5049Sis		 * where C is the count byte indicating the number of
5049Sis		 * mapping pairs where each pair would look like
5049Sis		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
5049Sis		 * character of a canonical decomposition and the B0-Bm are
5049Sis		 * the bytes of a matching composite character. The F is
5049Sis		 * a filler byte after each character as the separator.
5049Sis		 */
5049Sis
5049Sis		match_not_found = B_TRUE;
5049Sis
5049Sis		for (C = *p++; C > 0; C--) {
5049Sis			for (k = 0; k < size; p++, k++)
5049Sis				if (*p != q[k])
5049Sis					break;
5049Sis
5049Sis			/* Have we found it? */
5049Sis			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
5049Sis				match_not_found = B_FALSE;
5049Sis
				/*
				 * Overwrite whatever was written since the
				 * Starter with the bytes of the composite.
				 *
				 * NOTE(review): unlike the similar loop near
				 * the end of this function, this copy into
				 * t[] has no check against
				 * U8_STREAM_SAFE_TEXT_MAX -- confirm that the
				 * table data cannot overflow t[].
				 */
5049Sis				l = saved_l;
5049Sis
5049Sis				while (*++p != U8_TBL_ELEMENT_FILLER)
5049Sis					t[l++] = *p;
5049Sis
5049Sis				break;
5049Sis			}
5049Sis
5049Sis			/* We didn't find; skip to the next pair. */
5049Sis			if (*p != U8_TBL_ELEMENT_FILLER)
5049Sis				while (*++p != U8_TBL_ELEMENT_FILLER)
5049Sis					;
5049Sis			while (*++p != U8_TBL_ELEMENT_FILLER)
5049Sis				;
5049Sis			p++;
5049Sis		}
5049Sis
5049Sis		/*
5049Sis		 * If there was no match, we will need to save the combining
5049Sis		 * mark for later appending. After that, if the next one
5049Sis		 * is a non-Starter and not blocked, then, we try once
5049Sis		 * again to do composition with the next non-Starter.
5049Sis		 *
5049Sis		 * If there was no match and this was a Starter, then,
5049Sis		 * this is a new start.
5049Sis		 *
5049Sis		 * If there was a match and a composition done and we have
5049Sis		 * more to check on, then, we retrieve a new composition final
5049Sis		 * table entry for the composite and then try to do the
5049Sis		 * composition again.
5049Sis		 */
5049Sis
5049Sis		if (match_not_found) {
5049Sis			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
5049Sis				i--;
5049Sis				goto SAVE_THE_CHAR;
5049Sis			}
5049Sis
			/*
			 * Remember the index of this unmatched mark; the
			 * saved marks are appended to t[] at the bottom of
			 * this loop body.
			 */
5049Sis			saved_marks[saved_marks_count++] = i;
5049Sis		}
5049Sis
		/*
		 * saved_l == l means that no composite has been written for
		 * the current Starter so far.
		 */
5049Sis		if (saved_l == l) {
5049Sis			while (i < last) {
5049Sis				if (blocked(comb_class, i + 1))
5049Sis					saved_marks[saved_marks_count++] = ++i;
5049Sis				else
5049Sis					break;
5049Sis			}
5049Sis			if (i < last) {
5049Sis				p = saved_p;
5049Sis				goto TRY_THE_NEXT_MARK;
5049Sis			}
5049Sis		} else if (i < last) {
			/*
			 * NOTE(review): this path retries with the next
			 * character without calling blocked() first --
			 * confirm that this is intended.
			 */
5049Sis			p = find_composition_start(uv, t + saved_l,
5049Sis			    l - saved_l);
5049Sis			if (p != NULL) {
5049Sis				saved_p = p;
5049Sis				goto TRY_THE_NEXT_MARK;
5049Sis			}
5049Sis		}
5049Sis
5049Sis		/*
5049Sis		 * There is no more composition possible.
5049Sis		 *
5049Sis		 * If there was no composition whatsoever then we copy
5049Sis		 * over the original Starter and then append any non-Starters
5049Sis		 * remaining at the target string sequentially after that.
5049Sis		 */
5049Sis
5049Sis		if (saved_l == l) {
5049Sis			p = s + start[saved_i];
5049Sis			size = disp[saved_i];
5049Sis			for (j = 0; j < size; j++)
5049Sis				t[l++] = *p++;
5049Sis		}
5049Sis
5049Sis		for (k = 0; k < saved_marks_count; k++) {
5049Sis			p = s + start[saved_marks[k]];
5049Sis			size = disp[saved_marks[k]];
5049Sis			for (j = 0; j < size; j++)
5049Sis				t[l++] = *p++;
5049Sis		}
5049Sis	}
5049Sis
5049Sis	/*
5049Sis	 * If the last character is a Starter and if we have a character
5049Sis	 * (possibly another Starter) that can be turned into a composite,
5049Sis	 * we do so and we do so until there is no more of composition
5049Sis	 * possible.
5049Sis	 */
5049Sis	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
5049Sis		p = *os;
		/* The last character occupies the final disp[last] bytes. */
5049Sis		saved_l = l - disp[last];
5049Sis
5049Sis		while (p < oslast) {
5049Sis			size = u8_number_of_bytes[*p];
			/*
			 * Stop at a single byte or invalid character and at
			 * a character that is cut short by oslast.
			 */
5049Sis			if (size <= 1 || (p + size) > oslast)
5049Sis				break;
5049Sis
5049Sis			saved_p = p;
5049Sis
5049Sis			for (i = 0; i < size; i++)
5049Sis				tc[i] = *p++;
5049Sis
5049Sis			q = find_composition_start(uv, t + saved_l,
5049Sis			    l - saved_l);
5049Sis			if (q == NULL) {
5049Sis				p = saved_p;
5049Sis				break;
5049Sis			}
5049Sis
5049Sis			match_not_found = B_TRUE;
5049Sis
5049Sis			for (C = *q++; C > 0; C--) {
5049Sis				for (k = 0; k < size; q++, k++)
5049Sis					if (*q != tc[k])
5049Sis						break;
5049Sis
5049Sis				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
5049Sis					match_not_found = B_FALSE;
5049Sis
5049Sis					l = saved_l;
5049Sis
5049Sis					while (*++q != U8_TBL_ELEMENT_FILLER) {
5049Sis						/*
5049Sis						 * This is practically
5049Sis						 * impossible but we don't
5049Sis						 * want to take any chances.
5049Sis						 */
5049Sis						if (l >=
5049Sis						    U8_STREAM_SAFE_TEXT_MAX) {
5049Sis							p = saved_p;
5049Sis							goto SAFE_RETURN;
5049Sis						}
5049Sis						t[l++] = *q;
5049Sis					}
5049Sis
5049Sis					break;
5049Sis				}
5049Sis
5049Sis				if (*q != U8_TBL_ELEMENT_FILLER)
5049Sis					while (*++q != U8_TBL_ELEMENT_FILLER)
5049Sis						;
5049Sis				while (*++q != U8_TBL_ELEMENT_FILLER)
5049Sis					;
5049Sis				q++;
5049Sis			}
5049Sis
			/* Give the unmatched character back to the caller. */
5049Sis			if (match_not_found) {
5049Sis				p = saved_p;
5049Sis				break;
5049Sis			}
5049Sis		}
5049SisSAFE_RETURN:
5049Sis		*os = p;
5049Sis	}
5049Sis
5049Sis	/*
5049Sis	 * Now we copy over the temporary string to the target string.
5049Sis	 * Since composition always reduces the number of characters or
5049Sis	 * the number of characters stay, we don't need to worry about
5049Sis	 * the buffer overflow here.
5049Sis	 */
5049Sis	for (i = 0; i < l; i++)
5049Sis		s[i] = t[i];
5049Sis	s[l] = '\0';
5049Sis
5049Sis	return (l);
5049Sis}
5049Sis
5049Sis/*
5049Sis * The collect_a_seq() function checks on the given string s, collect
5049Sis * a sequence of characters at u8s, and return the sequence. While it collects
5049Sis * a sequence, it also applies case conversion, canonical or compatibility
5049Sis * decomposition, canonical composition, or some or all of them and
5049Sis * in that order.
5049Sis *
5049Sis * The collected sequence cannot be bigger than 32 characters since if
5049Sis * it is having more than 31 characters, the sequence will be terminated
5049Sis * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
5049Sis * a Stream-Safe Text. The collected sequence is always terminated with
5049Sis * a null byte and the return value is the byte length of the sequence
5049Sis * including 0. The return value does not include the terminating
5049Sis * null byte.
5049Sis */
/*
 * Arguments, as used by the code below:
 *
 *	uv		Unicode version index passed on to the table driven
 *			helper functions.
 *	u8s		Output buffer for the collected sequence.
 *	source		In/out: *source is where collecting starts and it is
 *			updated to point just past the bytes consumed.
 *	slast		End of the input; nothing at or beyond slast is
 *			collected.
 *	is_it_toupper, is_it_tolower
 *			Request case conversion of the leading character
 *			(is_it_toupper also drives the U+0345 special case
 *			further below).
 *	canonical_decomposition, compatibility_decomposition
 *			If either one is set, the leading character is
 *			decomposed and the following combining marks or
 *			conjoinable Hangul Jamos are collected and put into
 *			canonical order.
 *	canonical_composition
 *			If set (together with a decomposition), the result is
 *			composed by do_composition().
 *	errnum		Set to EILSEQ if u8_number_of_bytes[] gives a negative
 *			size for the first byte and to EINVAL if the first
 *			character is cut short by slast; this function does
 *			not write it otherwise.
 *	state		In/out normalization state used for the Hangul and
 *			the combining mark decisions.
 */
5049Sisstatic size_t
5049Siscollect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
5049Sis	boolean_t is_it_toupper,
5049Sis	boolean_t is_it_tolower,
5049Sis	boolean_t canonical_decomposition,
5049Sis	boolean_t compatibility_decomposition,
5049Sis	boolean_t canonical_composition,
*7012Sis	int *errnum, u8_normalization_states_t *state)
5049Sis{
5049Sis	uchar_t *s;
5049Sis	int sz;
5049Sis	int saved_sz;
5049Sis	size_t i;
5049Sis	size_t j;
5049Sis	size_t k;
5049Sis	size_t l;
5049Sis	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
5049Sis	uchar_t disp[U8_MAX_CHARS_A_SEQ];
5049Sis	uchar_t start[U8_MAX_CHARS_A_SEQ];
5049Sis	uchar_t u8t[U8_MB_CUR_MAX];
5049Sis	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
5049Sis	uchar_t tc;
5049Sis	size_t last;
5049Sis	size_t saved_last;
5049Sis	uint32_t u1;
5049Sis
5049Sis	/*
5049Sis	 * Save the source string pointer which we will return a changed
5049Sis	 * pointer if we do processing.
5049Sis	 */
5049Sis	s = *source;
5049Sis
5049Sis	/*
5049Sis	 * The following is a fallback for just in case callers are not
5049Sis	 * checking the string boundaries before the calling.
5049Sis	 */
5049Sis	if (s >= slast) {
5049Sis		u8s[0] = '\0';
5049Sis
5049Sis		return (0);
5049Sis	}
5049Sis
5049Sis	/*
5049Sis	 * As the first thing, let's collect a character and do case
5049Sis	 * conversion if necessary.
5049Sis	 */
5049Sis
5049Sis	sz = u8_number_of_bytes[*s];
5049Sis
	/* An invalid leading byte is returned alone as a one byte sequence. */
5049Sis	if (sz < 0) {
*7012Sis		*errnum = EILSEQ;
5049Sis
5049Sis		u8s[0] = *s++;
5049Sis		u8s[1] = '\0';
5049Sis
5049Sis		*source = s;
5049Sis
5049Sis		return (1);
5049Sis	}
5049Sis
5049Sis	if (sz == 1) {
5049Sis		if (is_it_toupper)
5049Sis			u8s[0] = U8_ASCII_TOUPPER(*s);
5049Sis		else if (is_it_tolower)
5049Sis			u8s[0] = U8_ASCII_TOLOWER(*s);
5049Sis		else
5049Sis			u8s[0] = *s;
5049Sis		s++;
5049Sis		u8s[1] = '\0';
5049Sis	} else if ((s + sz) > slast) {
		/*
		 * The character is cut short by the end of the input; the
		 * remaining bytes are copied as they are.
		 */
*7012Sis		*errnum = EINVAL;
5049Sis
5049Sis		for (i = 0; s < slast; )
5049Sis			u8s[i++] = *s++;
5049Sis		u8s[i] = '\0';
5049Sis
5049Sis		*source = s;
5049Sis
5049Sis		return (i);
5049Sis	} else {
5049Sis		if (is_it_toupper || is_it_tolower) {
			/* From here on, 'sz' is the size after the conversion. */
5049Sis			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
5049Sis			s += sz;
5049Sis			sz = i;
5049Sis		} else {
5049Sis			for (i = 0; i < sz; )
5049Sis				u8s[i++] = *s++;
5049Sis			u8s[i] = '\0';
5049Sis		}
5049Sis	}
5049Sis
5049Sis	/*
5049Sis	 * And then canonical/compatibility decomposition followed by
5049Sis	 * an optional canonical composition. Please note that
5049Sis	 * canonical composition is done only when a decomposition is
5049Sis	 * done.
5049Sis	 */
5049Sis	if (canonical_decomposition || compatibility_decomposition) {
		/*
		 * In this block, 'saved_sz' is the number of bytes stored in
		 * u8s[] so far and 'last' is the number of characters
		 * recorded in comb_class[], start[], and disp[].
		 */
5049Sis		if (sz == 1) {
5049Sis			*state = U8_STATE_START;
5049Sis
5049Sis			saved_sz = 1;
5049Sis
5049Sis			comb_class[0] = 0;
5049Sis			start[0] = 0;
5049Sis			disp[0] = 1;
5049Sis
5049Sis			last = 1;
5049Sis		} else {
5049Sis			saved_sz = do_decomp(uv, u8s, u8s, sz,
5049Sis			    canonical_decomposition, state);
5049Sis
5049Sis			last = 0;
5049Sis
			/*
			 * NOTE(review): this assumes that every leading byte
			 * in the decomposed text has a positive size in
			 * u8_number_of_bytes[]; a non-positive value would
			 * keep this loop from advancing -- confirm.
			 */
5049Sis			for (i = 0; i < saved_sz; ) {
5049Sis				sz = u8_number_of_bytes[u8s[i]];
5049Sis
5049Sis				comb_class[last] = combining_class(uv,
5049Sis				    u8s + i, sz);
5049Sis				start[last] = i;
5049Sis				disp[last] = sz;
5049Sis
5049Sis				last++;
5049Sis				i += sz;
5049Sis			}
5049Sis
5049Sis			/*
5049Sis			 * Decomposition yields various Hangul related
5049Sis			 * states but not on combining marks. We need to
5049Sis			 * find out at here by checking on the last
5049Sis			 * character.
5049Sis			 */
5049Sis			if (*state == U8_STATE_START) {
5049Sis				if (comb_class[last - 1])
5049Sis					*state = U8_STATE_COMBINING_MARK;
5049Sis			}
5049Sis		}
5049Sis
		/* Number of characters that came from the leading character. */
5049Sis		saved_last = last;
5049Sis
5049Sis		while (s < slast) {
5049Sis			sz = u8_number_of_bytes[*s];
5049Sis
5049Sis			/*
5049Sis			 * If this is an illegal character, an incomplete
5049Sis			 * character, or a 7-bit ASCII Starter character,
5049Sis			 * then we have collected a sequence; break and let
5049Sis			 * the next call deal with the two cases.
5049Sis			 *
5049Sis			 * Note that this is okay only if you are using this
5049Sis			 * function with a fixed length string, not on
5049Sis			 * a buffer with multiple calls of one chunk at a time.
5049Sis			 */
5049Sis			if (sz <= 1) {
5049Sis				break;
5049Sis			} else if ((s + sz) > slast) {
5049Sis				break;
5049Sis			} else {
5049Sis				/*
5049Sis				 * If the previous character was a Hangul Jamo
5049Sis				 * and this character is a Hangul Jamo that
5049Sis				 * can be conjoined, we collect the Jamo.
5049Sis				 */
5049Sis				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
5049Sis					U8_PUT_3BYTES_INTO_UTF32(u1,
5049Sis					    *s, *(s + 1), *(s + 2));
5049Sis
5049Sis					if (U8_HANGUL_COMPOSABLE_L_V(*state,
5049Sis					    u1)) {
5049Sis						i = 0;
5049Sis						*state = U8_STATE_HANGUL_LV;
5049Sis						goto COLLECT_A_HANGUL;
5049Sis					}
5049Sis
5049Sis					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
5049Sis					    u1)) {
5049Sis						i = 0;
5049Sis						*state = U8_STATE_HANGUL_LVT;
5049Sis						goto COLLECT_A_HANGUL;
5049Sis					}
5049Sis				}
5049Sis
5049Sis				/*
5049Sis				 * Regardless of whatever it was, if this is
5049Sis				 * a Starter, we don't collect the character
5049Sis				 * since that's a new start and we will deal
5049Sis				 * with it at the next time.
5049Sis				 */
5049Sis				i = combining_class(uv, s, sz);
5049Sis				if (i == U8_COMBINING_CLASS_STARTER)
5049Sis					break;
5049Sis
5049Sis				/*
5049Sis				 * We know the current character is a combining
5049Sis				 * mark. If the previous character wasn't
5049Sis				 * a Starter (not Hangul) or a combining mark,
5049Sis				 * then, we don't collect this combining mark.
5049Sis				 */
5049Sis				if (*state != U8_STATE_START &&
5049Sis				    *state != U8_STATE_COMBINING_MARK)
5049Sis					break;
5049Sis
5049Sis				*state = U8_STATE_COMBINING_MARK;
5049SisCOLLECT_A_HANGUL:
5049Sis				/*
5049Sis				 * If we collected a Starter and combining
5049Sis				 * marks up to 30, i.e., total 31 characters,
5049Sis				 * then, we terminate this degenerately long
5049Sis				 * combining sequence with a U+034F COMBINING
5049Sis				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
5049Sis				 * UTF-8 and turn this into a Stream-Safe
5049Sis				 * Text. This will be extremely rare but
5049Sis				 * possible.
5049Sis				 *
5049Sis				 * The following will also guarantee that
5049Sis				 * we are not writing more than 32 characters
5049Sis				 * plus a NULL at u8s[].
5049Sis				 */
5049Sis				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
5049SisTURN_STREAM_SAFE:
5049Sis					*state = U8_STATE_START;
5049Sis					comb_class[last] = 0;
5049Sis					start[last] = saved_sz;
5049Sis					disp[last] = 2;
5049Sis					last++;
5049Sis
5049Sis					u8s[saved_sz++] = 0xCD;
5049Sis					u8s[saved_sz++] = 0x8F;
5049Sis
5049Sis					break;
5049Sis				}
5049Sis
5049Sis				/*
5049Sis				 * Some combining marks also do decompose into
5049Sis				 * another combining mark or marks.
5049Sis				 */
5049Sis				if (*state == U8_STATE_COMBINING_MARK) {
					/*
					 * 'k' keeps the character count and
					 * 'l' the source byte size so that
					 * the mark can be dropped again if
					 * its decomposition does not fit.
					 */
5049Sis					k = last;
5049Sis					l = sz;
5049Sis					i = do_decomp(uv, uts, s, sz,
5049Sis					    canonical_decomposition, state);
5049Sis					for (j = 0; j < i; ) {
5049Sis						sz = u8_number_of_bytes[uts[j]];
5049Sis
5049Sis						comb_class[last] =
5049Sis						    combining_class(uv,
5049Sis						    uts + j, sz);
5049Sis						start[last] = saved_sz + j;
5049Sis						disp[last] = sz;
5049Sis
5049Sis						last++;
5049Sis						if (last >=
5049Sis						    U8_UPPER_LIMIT_IN_A_SEQ) {
5049Sis							last = k;
5049Sis							goto TURN_STREAM_SAFE;
5049Sis						}
5049Sis						j += sz;
5049Sis					}
5049Sis
5049Sis					*state = U8_STATE_COMBINING_MARK;
5049Sis					sz = i;
5049Sis					s += l;
5049Sis
5049Sis					for (i = 0; i < sz; i++)
5049Sis						u8s[saved_sz++] = uts[i];
5049Sis				} else {
					/*
					 * A Hangul Jamo; 'i' was set to 0
					 * before the jump to
					 * COLLECT_A_HANGUL.
					 */
5049Sis					comb_class[last] = i;
5049Sis					start[last] = saved_sz;
5049Sis					disp[last] = sz;
5049Sis					last++;
5049Sis
5049Sis					for (i = 0; i < sz; i++)
5049Sis						u8s[saved_sz++] = *s++;
5049Sis				}
5049Sis
5049Sis				/*
5049Sis				 * If this is U+0345 COMBINING GREEK
5049Sis				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
5049Sis				 * iota subscript, and need to be converted to
5049Sis				 * uppercase letter, convert it to U+0399 GREEK
5049Sis				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
5049Sis				 * i.e., convert to capital adscript form as
5049Sis				 * specified in the Unicode standard.
5049Sis				 *
5049Sis				 * This is the only special case of (ambiguous)
5049Sis				 * case conversion at combining marks and
5049Sis				 * probably the standard will never have
5049Sis				 * anything similar like this in future.
5049Sis				 */
5049Sis				if (is_it_toupper && sz >= 2 &&
5049Sis				    u8s[saved_sz - 2] == 0xCD &&
5049Sis				    u8s[saved_sz - 1] == 0x85) {
5049Sis					u8s[saved_sz - 2] = 0xCE;
5049Sis					u8s[saved_sz - 1] = 0x99;
5049Sis				}
5049Sis			}
5049Sis		}
5049Sis
5049Sis		/*
5049Sis		 * Let's try to ensure a canonical ordering for the collected
5049Sis		 * combining marks. We do this only if we have collected
5049Sis		 * at least one more non-Starter. (The decomposition mapping
5049Sis		 * data tables have fully (and recursively) expanded and
5049Sis		 * canonically ordered decompositions.)
5049Sis		 *
5049Sis		 * The U8_SWAP_COMB_MARKS() convenience macro has some
5049Sis		 * assumptions and we are meeting the assumptions.
5049Sis		 */
		/* From here on, 'last' is the index of the last character. */
5049Sis		last--;
5049Sis		if (last >= saved_last) {
5049Sis			for (i = 0; i < last; i++)
5049Sis				for (j = last; j > i; j--)
5049Sis					if (comb_class[j] &&
5049Sis					    comb_class[j - 1] > comb_class[j]) {
5049Sis						U8_SWAP_COMB_MARKS(j - 1, j);
5049Sis					}
5049Sis		}
5049Sis
5049Sis		*source = s;
5049Sis
5049Sis		if (! canonical_composition) {
5049Sis			u8s[saved_sz] = '\0';
5049Sis			return (saved_sz);
5049Sis		}
5049Sis
5049Sis		/*
5049Sis		 * Now do the canonical composition. Note that we do this
5049Sis		 * only after a canonical or compatibility decomposition to
5049Sis		 * finish up NFC or NFKC.
5049Sis		 */
5049Sis		sz = do_composition(uv, u8s, comb_class, start, disp, last,
5049Sis		    &s, slast);
5049Sis	}
5049Sis
5049Sis	*source = s;
5049Sis
5049Sis	return ((size_t)sz);
5049Sis}
5049Sis
5049Sis/*
5049Sis * The do_norm_compare() function does string comparion based on Unicode
5049Sis * simple case mappings and Unicode Normalization definitions.
5049Sis *
5049Sis * It does so by collecting a sequence of character at a time and comparing
5049Sis * the collected sequences from the strings.
5049Sis *
5049Sis * The meanings on the return values are the same as the usual strcmp().
5049Sis */
5049Sisstatic int
5049Sisdo_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
*7012Sis	int flag, int *errnum)
5049Sis{
5049Sis	int result;
5049Sis	size_t sz1;
5049Sis	size_t sz2;
5049Sis	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
5049Sis	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
5049Sis	uchar_t *s1last;
5049Sis	uchar_t *s2last;
5049Sis	boolean_t is_it_toupper;
5049Sis	boolean_t is_it_tolower;
5049Sis	boolean_t canonical_decomposition;
5049Sis	boolean_t compatibility_decomposition;
5049Sis	boolean_t canonical_composition;
5049Sis	u8_normalization_states_t state;
5049Sis
5049Sis	s1last = s1 + n1;
5049Sis	s2last = s2 + n2;
5049Sis
5049Sis	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
5049Sis	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
5049Sis	canonical_decomposition = flag & U8_CANON_DECOMP;
5049Sis	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
5049Sis	canonical_composition = flag & U8_CANON_COMP;
5049Sis
5049Sis	while (s1 < s1last && s2 < s2last) {
5049Sis		/*
5049Sis		 * If the current character is a 7-bit ASCII and the last
5049Sis		 * character, or, if the current character and the next
5049Sis		 * character are both some 7-bit ASCII characters then
5049Sis		 * we treat the current character as a sequence.
5049Sis		 *
5049Sis		 * In any other cases, we need to call collect_a_seq().
5049Sis		 */
5049Sis
5049Sis		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
5049Sis		    ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
5049Sis			if (is_it_toupper)
5049Sis				u8s1[0] = U8_ASCII_TOUPPER(*s1);
5049Sis			else if (is_it_tolower)
5049Sis				u8s1[0] = U8_ASCII_TOLOWER(*s1);
5049Sis			else
5049Sis				u8s1[0] = *s1;
5049Sis			u8s1[1] = '\0';
5049Sis			sz1 = 1;
5049Sis			s1++;
5049Sis		} else {
5049Sis			state = U8_STATE_START;
5049Sis			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
5049Sis			    is_it_toupper, is_it_tolower,
5049Sis			    canonical_decomposition,
5049Sis			    compatibility_decomposition,
*7012Sis			    canonical_composition, errnum, &state);
5049Sis		}
5049Sis
5049Sis		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
5049Sis		    ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
5049Sis			if (is_it_toupper)
5049Sis				u8s2[0] = U8_ASCII_TOUPPER(*s2);
5049Sis			else if (is_it_tolower)
5049Sis				u8s2[0] = U8_ASCII_TOLOWER(*s2);
5049Sis			else
5049Sis				u8s2[0] = *s2;
5049Sis			u8s2[1] = '\0';
5049Sis			sz2 = 1;
5049Sis			s2++;
5049Sis		} else {
5049Sis			state = U8_STATE_START;
5049Sis			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
5049Sis			    is_it_toupper, is_it_tolower,
5049Sis			    canonical_decomposition,
5049Sis			    compatibility_decomposition,
*7012Sis			    canonical_composition, errnum, &state);
5049Sis		}
5049Sis
5049Sis		/*
5049Sis		 * Now compare the two characters. If they are the same,
5049Sis		 * we move on to the next character sequences.
5049Sis		 */
5049Sis		if (sz1 == 1 && sz2 == 1) {
5049Sis			if (*u8s1 > *u8s2)
5049Sis				return (1);
5049Sis			if (*u8s1 < *u8s2)
5049Sis				return (-1);
5049Sis		} else {
5049Sis			result = strcmp((const char *)u8s1, (const char *)u8s2);
5049Sis			if (result != 0)
5049Sis				return (result);
5049Sis		}
5049Sis	}
5049Sis
5049Sis	/*
5049Sis	 * We compared until the end of either or both strings.
5049Sis	 *
5049Sis	 * If we reached to or went over the ends for the both, that means
5049Sis	 * they are the same.
5049Sis	 *
5049Sis	 * If we reached only one end, that means the other string has
5049Sis	 * something which then can be used to determine the return value.
5049Sis	 */
5049Sis	if (s1 >= s1last) {
5049Sis		if (s2 >= s2last)
5049Sis			return (0);
5049Sis		return (-1);
5049Sis	}
5049Sis	return (1);
5049Sis}
5049Sis
5049Sis/*
5049Sis * The u8_strcmp() function compares two UTF-8 strings quite similar to
5049Sis * the strcmp(). For the comparison, however, Unicode Normalization specific
5049Sis * equivalency and Unicode simple case conversion mappings based equivalency
5049Sis * can be requested and checked against.
5049Sis */
5049Sisint
5049Sisu8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
*7012Sis		int *errnum)
5049Sis{
5049Sis	int f;
5049Sis	size_t n1;
5049Sis	size_t n2;
5049Sis
*7012Sis	*errnum = 0;
5049Sis
5049Sis	/*
5049Sis	 * Check on the requested Unicode version, case conversion, and
5049Sis	 * normalization flag values.
5049Sis	 */
5049Sis
5049Sis	if (uv > U8_UNICODE_LATEST) {
*7012Sis		*errnum = ERANGE;
5049Sis		uv = U8_UNICODE_LATEST;
5049Sis	}
5049Sis
5049Sis	if (flag == 0) {
5049Sis		flag = U8_STRCMP_CS;
5049Sis	} else {
5049Sis		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
5049Sis		    U8_STRCMP_CI_LOWER);
5049Sis		if (f == 0) {
5049Sis			flag |= U8_STRCMP_CS;
5049Sis		} else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
5049Sis		    f != U8_STRCMP_CI_LOWER) {
*7012Sis			*errnum = EBADF;
5049Sis			flag = U8_STRCMP_CS;
5049Sis		}
5049Sis
5049Sis		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
5049Sis		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
5049Sis		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
*7012Sis			*errnum = EBADF;
5049Sis			flag = U8_STRCMP_CS;
5049Sis		}
5049Sis	}
5049Sis
5049Sis	if (flag == U8_STRCMP_CS) {
5049Sis		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
5049Sis	}
5049Sis
5049Sis	n1 = strlen(s1);
5049Sis	n2 = strlen(s2);
5049Sis	if (n != 0) {
5049Sis		if (n < n1)
5049Sis			n1 = n;
5049Sis		if (n < n2)
5049Sis			n2 = n;
5049Sis	}
5049Sis
5049Sis	/*
5049Sis	 * Simple case conversion can be done much faster and so we do
5049Sis	 * them separately here.
5049Sis	 */
5049Sis	if (flag == U8_STRCMP_CI_UPPER) {
5049Sis		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
*7012Sis		    n1, n2, B_TRUE, errnum));
5049Sis	} else if (flag == U8_STRCMP_CI_LOWER) {
5049Sis		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
*7012Sis		    n1, n2, B_FALSE, errnum));
5049Sis	}
5049Sis
5049Sis	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
*7012Sis	    flag, errnum));
5049Sis}
5049Sis
5049Sissize_t
5049Sisu8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
*7012Sis	int flag, size_t unicode_version, int *errnum)
5049Sis{
5049Sis	int f;
5049Sis	int sz;
5049Sis	uchar_t *ib;
5049Sis	uchar_t *ibtail;
5049Sis	uchar_t *ob;
5049Sis	uchar_t *obtail;
5049Sis	boolean_t do_not_ignore_null;
5049Sis	boolean_t do_not_ignore_invalid;
5049Sis	boolean_t is_it_toupper;
5049Sis	boolean_t is_it_tolower;
5049Sis	boolean_t canonical_decomposition;
5049Sis	boolean_t compatibility_decomposition;
5049Sis	boolean_t canonical_composition;
5049Sis	size_t ret_val;
5049Sis	size_t i;
5049Sis	size_t j;
5049Sis	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
5049Sis	u8_normalization_states_t state;
5049Sis
5049Sis	if (unicode_version > U8_UNICODE_LATEST) {
*7012Sis		*errnum = ERANGE;
5049Sis		return ((size_t)-1);
5049Sis	}
5049Sis
5049Sis	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
5049Sis	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
*7012Sis		*errnum = EBADF;
5049Sis		return ((size_t)-1);
5049Sis	}
5049Sis
5049Sis	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
5049Sis	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
5049Sis	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
*7012Sis		*errnum = EBADF;
5049Sis		return ((size_t)-1);
5049Sis	}
5049Sis
5049Sis	if (inarray == NULL || *inlen == 0)
5049Sis		return (0);
5049Sis
5049Sis	if (outarray == NULL) {
*7012Sis		*errnum = E2BIG;
5049Sis		return ((size_t)-1);
5049Sis	}
5049Sis
5049Sis	ib = (uchar_t *)inarray;
5049Sis	ob = (uchar_t *)outarray;
5049Sis	ibtail = ib + *inlen;
5049Sis	obtail = ob + *outlen;
5049Sis
5049Sis	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
5049Sis	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
5049Sis	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
5049Sis	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
5049Sis
5049Sis	ret_val = 0;
5049Sis
5049Sis	/*
5049Sis	 * If we don't have a normalization flag set, we do the simple case
5049Sis	 * conversion based text preparation separately below. Text
5049Sis	 * preparation involving Normalization will be done in the false task
5049Sis	 * block, again, separately since it will take much more time and
5049Sis	 * resource than doing simple case conversions.
5049Sis	 */
5049Sis	if (f == 0) {
5049Sis		while (ib < ibtail) {
5049Sis			if (*ib == '\0' && do_not_ignore_null)
5049Sis				break;
5049Sis
5049Sis			sz = u8_number_of_bytes[*ib];
5049Sis
5049Sis			if (sz < 0) {
5049Sis				if (do_not_ignore_invalid) {
*7012Sis					*errnum = EILSEQ;
5049Sis					ret_val = (size_t)-1;
5049Sis					break;
5049Sis				}
5049Sis
5049Sis				sz = 1;
5049Sis				ret_val++;
5049Sis			}
5049Sis
5049Sis			if (sz == 1) {
5049Sis				if (ob >= obtail) {
*7012Sis					*errnum = E2BIG;
5049Sis					ret_val = (size_t)-1;
5049Sis					break;
5049Sis				}
5049Sis
5049Sis				if (is_it_toupper)
5049Sis					*ob = U8_ASCII_TOUPPER(*ib);
5049Sis				else if (is_it_tolower)
5049Sis					*ob = U8_ASCII_TOLOWER(*ib);
5049Sis				else
5049Sis					*ob = *ib;
5049Sis				ib++;
5049Sis				ob++;
5049Sis			} else if ((ib + sz) > ibtail) {
5049Sis				if (do_not_ignore_invalid) {
*7012Sis					*errnum = EINVAL;
5049Sis					ret_val = (size_t)-1;
5049Sis					break;
5049Sis				}
5049Sis
5049Sis				if ((obtail - ob) < (ibtail - ib)) {
*7012Sis					*errnum = E2BIG;
5049Sis					ret_val = (size_t)-1;
5049Sis					break;
5049Sis				}
5049Sis
5049Sis				/*
5049Sis				 * We treat the remaining incomplete character
5049Sis				 * bytes as a character.
5049Sis				 */
5049Sis				ret_val++;
5049Sis
5049Sis				while (ib < ibtail)
5049Sis					*ob++ = *ib++;
5049Sis			} else {
5049Sis				if (is_it_toupper || is_it_tolower) {
5049Sis					i = do_case_conv(unicode_version, u8s,
5049Sis					    ib, sz, is_it_toupper);
5049Sis
5049Sis					if ((obtail - ob) < i) {
*7012Sis						*errnum = E2BIG;
5049Sis						ret_val = (size_t)-1;
5049Sis						break;
5049Sis					}
5049Sis
5049Sis					ib += sz;
5049Sis
5049Sis					for (sz = 0; sz < i; sz++)
5049Sis						*ob++ = u8s[sz];
5049Sis				} else {
5049Sis					if ((obtail - ob) < sz) {
*7012Sis						*errnum = E2BIG;
5049Sis						ret_val = (size_t)-1;
5049Sis						break;
5049Sis					}
5049Sis
5049Sis					for (i = 0; i < sz; i++)
5049Sis						*ob++ = *ib++;
5049Sis				}
5049Sis			}
5049Sis		}
5049Sis	} else {
5049Sis		canonical_decomposition = flag & U8_CANON_DECOMP;
5049Sis		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
5049Sis		canonical_composition = flag & U8_CANON_COMP;
5049Sis
5049Sis		while (ib < ibtail) {
5049Sis			if (*ib == '\0' && do_not_ignore_null)
5049Sis				break;
5049Sis
5049Sis			/*
5049Sis			 * If the current character is a 7-bit ASCII
5049Sis			 * character and it is the last character, or,
5049Sis			 * if the current character is a 7-bit ASCII
5049Sis			 * character and the next character is also a 7-bit
5049Sis			 * ASCII character, then, we copy over this
5049Sis			 * character without going through collect_a_seq().
5049Sis			 *
5049Sis			 * In any other cases, we need to look further with
5049Sis			 * the collect_a_seq() function.
5049Sis			 */
5049Sis			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
5049Sis			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
5049Sis				if (ob >= obtail) {
*7012Sis					*errnum = E2BIG;
5049Sis					ret_val = (size_t)-1;
5049Sis					break;
5049Sis				}
5049Sis
5049Sis				if (is_it_toupper)
5049Sis					*ob = U8_ASCII_TOUPPER(*ib);
5049Sis				else if (is_it_tolower)
5049Sis					*ob = U8_ASCII_TOLOWER(*ib);
5049Sis				else
5049Sis					*ob = *ib;
5049Sis				ib++;
5049Sis				ob++;
5049Sis			} else {
*7012Sis				*errnum = 0;
5049Sis				state = U8_STATE_START;
5049Sis
5049Sis				j = collect_a_seq(unicode_version, u8s,
5049Sis				    &ib, ibtail,
5049Sis				    is_it_toupper,
5049Sis				    is_it_tolower,
5049Sis				    canonical_decomposition,
5049Sis				    compatibility_decomposition,
5049Sis				    canonical_composition,
*7012Sis				    errnum, &state);
5049Sis
*7012Sis				if (*errnum && do_not_ignore_invalid) {
5049Sis					ret_val = (size_t)-1;
5049Sis					break;
5049Sis				}
5049Sis
5049Sis				if ((obtail - ob) < j) {
*7012Sis					*errnum = E2BIG;
5049Sis					ret_val = (size_t)-1;
5049Sis					break;
5049Sis				}
5049Sis
5049Sis				for (i = 0; i < j; i++)
5049Sis					*ob++ = u8s[i];
5049Sis			}
5049Sis		}
5049Sis	}
5049Sis
5049Sis	*inlen = ibtail - ib;
5049Sis	*outlen = obtail - ob;
5049Sis
5049Sis	return (ret_val);
5049Sis}