common/unicode/u8_textprep.c

185029Spjd/*
185029Spjd * CDDL HEADER START
185029Spjd *
185029Spjd * The contents of this file are subject to the terms of the
185029Spjd * Common Development and Distribution License (the "License").
185029Spjd * You may not use this file except in compliance with the License.
185029Spjd *
185029Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
185029Spjd * or http://www.opensolaris.org/os/licensing.
185029Spjd * See the License for the specific language governing permissions
185029Spjd * and limitations under the License.
185029Spjd *
185029Spjd * When distributing Covered Code, include this CDDL HEADER in each
185029Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
185029Spjd * If applicable, add the following below this CDDL HEADER, with the
185029Spjd * fields enclosed by brackets "[]" replaced with your own identifying
185029Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
185029Spjd *
185029Spjd * CDDL HEADER END
185029Spjd */
185029Spjd/*
185029Spjd * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
185029Spjd * Use is subject to license terms.
185029Spjd */
185029Spjd
185029Spjd
185029Spjd
185029Spjd/*
185029Spjd * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
185029Spjd *
185029Spjd * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
185029Spjd * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
185029Spjd * the section 3C man pages.
185029Spjd * Interface stability: Committed.
185029Spjd */
185029Spjd
185029Spjd#include <sys/types.h>
185029Spjd#ifdef	_KERNEL
185029Spjd#include <sys/param.h>
185029Spjd#include <sys/sysmacros.h>
185029Spjd#include <sys/systm.h>
185029Spjd#include <sys/debug.h>
185029Spjd#include <sys/kmem.h>
219089Spjd#include <sys/sunddi.h>
185029Spjd#else
185029Spjd#include <strings.h>
185029Spjd#endif	/* _KERNEL */
185029Spjd#include <sys/byteorder.h>
185029Spjd#include <sys/errno.h>
185029Spjd#include <sys/u8_textprep.h>
185029Spjd#include <sys/u8_textprep_data.h>
185029Spjd
185029Spjd
/* Upper bound on the byte length of a single UTF-8 character. */
#define	U8_MB_CUR_MAX			(4)

/*
 * Upper bound on the byte length of a UTF-8 character that lies within
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* Upper bound on the byte length of a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * The maximum number of characters in a combining/conjoining sequence,
 * followed by the upper limit that is actually applied to such a sequence.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/* Combining class value that identifies a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)
185029Spjd
/*
 * Hangul related constants and macros.
 *
 * The values below are the first and the last Unicode scalar values of the
 * Hangul syllables and of the Hangul Jamo leading consonants (L), vowels (V),
 * and optional trailing consonants (T).
 *
 * Note that U8_HANGUL_JAMO_T_FIRST is 0x11A7 and not the actual first
 * trailing consonant, U+11A8. The trailing consonant is optional, so the
 * value is pre-decremented by one; this is also why U8_HANGUL_JAMO_T() uses
 * a strict '>' for its lower bound.
 *
 * Each of the 19 modern leading consonants has 588 possible syllables in
 * total, since Hangul has 21 modern vowels and 27 modern trailing consonants
 * plus 1 for the no-trailing-consonant case, i.e., 21 x 28 = 588.
 *
 * U8_HANGUL_JAMO_1ST_BYTE can be used as a quick check for a Hangul Jamo,
 * but a match does not guarantee that the character is one; it only means
 * that it most likely is.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the scalar value b as a three-byte UTF-8 sequence at s[i], s[j],
 * and s[k]. The body is wrapped in do { } while (0) so that the macro acts
 * as a single statement, e.g., as the unbraced body of an if/else.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	do { \
		(s)[(i)] = (uchar_t)(0xE0U | \
		    ((uint32_t)(b) & 0xF000U) >> 12); \
		(s)[(j)] = (uchar_t)(0x80U | \
		    ((uint32_t)(b) & 0x0FC0U) >> 6); \
		(s)[(k)] = (uchar_t)(0x80U | \
		    ((uint32_t)(b) & 0x003FU)); \
	} while (0)

/* Range checks on a Unicode scalar value u. */
#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

/* True if, in normalization state s, the scalar value u can be composed. */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
185029Spjd
/* The types of decomposition mappings. */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/* The indicator for 16-bit table. */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)

/* The following are some convenience macros. */

/*
 * Assemble the payload bits of a three-byte UTF-8 sequence (b1, b2, b3)
 * into u. This is a plain assignment expression without a trailing
 * semicolon of its own, so it can be used wherever a statement is expected.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	((u) = ((((uint32_t)(b1) & 0x0F) << 12) | \
	    (((uint32_t)(b2) & 0x3F) << 6) | \
	    ((uint32_t)(b3) & 0x3F)))

/*
 * Exchange a and b using t as scratch space. Wrapped in do { } while (0)
 * so that the three assignments always stay together as one statement.
 */
#define	U8_SIMPLE_SWAP(a, b, t) \
	do { \
		(t) = (a); \
		(a) = (b); \
		(b) = (t); \
	} while (0)

/* ASCII-only case conversions; any other value is returned unchanged. */
#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

#define	U8_ISASCII(c)			(((uchar_t)(c)) < 0x80U)
/*
 * Swap two combining marks in place. The macro assumes that the two
 * characters to be swapped are adjacent to each other and that 'a' comes
 * before 'b'; if these assumptions are not met, the macro will fail.
 *
 * The macro is not self-contained: it uses k, tc, u8s, u8t, start, disp, and
 * comb_class from the scope in which it is expanded. The body is wrapped in
 * do { } while (0) so that it expands to exactly one statement.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	do { \
		/* Save the bytes of 'a'. */ \
		for (k = 0; k < disp[(a)]; k++) \
			u8t[k] = u8s[start[(a)] + k]; \
		/* Move the bytes of 'b' to where 'a' started. */ \
		for (k = 0; k < disp[(b)]; k++) \
			u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
		/* 'a' now follows the relocated 'b'. */ \
		start[(b)] = start[(a)] + disp[(b)]; \
		for (k = 0; k < disp[(a)]; k++) \
			u8s[start[(b)] + k] = u8t[k]; \
		U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
		U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); \
	} while (0)
185029Spjd
/*
 * The states tracked while normalizing. The enumerators are numbered
 * consecutively from zero.
 */
typedef enum {
	U8_STATE_START = 0,	/* no Hangul state is pending */
	U8_STATE_HANGUL_L,	/* 1: a leading consonant */
	U8_STATE_HANGUL_LV,	/* 2: a leading consonant and a vowel */
	U8_STATE_HANGUL_LVT,	/* 3: L, V, and a trailing consonant */
	U8_STATE_HANGUL_V,	/* 4: a vowel not following an L */
	U8_STATE_HANGUL_T,	/* 5: a trailing consonant not following LV */
	U8_STATE_COMBINING_MARK	/* 6 */
} u8_normalization_states_t;
185029Spjd
185029Spjd/*
185029Spjd * The three vectors at below are used to check bytes of a given UTF-8
185029Spjd * character are valid and not containing any malformed byte values.
185029Spjd *
185029Spjd * We used to have a quite relaxed UTF-8 binary representation but then there
185029Spjd * was some security related issues and so the Unicode Consortium defined
185029Spjd * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
185029Spjd * one more time at the Unicode 3.2. The following three tables are based on
185029Spjd * that.
185029Spjd */
185029Spjd
185029Spjd#define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)
185029Spjd
185029Spjd#define	I_				U8_ILLEGAL_CHAR
185029Spjd#define	O_				U8_OUT_OF_RANGE_CHAR
185029Spjd
185029Spjdconst int8_t u8_number_of_bytes[0x100] = {
185029Spjd	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
185029Spjd	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
185029Spjd	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
185029Spjd	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
185029Spjd	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
185029Spjd	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
185029Spjd	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
185029Spjd	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
185029Spjd
185029Spjd/*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
185029Spjd	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
185029Spjd
268014Spfg/*	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
185029Spjd	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
185029Spjd
268014Spfg/*	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
185029Spjd	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
185029Spjd
185029Spjd/*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
185029Spjd	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
185029Spjd
185029Spjd/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
185029Spjd	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
185029Spjd
185029Spjd/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
185029Spjd	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
185029Spjd
185029Spjd/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
185029Spjd	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
185029Spjd
185029Spjd/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
185029Spjd	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
185029Spjd};
185029Spjd
185029Spjd#undef	I_
185029Spjd#undef	O_
185029Spjd
/*
 * Indexed by the first byte of a character: the smallest value accepted for
 * the second byte. Bytes that do not start a multi-byte character map to 0.
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
/*	00 - BF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	C0 - C7 */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	C8 - CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D0 - D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D8 - DF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E0 - E7 */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E8 - EF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	F0 - F7 */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
/*	F8 - FF */
	0,    0,    0,    0,    0,    0,    0,    0,
};
185029Spjd
/*
 * Indexed by the first byte of a character: the largest value accepted for
 * the second byte. Bytes that do not start a multi-byte character map to 0.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
/*	00 - BF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*	C0 - C7 */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	C8 - CF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D0 - D7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D8 - DF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E0 - E7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E8 - EF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
/*	F0 - F7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
/*	F8 - FF */
	0,    0,    0,    0,    0,    0,    0,    0,
};
185029Spjd
185029Spjd
185029Spjd/*
185029Spjd * The u8_validate() validates on the given UTF-8 character string and
185029Spjd * calculate the byte length. It is quite similar to mblen(3C) except that
185029Spjd * this will validate against the list of characters if required and
185029Spjd * specific to UTF-8 and Unicode.
185029Spjd */
185029Spjdint
185029Spjdu8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
185029Spjd{
185029Spjd	uchar_t *ib;
185029Spjd	uchar_t *ibtail;
185029Spjd	uchar_t **p;
185029Spjd	uchar_t *s1;
185029Spjd	uchar_t *s2;
185029Spjd	uchar_t f;
185029Spjd	int sz;
185029Spjd	size_t i;
185029Spjd	int ret_val;
185029Spjd	boolean_t second;
185029Spjd	boolean_t no_need_to_validate_entire;
185029Spjd	boolean_t check_additional;
185029Spjd	boolean_t validate_ucs2_range_only;
185029Spjd
185029Spjd	if (! u8str)
185029Spjd		return (0);
185029Spjd
185029Spjd	ib = (uchar_t *)u8str;
185029Spjd	ibtail = ib + n;
185029Spjd
185029Spjd	ret_val = 0;
185029Spjd
185029Spjd	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
185029Spjd	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
185029Spjd	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
185029Spjd
185029Spjd	while (ib < ibtail) {
185029Spjd		/*
185029Spjd		 * The first byte of a UTF-8 character tells how many
185029Spjd		 * bytes will follow for the character. If the first byte
185029Spjd		 * is an illegal byte value or out of range value, we just
185029Spjd		 * return -1 with an appropriate error number.
185029Spjd		 */
185029Spjd		sz = u8_number_of_bytes[*ib];
185029Spjd		if (sz == U8_ILLEGAL_CHAR) {
185029Spjd			*errnum = EILSEQ;
185029Spjd			return (-1);
185029Spjd		}
185029Spjd
185029Spjd		if (sz == U8_OUT_OF_RANGE_CHAR ||
185029Spjd		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
185029Spjd			*errnum = ERANGE;
185029Spjd			return (-1);
185029Spjd		}
185029Spjd
185029Spjd		/*
185029Spjd		 * If we don't have enough bytes to check on, that's also
185029Spjd		 * an error. As you can see, we give illegal byte sequence
185029Spjd		 * checking higher priority then EINVAL cases.
185029Spjd		 */
185029Spjd		if ((ibtail - ib) < sz) {
185029Spjd			*errnum = EINVAL;
185029Spjd			return (-1);
185029Spjd		}
185029Spjd
185029Spjd		if (sz == 1) {
185029Spjd			ib++;
185029Spjd			ret_val++;
185029Spjd		} else {
185029Spjd			/*
185029Spjd			 * Check on the multi-byte UTF-8 character. For more
185029Spjd			 * details on this, see comment added for the used
185029Spjd			 * data structures at the beginning of the file.
185029Spjd			 */
185029Spjd			f = *ib++;
185029Spjd			ret_val++;
185029Spjd			second = B_TRUE;
185029Spjd			for (i = 1; i < sz; i++) {
185029Spjd				if (second) {
185029Spjd					if (*ib < u8_valid_min_2nd_byte[f] ||
185029Spjd					    *ib > u8_valid_max_2nd_byte[f]) {
185029Spjd						*errnum = EILSEQ;
185029Spjd						return (-1);
185029Spjd					}
185029Spjd					second = B_FALSE;
185029Spjd				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
185029Spjd					*errnum = EILSEQ;
185029Spjd					return (-1);
185029Spjd				}
185029Spjd				ib++;
185029Spjd				ret_val++;
185029Spjd			}
185029Spjd		}
185029Spjd
185029Spjd		if (check_additional) {
185029Spjd			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
185029Spjd				s1 = ib - sz;
185029Spjd				s2 = p[i];
185029Spjd				while (s1 < ib) {
185029Spjd					if (*s1 != *s2 || *s2 == '\0')
185029Spjd						break;
185029Spjd					s1++;
185029Spjd					s2++;
185029Spjd				}
185029Spjd
185029Spjd				if (s1 >= ib && *s2 == '\0') {
185029Spjd					*errnum = EBADF;
185029Spjd					return (-1);
185029Spjd				}
185029Spjd			}
185029Spjd		}
185029Spjd
185029Spjd		if (no_need_to_validate_entire)
185029Spjd			break;
185029Spjd	}
185029Spjd
185029Spjd	return (ret_val);
185029Spjd}
185029Spjd
185029Spjd/*
185029Spjd * The do_case_conv() looks at the mapping tables and returns found
185029Spjd * bytes if any. If not found, the input bytes are returned. The function
185029Spjd * always terminate the return bytes with a null character assuming that
185029Spjd * there are plenty of room to do so.
185029Spjd *
185029Spjd * The case conversions are simple case conversions mapping a character to
185029Spjd * another character as specified in the Unicode data. The byte size of
185029Spjd * the mapped character could be different from that of the input character.
185029Spjd *
185029Spjd * The return value is the byte length of the returned character excluding
185029Spjd * the terminating null byte.
185029Spjd */
185029Spjdstatic size_t
185029Spjddo_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
185029Spjd{
185029Spjd	size_t i;
185029Spjd	uint16_t b1 = 0;
185029Spjd	uint16_t b2 = 0;
185029Spjd	uint16_t b3 = 0;
185029Spjd	uint16_t b3_tbl;
185029Spjd	uint16_t b3_base;
185029Spjd	uint16_t b4 = 0;
185029Spjd	size_t start_id;
185029Spjd	size_t end_id;
185029Spjd
185029Spjd	/*
185029Spjd	 * At this point, the only possible values for sz are 2, 3, and 4.
185029Spjd	 * The u8s should point to a vector that is well beyond the size of
185029Spjd	 * 5 bytes.
185029Spjd	 */
185029Spjd	if (sz == 2) {
185029Spjd		b3 = u8s[0] = s[0];
185029Spjd		b4 = u8s[1] = s[1];
185029Spjd	} else if (sz == 3) {
185029Spjd		b2 = u8s[0] = s[0];
185029Spjd		b3 = u8s[1] = s[1];
185029Spjd		b4 = u8s[2] = s[2];
185029Spjd	} else if (sz == 4) {
185029Spjd		b1 = u8s[0] = s[0];
185029Spjd		b2 = u8s[1] = s[1];
185029Spjd		b3 = u8s[2] = s[2];
185029Spjd		b4 = u8s[3] = s[3];
185029Spjd	} else {
185029Spjd		/* This is not possible but just in case as a fallback. */
185029Spjd		if (is_it_toupper)
185029Spjd			*u8s = U8_ASCII_TOUPPER(*s);
185029Spjd		else
185029Spjd			*u8s = U8_ASCII_TOLOWER(*s);
185029Spjd		u8s[1] = '\0';
185029Spjd
185029Spjd		return (1);
185029Spjd	}
185029Spjd	u8s[sz] = '\0';
185029Spjd
185029Spjd	/*
185029Spjd	 * Let's find out if we have a corresponding character.
185029Spjd	 */
185029Spjd	b1 = u8_common_b1_tbl[uv][b1];
185029Spjd	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
185029Spjd		return ((size_t)sz);
185029Spjd
185029Spjd	b2 = u8_case_common_b2_tbl[uv][b1][b2];
185029Spjd	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
185029Spjd		return ((size_t)sz);
185029Spjd
185029Spjd	if (is_it_toupper) {
185029Spjd		b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
185029Spjd		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
185029Spjd			return ((size_t)sz);
185029Spjd
185029Spjd		start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
185029Spjd		end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
185029Spjd
185029Spjd		/* Either there is no match or an error at the table. */
185029Spjd		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
185029Spjd			return ((size_t)sz);
185029Spjd
185029Spjd		b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
185029Spjd
185029Spjd		for (i = 0; start_id < end_id; start_id++)
185029Spjd			u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
185029Spjd	} else {
185029Spjd		b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
185029Spjd		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
185029Spjd			return ((size_t)sz);
185029Spjd
185029Spjd		start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
185029Spjd		end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
185029Spjd
185029Spjd		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
185029Spjd			return ((size_t)sz);
185029Spjd
185029Spjd		b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
185029Spjd
185029Spjd		for (i = 0; start_id < end_id; start_id++)
185029Spjd			u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
185029Spjd	}
185029Spjd
185029Spjd	/*
185029Spjd	 * If i is still zero, that means there is no corresponding character.
185029Spjd	 */
185029Spjd	if (i == 0)
185029Spjd		return ((size_t)sz);
185029Spjd
185029Spjd	u8s[i] = '\0';
185029Spjd
185029Spjd	return (i);
185029Spjd}
185029Spjd
185029Spjd/*
185029Spjd * The do_case_compare() function compares the two input strings, s1 and s2,
185029Spjd * one character at a time doing case conversions if applicable and return
185029Spjd * the comparison result as like strcmp().
185029Spjd *
185029Spjd * Since, in empirical sense, most of text data are 7-bit ASCII characters,
185029Spjd * we treat the 7-bit ASCII characters as a special case trying to yield
185029Spjd * faster processing time.
185029Spjd */
185029Spjdstatic int
185029Spjddo_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
185029Spjd	size_t n2, boolean_t is_it_toupper, int *errnum)
185029Spjd{
185029Spjd	int f;
185029Spjd	int sz1;
185029Spjd	int sz2;
185029Spjd	size_t j;
185029Spjd	size_t i1;
185029Spjd	size_t i2;
185029Spjd	uchar_t u8s1[U8_MB_CUR_MAX + 1];
185029Spjd	uchar_t u8s2[U8_MB_CUR_MAX + 1];
185029Spjd
185029Spjd	i1 = i2 = 0;
185029Spjd	while (i1 < n1 && i2 < n2) {
185029Spjd		/*
185029Spjd		 * Find out what would be the byte length for this UTF-8
185029Spjd		 * character at string s1 and also find out if this is
185029Spjd		 * an illegal start byte or not and if so, issue a proper
185029Spjd		 * error number and yet treat this byte as a character.
185029Spjd		 */
185029Spjd		sz1 = u8_number_of_bytes[*s1];
185029Spjd		if (sz1 < 0) {
185029Spjd			*errnum = EILSEQ;
185029Spjd			sz1 = 1;
185029Spjd		}
185029Spjd
185029Spjd		/*
185029Spjd		 * For 7-bit ASCII characters mainly, we do a quick case
185029Spjd		 * conversion right at here.
185029Spjd		 *
185029Spjd		 * If we don't have enough bytes for this character, issue
185029Spjd		 * an EINVAL error and use what are available.
185029Spjd		 *
185029Spjd		 * If we have enough bytes, find out if there is
185029Spjd		 * a corresponding uppercase character and if so, copy over
185029Spjd		 * the bytes for a comparison later. If there is no
185029Spjd		 * corresponding uppercase character, then, use what we have
185029Spjd		 * for the comparison.
185029Spjd		 */
185029Spjd		if (sz1 == 1) {
185029Spjd			if (is_it_toupper)
185029Spjd				u8s1[0] = U8_ASCII_TOUPPER(*s1);
185029Spjd			else
185029Spjd				u8s1[0] = U8_ASCII_TOLOWER(*s1);
185029Spjd			s1++;
185029Spjd			u8s1[1] = '\0';
185029Spjd		} else if ((i1 + sz1) > n1) {
185029Spjd			*errnum = EINVAL;
185029Spjd			for (j = 0; (i1 + j) < n1; )
185029Spjd				u8s1[j++] = *s1++;
185029Spjd			u8s1[j] = '\0';
185029Spjd		} else {
185029Spjd			(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
185029Spjd			s1 += sz1;
185029Spjd		}
185029Spjd
185029Spjd		/* Do the same for the string s2. */
185029Spjd		sz2 = u8_number_of_bytes[*s2];
185029Spjd		if (sz2 < 0) {
185029Spjd			*errnum = EILSEQ;
185029Spjd			sz2 = 1;
185029Spjd		}
185029Spjd
185029Spjd		if (sz2 == 1) {
185029Spjd			if (is_it_toupper)
185029Spjd				u8s2[0] = U8_ASCII_TOUPPER(*s2);
185029Spjd			else
185029Spjd				u8s2[0] = U8_ASCII_TOLOWER(*s2);
185029Spjd			s2++;
185029Spjd			u8s2[1] = '\0';
185029Spjd		} else if ((i2 + sz2) > n2) {
185029Spjd			*errnum = EINVAL;
185029Spjd			for (j = 0; (i2 + j) < n2; )
185029Spjd				u8s2[j++] = *s2++;
185029Spjd			u8s2[j] = '\0';
185029Spjd		} else {
185029Spjd			(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
185029Spjd			s2 += sz2;
185029Spjd		}
185029Spjd
185029Spjd		/* Now compare the two characters. */
185029Spjd		if (sz1 == 1 && sz2 == 1) {
185029Spjd			if (*u8s1 > *u8s2)
185029Spjd				return (1);
185029Spjd			if (*u8s1 < *u8s2)
185029Spjd				return (-1);
185029Spjd		} else {
185029Spjd			f = strcmp((const char *)u8s1, (const char *)u8s2);
185029Spjd			if (f != 0)
185029Spjd				return (f);
185029Spjd		}
185029Spjd
185029Spjd		/*
185029Spjd		 * They were the same. Let's move on to the next
185029Spjd		 * characters then.
185029Spjd		 */
185029Spjd		i1 += sz1;
185029Spjd		i2 += sz2;
185029Spjd	}
185029Spjd
185029Spjd	/*
185029Spjd	 * We compared until the end of either or both strings.
185029Spjd	 *
185029Spjd	 * If we reached to or went over the ends for the both, that means
185029Spjd	 * they are the same.
185029Spjd	 *
185029Spjd	 * If we reached only one of the two ends, that means the other string
185029Spjd	 * has something which then the fact can be used to determine
185029Spjd	 * the return value.
185029Spjd	 */
185029Spjd	if (i1 >= n1) {
185029Spjd		if (i2 >= n2)
185029Spjd			return (0);
185029Spjd		return (-1);
185029Spjd	}
185029Spjd	return (1);
185029Spjd}
185029Spjd
185029Spjd/*
185029Spjd * The combining_class() function checks on the given bytes and find out
185029Spjd * the corresponding Unicode combining class value. The return value 0 means
185029Spjd * it is a Starter. Any illegal UTF-8 character will also be treated as
185029Spjd * a Starter.
185029Spjd */
185029Spjdstatic uchar_t
185029Spjdcombining_class(size_t uv, uchar_t *s, size_t sz)
185029Spjd{
185029Spjd	uint16_t b1 = 0;
185029Spjd	uint16_t b2 = 0;
185029Spjd	uint16_t b3 = 0;
185029Spjd	uint16_t b4 = 0;
185029Spjd
185029Spjd	if (sz == 1 || sz > 4)
185029Spjd		return (0);
185029Spjd
185029Spjd	if (sz == 2) {
185029Spjd		b3 = s[0];
185029Spjd		b4 = s[1];
185029Spjd	} else if (sz == 3) {
185029Spjd		b2 = s[0];
185029Spjd		b3 = s[1];
185029Spjd		b4 = s[2];
185029Spjd	} else if (sz == 4) {
185029Spjd		b1 = s[0];
185029Spjd		b2 = s[1];
185029Spjd		b3 = s[2];
185029Spjd		b4 = s[3];
185029Spjd	}
185029Spjd
185029Spjd	b1 = u8_common_b1_tbl[uv][b1];
185029Spjd	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
185029Spjd		return (0);
185029Spjd
185029Spjd	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
185029Spjd	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
185029Spjd		return (0);
185029Spjd
185029Spjd	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
185029Spjd	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
185029Spjd		return (0);
185029Spjd
185029Spjd	return (u8_combining_class_b4_tbl[uv][b3][b4]);
185029Spjd}
185029Spjd
/*
 * The do_decomp() function finds a matching decomposition, if any, and
 * returns it. If there is no match, the input bytes are copied and returned.
 * The function also checks whether the character is a Hangul, decomposes it
 * if necessary, and returns the result.
 *
 * To save time, a single byte 7-bit ASCII character should be handled by
 * the caller.
 *
 * The function returns the number of bytes returned, not counting the
 * terminating null byte that it always appends. It also returns a state
 * that tells whether a Hangul character was decomposed, which is then used
 * by the caller.
 */
185029Spjdstatic size_t
185029Spjddo_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
185029Spjd	boolean_t canonical_decomposition, u8_normalization_states_t *state)
185029Spjd{
185029Spjd	uint16_t b1 = 0;
185029Spjd	uint16_t b2 = 0;
185029Spjd	uint16_t b3 = 0;
185029Spjd	uint16_t b3_tbl;
185029Spjd	uint16_t b3_base;
185029Spjd	uint16_t b4 = 0;
185029Spjd	size_t start_id;
185029Spjd	size_t end_id;
185029Spjd	size_t i;
185029Spjd	uint32_t u1;
185029Spjd
185029Spjd	if (sz == 2) {
185029Spjd		b3 = u8s[0] = s[0];
185029Spjd		b4 = u8s[1] = s[1];
185029Spjd		u8s[2] = '\0';
185029Spjd	} else if (sz == 3) {
185029Spjd		/* Convert it to a Unicode scalar value. */
185029Spjd		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
185029Spjd
185029Spjd		/*
185029Spjd		 * If this is a Hangul syllable, we decompose it into
185029Spjd		 * a leading consonant, a vowel, and an optional trailing
185029Spjd		 * consonant and then return.
185029Spjd		 */
185029Spjd		if (U8_HANGUL_SYLLABLE(u1)) {
185029Spjd			u1 -= U8_HANGUL_SYL_FIRST;
185029Spjd
185029Spjd			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
185029Spjd			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
185029Spjd			    / U8_HANGUL_T_COUNT;
185029Spjd			b3 = u1 % U8_HANGUL_T_COUNT;
185029Spjd
185029Spjd			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
185029Spjd			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
185029Spjd			if (b3) {
185029Spjd				b3 += U8_HANGUL_JAMO_T_FIRST;
185029Spjd				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
185029Spjd
185029Spjd				u8s[9] = '\0';
185029Spjd				*state = U8_STATE_HANGUL_LVT;
185029Spjd				return (9);
185029Spjd			}
185029Spjd
185029Spjd			u8s[6] = '\0';
185029Spjd			*state = U8_STATE_HANGUL_LV;
185029Spjd			return (6);
185029Spjd		}
185029Spjd
185029Spjd		b2 = u8s[0] = s[0];
185029Spjd		b3 = u8s[1] = s[1];
185029Spjd		b4 = u8s[2] = s[2];
185029Spjd		u8s[3] = '\0';
185029Spjd
185029Spjd		/*
185029Spjd		 * If this is a Hangul Jamo, we know there is nothing
185029Spjd		 * further that we can decompose.
185029Spjd		 */
185029Spjd		if (U8_HANGUL_JAMO_L(u1)) {
185029Spjd			*state = U8_STATE_HANGUL_L;
185029Spjd			return (3);
185029Spjd		}
185029Spjd
185029Spjd		if (U8_HANGUL_JAMO_V(u1)) {
185029Spjd			if (*state == U8_STATE_HANGUL_L)
185029Spjd				*state = U8_STATE_HANGUL_LV;
185029Spjd			else
185029Spjd				*state = U8_STATE_HANGUL_V;
185029Spjd			return (3);
185029Spjd		}
185029Spjd
185029Spjd		if (U8_HANGUL_JAMO_T(u1)) {
185029Spjd			if (*state == U8_STATE_HANGUL_LV)
185029Spjd				*state = U8_STATE_HANGUL_LVT;
185029Spjd			else
185029Spjd				*state = U8_STATE_HANGUL_T;
185029Spjd			return (3);
185029Spjd		}
185029Spjd	} else if (sz == 4) {
185029Spjd		b1 = u8s[0] = s[0];
185029Spjd		b2 = u8s[1] = s[1];
185029Spjd		b3 = u8s[2] = s[2];
185029Spjd		b4 = u8s[3] = s[3];
185029Spjd		u8s[4] = '\0';
185029Spjd	} else {
185029Spjd		/*
185029Spjd		 * This is a fallback and should not happen if the function
185029Spjd		 * was called properly.
185029Spjd		 */
185029Spjd		u8s[0] = s[0];
185029Spjd		u8s[1] = '\0';
185029Spjd		*state = U8_STATE_START;
185029Spjd		return (1);
185029Spjd	}
185029Spjd
185029Spjd	/*
185029Spjd	 * At this point, this routine does not know what it would get.
185029Spjd	 * The caller should sort it out if the state isn't a Hangul one.
185029Spjd	 */
185029Spjd	*state = U8_STATE_START;
185029Spjd
185029Spjd	/* Try to find matching decomposition mapping byte sequence. */
185029Spjd	b1 = u8_common_b1_tbl[uv][b1];
185029Spjd	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
185029Spjd		return ((size_t)sz);
185029Spjd
185029Spjd	b2 = u8_decomp_b2_tbl[uv][b1][b2];
185029Spjd	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
185029Spjd		return ((size_t)sz);
185029Spjd
185029Spjd	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
185029Spjd	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
185029Spjd		return ((size_t)sz);
185029Spjd
185029Spjd	/*
185029Spjd	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
185029Spjd	 * which is 0x8000, this means we couldn't fit the mappings into
185029Spjd	 * the cardinality of an unsigned byte.
185029Spjd	 */
185029Spjd	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
185029Spjd		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
185029Spjd		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
185029Spjd		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
185029Spjd	} else {
185029Spjd		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
185029Spjd		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
185029Spjd	}
185029Spjd
185029Spjd	/* This also means there wasn't any matching decomposition. */
185029Spjd	if (start_id >= end_id)
185029Spjd		return ((size_t)sz);
185029Spjd
185029Spjd	/*
185029Spjd	 * The final table for decomposition mappings has three types of
185029Spjd	 * byte sequences depending on whether a mapping is for compatibility
185029Spjd	 * decomposition, canonical decomposition, or both like the following:
185029Spjd	 *
185029Spjd	 * (1) Compatibility decomposition mappings:
185029Spjd	 *
185029Spjd	 *	+---+---+-...-+---+
185029Spjd	 *	| B0| B1| ... | Bm|
185029Spjd	 *	+---+---+-...-+---+
185029Spjd	 *
185029Spjd	 *	The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
185029Spjd	 *
185029Spjd	 * (2) Canonical decomposition mappings:
185029Spjd	 *
185029Spjd	 *	+---+---+---+-...-+---+
185029Spjd	 *	| T | b0| b1| ... | bn|
185029Spjd	 *	+---+---+---+-...-+---+
185029Spjd	 *
185029Spjd	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
185029Spjd	 *
185029Spjd	 * (3) Both mappings:
185029Spjd	 *
185029Spjd	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
185029Spjd	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
185029Spjd	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
185029Spjd	 *
185029Spjd	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
185029Spjd	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
185029Spjd	 *	compatibility mapping bytes.
185029Spjd	 *
185029Spjd	 * Note that compatibility decomposition means doing recursive
185029Spjd	 * decompositions using both compatibility decomposition mappings and
185029Spjd	 * canonical decomposition mappings. On the other hand, canonical
185029Spjd	 * decomposition means doing recursive decompositions using only
185029Spjd	 * canonical decomposition mappings. Since the table we have has gone
185029Spjd	 * through the recursions already, we do not need to do so during
185029Spjd	 * runtime, i.e., the table has been completely flattened out
185029Spjd	 * already.
185029Spjd	 */
185029Spjd
185029Spjd	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
185029Spjd
185029Spjd	/* Get the type, T, of the byte sequence. */
185029Spjd	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
185029Spjd
185029Spjd	/*
185029Spjd	 * If necessary, adjust start_id, end_id, or both. Note that if
185029Spjd	 * this is compatibility decomposition mapping, there is no
185029Spjd	 * adjustment.
185029Spjd	 */
185029Spjd	if (canonical_decomposition) {
185029Spjd		/* Is the mapping only for compatibility decomposition? */
185029Spjd		if (b1 < U8_DECOMP_BOTH)
185029Spjd			return ((size_t)sz);
185029Spjd
185029Spjd		start_id++;
185029Spjd
185029Spjd		if (b1 == U8_DECOMP_BOTH) {
185029Spjd			end_id = start_id +
185029Spjd			    u8_decomp_final_tbl[uv][b3_base + start_id];
185029Spjd			start_id++;
185029Spjd		}
185029Spjd	} else {
185029Spjd		/*
185029Spjd		 * Unless this is a compatibility decomposition mapping,
185029Spjd		 * we adjust the start_id.
185029Spjd		 */
185029Spjd		if (b1 == U8_DECOMP_BOTH) {
185029Spjd			start_id++;
185029Spjd			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
185029Spjd		} else if (b1 == U8_DECOMP_CANONICAL) {
185029Spjd			start_id++;
185029Spjd		}
185029Spjd	}
185029Spjd
185029Spjd	for (i = 0; start_id < end_id; start_id++)
185029Spjd		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
185029Spjd	u8s[i] = '\0';
185029Spjd
185029Spjd	return (i);
185029Spjd}
185029Spjd
185029Spjd/*
185029Spjd * The find_composition_start() function uses the character bytes given and
185029Spjd * find out the matching composition mappings if any and return the address
185029Spjd * to the composition mappings as explained in the do_composition().
185029Spjd */
185029Spjdstatic uchar_t *
185029Spjdfind_composition_start(size_t uv, uchar_t *s, size_t sz)
185029Spjd{
185029Spjd	uint16_t b1 = 0;
185029Spjd	uint16_t b2 = 0;
185029Spjd	uint16_t b3 = 0;
185029Spjd	uint16_t b3_tbl;
185029Spjd	uint16_t b3_base;
185029Spjd	uint16_t b4 = 0;
185029Spjd	size_t start_id;
185029Spjd	size_t end_id;
185029Spjd
185029Spjd	if (sz == 1) {
185029Spjd		b4 = s[0];
185029Spjd	} else if (sz == 2) {
185029Spjd		b3 = s[0];
185029Spjd		b4 = s[1];
185029Spjd	} else if (sz == 3) {
185029Spjd		b2 = s[0];
185029Spjd		b3 = s[1];
185029Spjd		b4 = s[2];
185029Spjd	} else if (sz == 4) {
185029Spjd		b1 = s[0];
185029Spjd		b2 = s[1];
185029Spjd		b3 = s[2];
185029Spjd		b4 = s[3];
185029Spjd	} else {
185029Spjd		/*
185029Spjd		 * This is a fallback and should not happen if the function
185029Spjd		 * was called properly.
185029Spjd		 */
185029Spjd		return (NULL);
185029Spjd	}
185029Spjd
185029Spjd	b1 = u8_composition_b1_tbl[uv][b1];
185029Spjd	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
185029Spjd		return (NULL);
185029Spjd
185029Spjd	b2 = u8_composition_b2_tbl[uv][b1][b2];
185029Spjd	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
185029Spjd		return (NULL);
185029Spjd
185029Spjd	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
185029Spjd	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
185029Spjd		return (NULL);
185029Spjd
185029Spjd	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
185029Spjd		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
185029Spjd		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
185029Spjd		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
185029Spjd	} else {
185029Spjd		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
185029Spjd		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
185029Spjd	}
185029Spjd
185029Spjd	if (start_id >= end_id)
185029Spjd		return (NULL);
185029Spjd
185029Spjd	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
185029Spjd
185029Spjd	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
185029Spjd}
185029Spjd
185029Spjd/*
185029Spjd * The blocked() function checks on the combining class values of previous
185029Spjd * characters in this sequence and return whether it is blocked or not.
185029Spjd */
185029Spjdstatic boolean_t
185029Spjdblocked(uchar_t *comb_class, size_t last)
185029Spjd{
185029Spjd	uchar_t my_comb_class;
185029Spjd	size_t i;
185029Spjd
185029Spjd	my_comb_class = comb_class[last];
185029Spjd	for (i = 1; i < last; i++)
185029Spjd		if (comb_class[i] >= my_comb_class ||
185029Spjd		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
185029Spjd			return (B_TRUE);
185029Spjd
185029Spjd	return (B_FALSE);
185029Spjd}
185029Spjd
185029Spjd/*
185029Spjd * The do_composition() reads the character string pointed by 's' and
185029Spjd * do necessary canonical composition and then copy over the result back to
185029Spjd * the 's'.
185029Spjd *
185029Spjd * The input argument 's' cannot contain more than 32 characters.
185029Spjd */
185029Spjdstatic size_t
185029Spjddo_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
185029Spjd	uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
185029Spjd{
185029Spjd	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
185029Spjd	uchar_t tc[U8_MB_CUR_MAX];
185029Spjd	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
185029Spjd	size_t saved_marks_count;
185029Spjd	uchar_t *p;
185029Spjd	uchar_t *saved_p;
185029Spjd	uchar_t *q;
185029Spjd	size_t i;
185029Spjd	size_t saved_i;
185029Spjd	size_t j;
185029Spjd	size_t k;
185029Spjd	size_t l;
185029Spjd	size_t C;
185029Spjd	size_t saved_l;
185029Spjd	size_t size;
185029Spjd	uint32_t u1;
185029Spjd	uint32_t u2;
185029Spjd	boolean_t match_not_found = B_TRUE;
185029Spjd
185029Spjd	/*
185029Spjd	 * This should never happen unless the callers are doing some strange
185029Spjd	 * and unexpected things.
185029Spjd	 *
185029Spjd	 * The "last" is the index pointing to the last character not last + 1.
185029Spjd	 */
185029Spjd	if (last >= U8_MAX_CHARS_A_SEQ)
185029Spjd		last = U8_UPPER_LIMIT_IN_A_SEQ;
185029Spjd
185029Spjd	for (i = l = 0; i <= last; i++) {
185029Spjd		/*
185029Spjd		 * The last or any non-Starters at the beginning, we don't
185029Spjd		 * have any chance to do composition and so we just copy them
185029Spjd		 * to the temporary buffer.
185029Spjd		 */
185029Spjd		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
185029SpjdSAVE_THE_CHAR:
185029Spjd			p = s + start[i];
185029Spjd			size = disp[i];
185029Spjd			for (k = 0; k < size; k++)
185029Spjd				t[l++] = *p++;
185029Spjd			continue;
185029Spjd		}
185029Spjd
185029Spjd		/*
185029Spjd		 * If this could be a start of Hangul Jamos, then, we try to
185029Spjd		 * conjoin them.
185029Spjd		 */
185029Spjd		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
185029Spjd			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
185029Spjd			    s[start[i] + 1], s[start[i] + 2]);
185029Spjd			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
185029Spjd			    s[start[i] + 4], s[start[i] + 5]);
185029Spjd
185029Spjd			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
185029Spjd				u1 -= U8_HANGUL_JAMO_L_FIRST;
185029Spjd				u2 -= U8_HANGUL_JAMO_V_FIRST;
185029Spjd				u1 = U8_HANGUL_SYL_FIRST +
185029Spjd				    (u1 * U8_HANGUL_V_COUNT + u2) *
185029Spjd				    U8_HANGUL_T_COUNT;
185029Spjd
185029Spjd				i += 2;
185029Spjd				if (i <= last) {
185029Spjd					U8_PUT_3BYTES_INTO_UTF32(u2,
185029Spjd					    s[start[i]], s[start[i] + 1],
185029Spjd					    s[start[i] + 2]);
185029Spjd
185029Spjd					if (U8_HANGUL_JAMO_T(u2)) {
185029Spjd						u1 += u2 -
185029Spjd						    U8_HANGUL_JAMO_T_FIRST;
185029Spjd						i++;
185029Spjd					}
185029Spjd				}
185029Spjd
185029Spjd				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
185029Spjd				i--;
185029Spjd				l += 3;
185029Spjd				continue;
185029Spjd			}
185029Spjd		}
185029Spjd
185029Spjd		/*
185029Spjd		 * Let's then find out if this Starter has composition
185029Spjd		 * mapping.
185029Spjd		 */
185029Spjd		p = find_composition_start(uv, s + start[i], disp[i]);
185029Spjd		if (p == NULL)
185029Spjd			goto SAVE_THE_CHAR;
185029Spjd
185029Spjd		/*
185029Spjd		 * We have a Starter with composition mapping and the next
185029Spjd		 * character is a non-Starter. Let's try to find out if
185029Spjd		 * we can do composition.
185029Spjd		 */
185029Spjd
185029Spjd		saved_p = p;
185029Spjd		saved_i = i;
185029Spjd		saved_l = l;
185029Spjd		saved_marks_count = 0;
185029Spjd
185029SpjdTRY_THE_NEXT_MARK:
185029Spjd		q = s + start[++i];
185029Spjd		size = disp[i];
185029Spjd
185029Spjd		/*
185029Spjd		 * The next for() loop compares the non-Starter pointed by
185029Spjd		 * 'q' with the possible (joinable) characters pointed by 'p'.
185029Spjd		 *
185029Spjd		 * The composition final table entry pointed by the 'p'
185029Spjd		 * looks like the following:
185029Spjd		 *
185029Spjd		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
185029Spjd		 * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
185029Spjd		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
185029Spjd		 *
185029Spjd		 * where C is the count byte indicating the number of
185029Spjd		 * mapping pairs where each pair would be look like
185029Spjd		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
185029Spjd		 * character of a canonical decomposition and the B0-Bm are
185029Spjd		 * the bytes of a matching composite character. The F is
185029Spjd		 * a filler byte after each character as the separator.
185029Spjd		 */
185029Spjd
185029Spjd		match_not_found = B_TRUE;
185029Spjd
185029Spjd		for (C = *p++; C > 0; C--) {
185029Spjd			for (k = 0; k < size; p++, k++)
185029Spjd				if (*p != q[k])
185029Spjd					break;
185029Spjd
185029Spjd			/* Have we found it? */
185029Spjd			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
185029Spjd				match_not_found = B_FALSE;
185029Spjd
185029Spjd				l = saved_l;
185029Spjd
185029Spjd				while (*++p != U8_TBL_ELEMENT_FILLER)
185029Spjd					t[l++] = *p;
185029Spjd
185029Spjd				break;
185029Spjd			}
185029Spjd
185029Spjd			/* We didn't find; skip to the next pair. */
185029Spjd			if (*p != U8_TBL_ELEMENT_FILLER)
185029Spjd				while (*++p != U8_TBL_ELEMENT_FILLER)
185029Spjd					;
185029Spjd			while (*++p != U8_TBL_ELEMENT_FILLER)
185029Spjd				;
185029Spjd			p++;
185029Spjd		}
185029Spjd
185029Spjd		/*
185029Spjd		 * If there was no match, we will need to save the combining
185029Spjd		 * mark for later appending. After that, if the next one
185029Spjd		 * is a non-Starter and not blocked, then, we try once
185029Spjd		 * again to do composition with the next non-Starter.
185029Spjd		 *
185029Spjd		 * If there was no match and this was a Starter, then,
185029Spjd		 * this is a new start.
185029Spjd		 *
185029Spjd		 * If there was a match and a composition done and we have
185029Spjd		 * more to check on, then, we retrieve a new composition final
185029Spjd		 * table entry for the composite and then try to do the
185029Spjd		 * composition again.
185029Spjd		 */
185029Spjd
185029Spjd		if (match_not_found) {
185029Spjd			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
185029Spjd				i--;
185029Spjd				goto SAVE_THE_CHAR;
185029Spjd			}
185029Spjd
185029Spjd			saved_marks[saved_marks_count++] = i;
185029Spjd		}
185029Spjd
185029Spjd		if (saved_l == l) {
185029Spjd			while (i < last) {
185029Spjd				if (blocked(comb_class, i + 1))
185029Spjd					saved_marks[saved_marks_count++] = ++i;
185029Spjd				else
185029Spjd					break;
185029Spjd			}
185029Spjd			if (i < last) {
185029Spjd				p = saved_p;
185029Spjd				goto TRY_THE_NEXT_MARK;
185029Spjd			}
185029Spjd		} else if (i < last) {
185029Spjd			p = find_composition_start(uv, t + saved_l,
185029Spjd			    l - saved_l);
185029Spjd			if (p != NULL) {
185029Spjd				saved_p = p;
185029Spjd				goto TRY_THE_NEXT_MARK;
185029Spjd			}
185029Spjd		}
185029Spjd
185029Spjd		/*
185029Spjd		 * There is no more composition possible.
185029Spjd		 *
185029Spjd		 * If there was no composition what so ever then we copy
185029Spjd		 * over the original Starter and then append any non-Starters
185029Spjd		 * remaining at the target string sequentially after that.
185029Spjd		 */
185029Spjd
185029Spjd		if (saved_l == l) {
185029Spjd			p = s + start[saved_i];
185029Spjd			size = disp[saved_i];
185029Spjd			for (j = 0; j < size; j++)
185029Spjd				t[l++] = *p++;
185029Spjd		}
185029Spjd
185029Spjd		for (k = 0; k < saved_marks_count; k++) {
185029Spjd			p = s + start[saved_marks[k]];
185029Spjd			size = disp[saved_marks[k]];
185029Spjd			for (j = 0; j < size; j++)
185029Spjd				t[l++] = *p++;
185029Spjd		}
185029Spjd	}
185029Spjd
185029Spjd	/*
185029Spjd	 * If the last character is a Starter and if we have a character
185029Spjd	 * (possibly another Starter) that can be turned into a composite,
185029Spjd	 * we do so and we do so until there is no more of composition
185029Spjd	 * possible.
185029Spjd	 */
185029Spjd	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
185029Spjd		p = *os;
185029Spjd		saved_l = l - disp[last];
185029Spjd
185029Spjd		while (p < oslast) {
185029Spjd			size = u8_number_of_bytes[*p];
185029Spjd			if (size <= 1 || (p + size) > oslast)
185029Spjd				break;
185029Spjd
185029Spjd			saved_p = p;
185029Spjd
185029Spjd			for (i = 0; i < size; i++)
185029Spjd				tc[i] = *p++;
185029Spjd
185029Spjd			q = find_composition_start(uv, t + saved_l,
185029Spjd			    l - saved_l);
185029Spjd			if (q == NULL) {
185029Spjd				p = saved_p;
185029Spjd				break;
185029Spjd			}
185029Spjd
185029Spjd			match_not_found = B_TRUE;
185029Spjd
185029Spjd			for (C = *q++; C > 0; C--) {
185029Spjd				for (k = 0; k < size; q++, k++)
185029Spjd					if (*q != tc[k])
185029Spjd						break;
185029Spjd
185029Spjd				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
185029Spjd					match_not_found = B_FALSE;
185029Spjd
185029Spjd					l = saved_l;
185029Spjd
185029Spjd					while (*++q != U8_TBL_ELEMENT_FILLER) {
185029Spjd						/*
185029Spjd						 * This is practically
185029Spjd						 * impossible but we don't
185029Spjd						 * want to take any chances.
185029Spjd						 */
185029Spjd						if (l >=
185029Spjd						    U8_STREAM_SAFE_TEXT_MAX) {
185029Spjd							p = saved_p;
185029Spjd							goto SAFE_RETURN;
185029Spjd						}
185029Spjd						t[l++] = *q;
185029Spjd					}
185029Spjd
185029Spjd					break;
185029Spjd				}
185029Spjd
185029Spjd				if (*q != U8_TBL_ELEMENT_FILLER)
185029Spjd					while (*++q != U8_TBL_ELEMENT_FILLER)
185029Spjd						;
185029Spjd				while (*++q != U8_TBL_ELEMENT_FILLER)
185029Spjd					;
185029Spjd				q++;
185029Spjd			}
185029Spjd
185029Spjd			if (match_not_found) {
185029Spjd				p = saved_p;
185029Spjd				break;
185029Spjd			}
185029Spjd		}
185029SpjdSAFE_RETURN:
185029Spjd		*os = p;
185029Spjd	}
185029Spjd
185029Spjd	/*
185029Spjd	 * Now we copy over the temporary string to the target string.
185029Spjd	 * Since composition always reduces the number of characters or
185029Spjd	 * the number of characters stay, we don't need to worry about
185029Spjd	 * the buffer overflow here.
185029Spjd	 */
185029Spjd	for (i = 0; i < l; i++)
185029Spjd		s[i] = t[i];
185029Spjd	s[l] = '\0';
185029Spjd
185029Spjd	return (l);
185029Spjd}
185029Spjd
185029Spjd/*
185029Spjd * The collect_a_seq() function checks on the given string s, collect
185029Spjd * a sequence of characters at u8s, and return the sequence. While it collects
185029Spjd * a sequence, it also applies case conversion, canonical or compatibility
185029Spjd * decomposition, canonical decomposition, or some or all of them and
185029Spjd * in that order.
185029Spjd *
185029Spjd * The collected sequence cannot be bigger than 32 characters since if
185029Spjd * it is having more than 31 characters, the sequence will be terminated
185029Spjd * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
185029Spjd * a Stream-Safe Text. The collected sequence is always terminated with
185029Spjd * a null byte and the return value is the byte length of the sequence
185029Spjd * including 0. The return value does not include the terminating
185029Spjd * null byte.
185029Spjd */
185029Spjdstatic size_t
185029Spjdcollect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
185029Spjd	boolean_t is_it_toupper,
185029Spjd	boolean_t is_it_tolower,
185029Spjd	boolean_t canonical_decomposition,
185029Spjd	boolean_t compatibility_decomposition,
185029Spjd	boolean_t canonical_composition,
185029Spjd	int *errnum, u8_normalization_states_t *state)
185029Spjd{
185029Spjd	uchar_t *s;
185029Spjd	int sz;
185029Spjd	int saved_sz;
185029Spjd	size_t i;
185029Spjd	size_t j;
185029Spjd	size_t k;
185029Spjd	size_t l;
185029Spjd	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
185029Spjd	uchar_t disp[U8_MAX_CHARS_A_SEQ];
185029Spjd	uchar_t start[U8_MAX_CHARS_A_SEQ];
185029Spjd	uchar_t u8t[U8_MB_CUR_MAX];
185029Spjd	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
185029Spjd	uchar_t tc;
185029Spjd	size_t last;
185029Spjd	size_t saved_last;
185029Spjd	uint32_t u1;
185029Spjd
185029Spjd	/*
185029Spjd	 * Save the source string pointer which we will return a changed
185029Spjd	 * pointer if we do processing.
185029Spjd	 */
185029Spjd	s = *source;
185029Spjd
185029Spjd	/*
185029Spjd	 * The following is a fallback for just in case callers are not
185029Spjd	 * checking the string boundaries before the calling.
185029Spjd	 */
185029Spjd	if (s >= slast) {
185029Spjd		u8s[0] = '\0';
185029Spjd
185029Spjd		return (0);
185029Spjd	}
185029Spjd
185029Spjd	/*
185029Spjd	 * As the first thing, let's collect a character and do case
185029Spjd	 * conversion if necessary.
185029Spjd	 */
185029Spjd
185029Spjd	sz = u8_number_of_bytes[*s];
185029Spjd
185029Spjd	if (sz < 0) {
185029Spjd		*errnum = EILSEQ;
185029Spjd
185029Spjd		u8s[0] = *s++;
185029Spjd		u8s[1] = '\0';
185029Spjd
185029Spjd		*source = s;
185029Spjd
185029Spjd		return (1);
185029Spjd	}
185029Spjd
185029Spjd	if (sz == 1) {
185029Spjd		if (is_it_toupper)
185029Spjd			u8s[0] = U8_ASCII_TOUPPER(*s);
185029Spjd		else if (is_it_tolower)
185029Spjd			u8s[0] = U8_ASCII_TOLOWER(*s);
185029Spjd		else
185029Spjd			u8s[0] = *s;
185029Spjd		s++;
185029Spjd		u8s[1] = '\0';
185029Spjd	} else if ((s + sz) > slast) {
185029Spjd		*errnum = EINVAL;
185029Spjd
185029Spjd		for (i = 0; s < slast; )
185029Spjd			u8s[i++] = *s++;
185029Spjd		u8s[i] = '\0';
185029Spjd
185029Spjd		*source = s;
185029Spjd
185029Spjd		return (i);
185029Spjd	} else {
185029Spjd		if (is_it_toupper || is_it_tolower) {
185029Spjd			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
185029Spjd			s += sz;
185029Spjd			sz = i;
185029Spjd		} else {
185029Spjd			for (i = 0; i < sz; )
185029Spjd				u8s[i++] = *s++;
185029Spjd			u8s[i] = '\0';
185029Spjd		}
185029Spjd	}
185029Spjd
185029Spjd	/*
185029Spjd	 * And then canonical/compatibility decomposition followed by
185029Spjd	 * an optional canonical composition. Please be noted that
185029Spjd	 * canonical composition is done only when a decomposition is
185029Spjd	 * done.
185029Spjd	 */
185029Spjd	if (canonical_decomposition || compatibility_decomposition) {
185029Spjd		if (sz == 1) {
185029Spjd			*state = U8_STATE_START;
185029Spjd
185029Spjd			saved_sz = 1;
185029Spjd
185029Spjd			comb_class[0] = 0;
185029Spjd			start[0] = 0;
185029Spjd			disp[0] = 1;
185029Spjd
185029Spjd			last = 1;
185029Spjd		} else {
185029Spjd			saved_sz = do_decomp(uv, u8s, u8s, sz,
185029Spjd			    canonical_decomposition, state);
185029Spjd
185029Spjd			last = 0;
185029Spjd
185029Spjd			for (i = 0; i < saved_sz; ) {
185029Spjd				sz = u8_number_of_bytes[u8s[i]];
185029Spjd
185029Spjd				comb_class[last] = combining_class(uv,
185029Spjd				    u8s + i, sz);
185029Spjd				start[last] = i;
185029Spjd				disp[last] = sz;
185029Spjd
185029Spjd				last++;
185029Spjd				i += sz;
185029Spjd			}
185029Spjd
185029Spjd			/*
185029Spjd			 * Decomposition yields various Hangul related
185029Spjd			 * states but not on combining marks. We need to
185029Spjd			 * find out at here by checking on the last
185029Spjd			 * character.
185029Spjd			 */
185029Spjd			if (*state == U8_STATE_START) {
185029Spjd				if (comb_class[last - 1])
185029Spjd					*state = U8_STATE_COMBINING_MARK;
185029Spjd			}
185029Spjd		}
185029Spjd
185029Spjd		saved_last = last;
185029Spjd
185029Spjd		while (s < slast) {
185029Spjd			sz = u8_number_of_bytes[*s];
185029Spjd
185029Spjd			/*
185029Spjd			 * If this is an illegal character, an incomplete
185029Spjd			 * character, or an 7-bit ASCII Starter character,
185029Spjd			 * then we have collected a sequence; break and let
185029Spjd			 * the next call deal with the two cases.
185029Spjd			 *
185029Spjd			 * Note that this is okay only if you are using this
185029Spjd			 * function with a fixed length string, not on
185029Spjd			 * a buffer with multiple calls of one chunk at a time.
185029Spjd			 */
185029Spjd			if (sz <= 1) {
185029Spjd				break;
185029Spjd			} else if ((s + sz) > slast) {
185029Spjd				break;
185029Spjd			} else {
185029Spjd				/*
185029Spjd				 * If the previous character was a Hangul Jamo
185029Spjd				 * and this character is a Hangul Jamo that
185029Spjd				 * can be conjoined, we collect the Jamo.
185029Spjd				 */
185029Spjd				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
185029Spjd					U8_PUT_3BYTES_INTO_UTF32(u1,
185029Spjd					    *s, *(s + 1), *(s + 2));
185029Spjd
185029Spjd					if (U8_HANGUL_COMPOSABLE_L_V(*state,
185029Spjd					    u1)) {
185029Spjd						i = 0;
185029Spjd						*state = U8_STATE_HANGUL_LV;
185029Spjd						goto COLLECT_A_HANGUL;
185029Spjd					}
185029Spjd
185029Spjd					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
185029Spjd					    u1)) {
185029Spjd						i = 0;
185029Spjd						*state = U8_STATE_HANGUL_LVT;
185029Spjd						goto COLLECT_A_HANGUL;
185029Spjd					}
185029Spjd				}
185029Spjd
185029Spjd				/*
185029Spjd				 * Regardless of whatever it was, if this is
185029Spjd				 * a Starter, we don't collect the character
185029Spjd				 * since that's a new start and we will deal
185029Spjd				 * with it at the next time.
185029Spjd				 */
185029Spjd				i = combining_class(uv, s, sz);
185029Spjd				if (i == U8_COMBINING_CLASS_STARTER)
185029Spjd					break;
185029Spjd
185029Spjd				/*
185029Spjd				 * We know the current character is a combining
185029Spjd				 * mark. If the previous character wasn't
185029Spjd				 * a Starter (not Hangul) or a combining mark,
185029Spjd				 * then, we don't collect this combining mark.
185029Spjd				 */
185029Spjd				if (*state != U8_STATE_START &&
185029Spjd				    *state != U8_STATE_COMBINING_MARK)
185029Spjd					break;
185029Spjd
185029Spjd				*state = U8_STATE_COMBINING_MARK;
185029SpjdCOLLECT_A_HANGUL:
185029Spjd				/*
185029Spjd				 * If we collected a Starter and combining
185029Spjd				 * marks up to 30, i.e., total 31 characters,
185029Spjd				 * then, we terminate this degenerately long
185029Spjd				 * combining sequence with a U+034F COMBINING
185029Spjd				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
185029Spjd				 * UTF-8 and turn this into a Stream-Safe
185029Spjd				 * Text. This will be extremely rare but
185029Spjd				 * possible.
185029Spjd				 *
185029Spjd				 * The following will also guarantee that
185029Spjd				 * we are not writing more than 32 characters
185029Spjd				 * plus a NULL at u8s[].
185029Spjd				 */
185029Spjd				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
185029SpjdTURN_STREAM_SAFE:
185029Spjd					*state = U8_STATE_START;
185029Spjd					comb_class[last] = 0;
185029Spjd					start[last] = saved_sz;
185029Spjd					disp[last] = 2;
185029Spjd					last++;
185029Spjd
185029Spjd					u8s[saved_sz++] = 0xCD;
185029Spjd					u8s[saved_sz++] = 0x8F;
185029Spjd
185029Spjd					break;
185029Spjd				}
185029Spjd
185029Spjd				/*
185029Spjd				 * Some combining marks also do decompose into
185029Spjd				 * another combining mark or marks.
185029Spjd				 */
185029Spjd				if (*state == U8_STATE_COMBINING_MARK) {
185029Spjd					k = last;
185029Spjd					l = sz;
185029Spjd					i = do_decomp(uv, uts, s, sz,
185029Spjd					    canonical_decomposition, state);
185029Spjd					for (j = 0; j < i; ) {
185029Spjd						sz = u8_number_of_bytes[uts[j]];
185029Spjd
185029Spjd						comb_class[last] =
185029Spjd						    combining_class(uv,
185029Spjd						    uts + j, sz);
185029Spjd						start[last] = saved_sz + j;
185029Spjd						disp[last] = sz;
185029Spjd
185029Spjd						last++;
185029Spjd						if (last >=
185029Spjd						    U8_UPPER_LIMIT_IN_A_SEQ) {
185029Spjd							last = k;
185029Spjd							goto TURN_STREAM_SAFE;
185029Spjd						}
185029Spjd						j += sz;
185029Spjd					}
185029Spjd
185029Spjd					*state = U8_STATE_COMBINING_MARK;
185029Spjd					sz = i;
185029Spjd					s += l;
185029Spjd
185029Spjd					for (i = 0; i < sz; i++)
185029Spjd						u8s[saved_sz++] = uts[i];
185029Spjd				} else {
185029Spjd					comb_class[last] = i;
185029Spjd					start[last] = saved_sz;
185029Spjd					disp[last] = sz;
185029Spjd					last++;
185029Spjd
185029Spjd					for (i = 0; i < sz; i++)
185029Spjd						u8s[saved_sz++] = *s++;
185029Spjd				}
185029Spjd
185029Spjd				/*
185029Spjd				 * If this is U+0345 COMBINING GREEK
185029Spjd				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
185029Spjd				 * iota subscript, and need to be converted to
185029Spjd				 * uppercase letter, convert it to U+0399 GREEK
185029Spjd				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
185029Spjd				 * i.e., convert to capital adscript form as
185029Spjd				 * specified in the Unicode standard.
185029Spjd				 *
185029Spjd				 * This is the only special case of (ambiguous)
185029Spjd				 * case conversion at combining marks and
185029Spjd				 * probably the standard will never have
185029Spjd				 * anything similar like this in future.
185029Spjd				 */
185029Spjd				if (is_it_toupper && sz >= 2 &&
185029Spjd				    u8s[saved_sz - 2] == 0xCD &&
185029Spjd				    u8s[saved_sz - 1] == 0x85) {
185029Spjd					u8s[saved_sz - 2] = 0xCE;
185029Spjd					u8s[saved_sz - 1] = 0x99;
185029Spjd				}
185029Spjd			}
185029Spjd		}
185029Spjd
185029Spjd		/*
185029Spjd		 * Let's try to ensure a canonical ordering for the collected
185029Spjd		 * combining marks. We do this only if we have collected
185029Spjd		 * at least one more non-Starter. (The decomposition mapping
185029Spjd		 * data tables have fully (and recursively) expanded and
185029Spjd		 * canonically ordered decompositions.)
185029Spjd		 *
185029Spjd		 * The U8_SWAP_COMB_MARKS() convenience macro has some
185029Spjd		 * assumptions and we are meeting the assumptions.
185029Spjd		 */
185029Spjd		last--;
185029Spjd		if (last >= saved_last) {
185029Spjd			for (i = 0; i < last; i++)
185029Spjd				for (j = last; j > i; j--)
185029Spjd					if (comb_class[j] &&
185029Spjd					    comb_class[j - 1] > comb_class[j]) {
185029Spjd						U8_SWAP_COMB_MARKS(j - 1, j);
185029Spjd					}
185029Spjd		}
185029Spjd
185029Spjd		*source = s;
185029Spjd
185029Spjd		if (! canonical_composition) {
185029Spjd			u8s[saved_sz] = '\0';
185029Spjd			return (saved_sz);
185029Spjd		}
185029Spjd
185029Spjd		/*
185029Spjd		 * Now do the canonical composition. Note that we do this
185029Spjd		 * only after a canonical or compatibility decomposition to
185029Spjd		 * finish up NFC or NFKC.
185029Spjd		 */
185029Spjd		sz = do_composition(uv, u8s, comb_class, start, disp, last,
185029Spjd		    &s, slast);
185029Spjd	}
185029Spjd
185029Spjd	*source = s;
185029Spjd
185029Spjd	return ((size_t)sz);
185029Spjd}
185029Spjd
185029Spjd/*
185029Spjd * The do_norm_compare() function does string comparison based on Unicode
185029Spjd * simple case mappings and Unicode Normalization definitions.
185029Spjd *
185029Spjd * It does so by collecting a sequence of character at a time and comparing
185029Spjd * the collected sequences from the strings.
185029Spjd *
185029Spjd * The meanings on the return values are the same as the usual strcmp().
185029Spjd */
185029Spjdstatic int
185029Spjddo_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
185029Spjd	int flag, int *errnum)
185029Spjd{
185029Spjd	int result;
185029Spjd	size_t sz1;
185029Spjd	size_t sz2;
185029Spjd	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
185029Spjd	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
185029Spjd	uchar_t *s1last;
185029Spjd	uchar_t *s2last;
185029Spjd	boolean_t is_it_toupper;
185029Spjd	boolean_t is_it_tolower;
185029Spjd	boolean_t canonical_decomposition;
185029Spjd	boolean_t compatibility_decomposition;
185029Spjd	boolean_t canonical_composition;
185029Spjd	u8_normalization_states_t state;
185029Spjd
185029Spjd	s1last = s1 + n1;
185029Spjd	s2last = s2 + n2;
185029Spjd
185029Spjd	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
185029Spjd	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
185029Spjd	canonical_decomposition = flag & U8_CANON_DECOMP;
185029Spjd	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
185029Spjd	canonical_composition = flag & U8_CANON_COMP;
185029Spjd
185029Spjd	while (s1 < s1last && s2 < s2last) {
185029Spjd		/*
185029Spjd		 * If the current character is a 7-bit ASCII and the last
185029Spjd		 * character, or, if the current character and the next
185029Spjd		 * character are both some 7-bit ASCII characters then
185029Spjd		 * we treat the current character as a sequence.
185029Spjd		 *
185029Spjd		 * In any other cases, we need to call collect_a_seq().
185029Spjd		 */
185029Spjd
185029Spjd		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
185029Spjd		    ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
185029Spjd			if (is_it_toupper)
185029Spjd				u8s1[0] = U8_ASCII_TOUPPER(*s1);
185029Spjd			else if (is_it_tolower)
185029Spjd				u8s1[0] = U8_ASCII_TOLOWER(*s1);
185029Spjd			else
185029Spjd				u8s1[0] = *s1;
185029Spjd			u8s1[1] = '\0';
185029Spjd			sz1 = 1;
185029Spjd			s1++;
185029Spjd		} else {
185029Spjd			state = U8_STATE_START;
185029Spjd			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
185029Spjd			    is_it_toupper, is_it_tolower,
185029Spjd			    canonical_decomposition,
185029Spjd			    compatibility_decomposition,
185029Spjd			    canonical_composition, errnum, &state);
185029Spjd		}
185029Spjd
185029Spjd		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
185029Spjd		    ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
185029Spjd			if (is_it_toupper)
185029Spjd				u8s2[0] = U8_ASCII_TOUPPER(*s2);
185029Spjd			else if (is_it_tolower)
185029Spjd				u8s2[0] = U8_ASCII_TOLOWER(*s2);
185029Spjd			else
185029Spjd				u8s2[0] = *s2;
185029Spjd			u8s2[1] = '\0';
185029Spjd			sz2 = 1;
185029Spjd			s2++;
185029Spjd		} else {
185029Spjd			state = U8_STATE_START;
185029Spjd			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
185029Spjd			    is_it_toupper, is_it_tolower,
185029Spjd			    canonical_decomposition,
185029Spjd			    compatibility_decomposition,
185029Spjd			    canonical_composition, errnum, &state);
185029Spjd		}
185029Spjd
185029Spjd		/*
185029Spjd		 * Now compare the two characters. If they are the same,
185029Spjd		 * we move on to the next character sequences.
185029Spjd		 */
185029Spjd		if (sz1 == 1 && sz2 == 1) {
185029Spjd			if (*u8s1 > *u8s2)
185029Spjd				return (1);
185029Spjd			if (*u8s1 < *u8s2)
185029Spjd				return (-1);
185029Spjd		} else {
185029Spjd			result = strcmp((const char *)u8s1, (const char *)u8s2);
185029Spjd			if (result != 0)
185029Spjd				return (result);
185029Spjd		}
185029Spjd	}
185029Spjd
185029Spjd	/*
185029Spjd	 * We compared until the end of either or both strings.
185029Spjd	 *
185029Spjd	 * If we reached to or went over the ends for the both, that means
185029Spjd	 * they are the same.
185029Spjd	 *
185029Spjd	 * If we reached only one end, that means the other string has
185029Spjd	 * something which then can be used to determine the return value.
185029Spjd	 */
185029Spjd	if (s1 >= s1last) {
185029Spjd		if (s2 >= s2last)
185029Spjd			return (0);
185029Spjd		return (-1);
185029Spjd	}
185029Spjd	return (1);
185029Spjd}
185029Spjd
185029Spjd/*
185029Spjd * The u8_strcmp() function compares two UTF-8 strings quite similar to
185029Spjd * the strcmp(). For the comparison, however, Unicode Normalization specific
185029Spjd * equivalency and Unicode simple case conversion mappings based equivalency
185029Spjd * can be requested and checked against.
185029Spjd */
185029Spjdint
185029Spjdu8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
185029Spjd		int *errnum)
185029Spjd{
185029Spjd	int f;
185029Spjd	size_t n1;
185029Spjd	size_t n2;
185029Spjd
185029Spjd	*errnum = 0;
185029Spjd
185029Spjd	/*
185029Spjd	 * Check on the requested Unicode version, case conversion, and
185029Spjd	 * normalization flag values.
185029Spjd	 */
185029Spjd
185029Spjd	if (uv > U8_UNICODE_LATEST) {
185029Spjd		*errnum = ERANGE;
185029Spjd		uv = U8_UNICODE_LATEST;
185029Spjd	}
185029Spjd
185029Spjd	if (flag == 0) {
185029Spjd		flag = U8_STRCMP_CS;
185029Spjd	} else {
185029Spjd		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
185029Spjd		    U8_STRCMP_CI_LOWER);
185029Spjd		if (f == 0) {
185029Spjd			flag |= U8_STRCMP_CS;
185029Spjd		} else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
185029Spjd		    f != U8_STRCMP_CI_LOWER) {
185029Spjd			*errnum = EBADF;
185029Spjd			flag = U8_STRCMP_CS;
185029Spjd		}
185029Spjd
185029Spjd		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
185029Spjd		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
185029Spjd		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
185029Spjd			*errnum = EBADF;
185029Spjd			flag = U8_STRCMP_CS;
185029Spjd		}
185029Spjd	}
185029Spjd
185029Spjd	if (flag == U8_STRCMP_CS) {
185029Spjd		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
185029Spjd	}
185029Spjd
185029Spjd	n1 = strlen(s1);
185029Spjd	n2 = strlen(s2);
185029Spjd	if (n != 0) {
185029Spjd		if (n < n1)
185029Spjd			n1 = n;
185029Spjd		if (n < n2)
185029Spjd			n2 = n;
185029Spjd	}
185029Spjd
185029Spjd	/*
185029Spjd	 * Simple case conversion can be done much faster and so we do
185029Spjd	 * them separately here.
185029Spjd	 */
185029Spjd	if (flag == U8_STRCMP_CI_UPPER) {
185029Spjd		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
185029Spjd		    n1, n2, B_TRUE, errnum));
185029Spjd	} else if (flag == U8_STRCMP_CI_LOWER) {
185029Spjd		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
185029Spjd		    n1, n2, B_FALSE, errnum));
185029Spjd	}
185029Spjd
185029Spjd	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
185029Spjd	    flag, errnum));
185029Spjd}
185029Spjd
185029Spjdsize_t
185029Spjdu8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
185029Spjd	int flag, size_t unicode_version, int *errnum)
185029Spjd{
185029Spjd	int f;
185029Spjd	int sz;
185029Spjd	uchar_t *ib;
185029Spjd	uchar_t *ibtail;
185029Spjd	uchar_t *ob;
185029Spjd	uchar_t *obtail;
185029Spjd	boolean_t do_not_ignore_null;
185029Spjd	boolean_t do_not_ignore_invalid;
185029Spjd	boolean_t is_it_toupper;
185029Spjd	boolean_t is_it_tolower;
185029Spjd	boolean_t canonical_decomposition;
185029Spjd	boolean_t compatibility_decomposition;
185029Spjd	boolean_t canonical_composition;
185029Spjd	size_t ret_val;
185029Spjd	size_t i;
185029Spjd	size_t j;
185029Spjd	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
185029Spjd	u8_normalization_states_t state;
185029Spjd
185029Spjd	if (unicode_version > U8_UNICODE_LATEST) {
185029Spjd		*errnum = ERANGE;
185029Spjd		return ((size_t)-1);
185029Spjd	}
185029Spjd
185029Spjd	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
185029Spjd	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
185029Spjd		*errnum = EBADF;
185029Spjd		return ((size_t)-1);
185029Spjd	}
185029Spjd
185029Spjd	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
185029Spjd	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
185029Spjd	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
185029Spjd		*errnum = EBADF;
185029Spjd		return ((size_t)-1);
185029Spjd	}
185029Spjd
185029Spjd	if (inarray == NULL || *inlen == 0)
185029Spjd		return (0);
185029Spjd
185029Spjd	if (outarray == NULL) {
185029Spjd		*errnum = E2BIG;
185029Spjd		return ((size_t)-1);
185029Spjd	}
185029Spjd
185029Spjd	ib = (uchar_t *)inarray;
185029Spjd	ob = (uchar_t *)outarray;
185029Spjd	ibtail = ib + *inlen;
185029Spjd	obtail = ob + *outlen;
185029Spjd
185029Spjd	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
185029Spjd	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
185029Spjd	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
185029Spjd	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
185029Spjd
185029Spjd	ret_val = 0;
185029Spjd
185029Spjd	/*
185029Spjd	 * If we don't have a normalization flag set, we do the simple case
185029Spjd	 * conversion based text preparation separately below. Text
185029Spjd	 * preparation involving Normalization will be done in the false task
185029Spjd	 * block, again, separately since it will take much more time and
185029Spjd	 * resource than doing simple case conversions.
185029Spjd	 */
185029Spjd	if (f == 0) {
185029Spjd		while (ib < ibtail) {
185029Spjd			if (*ib == '\0' && do_not_ignore_null)
185029Spjd				break;
185029Spjd
185029Spjd			sz = u8_number_of_bytes[*ib];
185029Spjd
185029Spjd			if (sz < 0) {
185029Spjd				if (do_not_ignore_invalid) {
185029Spjd					*errnum = EILSEQ;
185029Spjd					ret_val = (size_t)-1;
185029Spjd					break;
185029Spjd				}
185029Spjd
185029Spjd				sz = 1;
185029Spjd				ret_val++;
185029Spjd			}
185029Spjd
185029Spjd			if (sz == 1) {
185029Spjd				if (ob >= obtail) {
185029Spjd					*errnum = E2BIG;
185029Spjd					ret_val = (size_t)-1;
185029Spjd					break;
185029Spjd				}
185029Spjd
185029Spjd				if (is_it_toupper)
185029Spjd					*ob = U8_ASCII_TOUPPER(*ib);
185029Spjd				else if (is_it_tolower)
185029Spjd					*ob = U8_ASCII_TOLOWER(*ib);
185029Spjd				else
185029Spjd					*ob = *ib;
185029Spjd				ib++;
185029Spjd				ob++;
185029Spjd			} else if ((ib + sz) > ibtail) {
185029Spjd				if (do_not_ignore_invalid) {
185029Spjd					*errnum = EINVAL;
185029Spjd					ret_val = (size_t)-1;
185029Spjd					break;
185029Spjd				}
185029Spjd
185029Spjd				if ((obtail - ob) < (ibtail - ib)) {
185029Spjd					*errnum = E2BIG;
185029Spjd					ret_val = (size_t)-1;
185029Spjd					break;
185029Spjd				}
185029Spjd
185029Spjd				/*
185029Spjd				 * We treat the remaining incomplete character
185029Spjd				 * bytes as a character.
185029Spjd				 */
185029Spjd				ret_val++;
185029Spjd
185029Spjd				while (ib < ibtail)
185029Spjd					*ob++ = *ib++;
185029Spjd			} else {
185029Spjd				if (is_it_toupper || is_it_tolower) {
185029Spjd					i = do_case_conv(unicode_version, u8s,
185029Spjd					    ib, sz, is_it_toupper);
185029Spjd
185029Spjd					if ((obtail - ob) < i) {
185029Spjd						*errnum = E2BIG;
185029Spjd						ret_val = (size_t)-1;
185029Spjd						break;
185029Spjd					}
185029Spjd
185029Spjd					ib += sz;
185029Spjd
185029Spjd					for (sz = 0; sz < i; sz++)
185029Spjd						*ob++ = u8s[sz];
185029Spjd				} else {
185029Spjd					if ((obtail - ob) < sz) {
185029Spjd						*errnum = E2BIG;
185029Spjd						ret_val = (size_t)-1;
185029Spjd						break;
185029Spjd					}
185029Spjd
185029Spjd					for (i = 0; i < sz; i++)
185029Spjd						*ob++ = *ib++;
185029Spjd				}
185029Spjd			}
185029Spjd		}
185029Spjd	} else {
185029Spjd		canonical_decomposition = flag & U8_CANON_DECOMP;
185029Spjd		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
185029Spjd		canonical_composition = flag & U8_CANON_COMP;
185029Spjd
185029Spjd		while (ib < ibtail) {
185029Spjd			if (*ib == '\0' && do_not_ignore_null)
185029Spjd				break;
185029Spjd
185029Spjd			/*
185029Spjd			 * If the current character is a 7-bit ASCII
185029Spjd			 * character and it is the last character, or,
185029Spjd			 * if the current character is a 7-bit ASCII
185029Spjd			 * character and the next character is also a 7-bit
185029Spjd			 * ASCII character, then, we copy over this
185029Spjd			 * character without going through collect_a_seq().
185029Spjd			 *
185029Spjd			 * In any other cases, we need to look further with
185029Spjd			 * the collect_a_seq() function.
185029Spjd			 */
185029Spjd			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
185029Spjd			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
185029Spjd				if (ob >= obtail) {
185029Spjd					*errnum = E2BIG;
185029Spjd					ret_val = (size_t)-1;
185029Spjd					break;
185029Spjd				}
185029Spjd
185029Spjd				if (is_it_toupper)
185029Spjd					*ob = U8_ASCII_TOUPPER(*ib);
185029Spjd				else if (is_it_tolower)
185029Spjd					*ob = U8_ASCII_TOLOWER(*ib);
185029Spjd				else
185029Spjd					*ob = *ib;
185029Spjd				ib++;
185029Spjd				ob++;
185029Spjd			} else {
185029Spjd				*errnum = 0;
185029Spjd				state = U8_STATE_START;
185029Spjd
185029Spjd				j = collect_a_seq(unicode_version, u8s,
185029Spjd				    &ib, ibtail,
185029Spjd				    is_it_toupper,
185029Spjd				    is_it_tolower,
185029Spjd				    canonical_decomposition,
185029Spjd				    compatibility_decomposition,
185029Spjd				    canonical_composition,
185029Spjd				    errnum, &state);
185029Spjd
185029Spjd				if (*errnum && do_not_ignore_invalid) {
185029Spjd					ret_val = (size_t)-1;
185029Spjd					break;
185029Spjd				}
185029Spjd
185029Spjd				if ((obtail - ob) < j) {
185029Spjd					*errnum = E2BIG;
185029Spjd					ret_val = (size_t)-1;
185029Spjd					break;
185029Spjd				}
185029Spjd
185029Spjd				for (i = 0; i < j; i++)
185029Spjd					*ob++ = u8s[i];
185029Spjd			}
185029Spjd		}
185029Spjd	}
185029Spjd
185029Spjd	*inlen = ibtail - ib;
185029Spjd	*outlen = obtail - ob;
185029Spjd
185029Spjd	return (ret_val);
185029Spjd}