1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28
29/*
30 * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
31 *
32 * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
33 * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
34 * the section 3C man pages.
35 * Interface stability: Committed.
36 */
37
38#include <sys/types.h>
39#ifdef	_KERNEL
40#include <sys/param.h>
41#include <sys/sysmacros.h>
42#include <sys/systm.h>
43#include <sys/debug.h>
44#include <sys/kmem.h>
45#include <sys/sunddi.h>
46#else
47#include <strings.h>
48#endif	/* _KERNEL */
49#include <sys/byteorder.h>
50#include <sys/errno.h>
51#include <sys/u8_textprep.h>
52#include <sys/u8_textprep_data.h>
53
54
/* The maximum possible number of bytes in a single UTF-8 character. */
#define	U8_MB_CUR_MAX			(4)

/*
 * The maximum number of bytes needed for a UTF-8 character to cover
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 * u8_validate() rejects characters longer than this with ERANGE when
 * U8_VALIDATE_UCS2_RANGE is requested.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/*
 * The maximum possible number of bytes in a Stream-Safe Text.
 * Numerically this is U8_MAX_CHARS_A_SEQ * U8_MB_CUR_MAX (32 * 4).
 */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * The maximum number of characters in a combining/conjoining sequence and
 * the actual upper bound limit of a combining/conjoining sequence.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/* The combining class value of a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)
76
/*
 * Hangul related macros.
 *
 * The first and the last of the Hangul syllables, Hangul Jamo leading
 * consonants, vowels, and optional trailing consonants, expressed as
 * Unicode scalar values.
 *
 * Note that U8_HANGUL_JAMO_T_FIRST is 0x11A7 and not the actual first
 * trailing consonant, U+11A8. The trailing consonant is optional, so the
 * value is pre-decremented by one: a trailing index of zero then means
 * "no trailing consonant" and any non-zero index can simply be added to
 * U8_HANGUL_JAMO_T_FIRST. For the same reason U8_HANGUL_JAMO_T() uses
 * a strict '>' comparison against U8_HANGUL_JAMO_T_FIRST.
 *
 * Each of the 19 modern leading consonants has a total of 588 possible
 * syllables since Hangul has 21 modern vowels and 27 modern trailing
 * consonants plus 1 for the no trailing consonant case, i.e.,
 * 21 x 28 = 588.
 *
 * U8_HANGUL_JAMO_1ST_BYTE can be used as a quick check on whether
 * a character may be a Hangul Jamo, but a matching first byte does not
 * guarantee that the character is a Hangul Jamo; it only means that it is
 * likely to be one.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

/* Number of vowels, of (vowel, trailing) pairs, and of trailing slots. */
#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the scalar value 'b' as a three byte UTF-8 sequence into s[i],
 * s[j], and s[k].
 *
 * The macro expands to three separate statements, each one already
 * terminated with a semicolon. It therefore must not be used as the
 * unbraced body of an if, else, or loop statement.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	(s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
	(s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
	(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));

/* True if the scalar value 'u' is a modern leading consonant Jamo. */
#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

/* True if the scalar value 'u' is a modern vowel Jamo. */
#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

/*
 * True if the scalar value 'u' is a modern trailing consonant Jamo. The
 * lower bound is exclusive since U8_HANGUL_JAMO_T_FIRST is one less than
 * the first actual trailing consonant.
 */
#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/*
 * True if the scalar value 'u' lies anywhere between the first leading
 * consonant and the last trailing consonant, inclusive. This range also
 * contains the values that sit between the L, V, and T ranges above.
 */
#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/* True if the scalar value 'u' is a precomposed Hangul syllable. */
#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

/* True if the state 's' is "leading consonant" and 'u' is a vowel. */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

/* True if the state 's' is "LV" and 'u' is a trailing consonant. */
#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
137
/*
 * The types of decomposition mappings. These are the values of the type
 * byte found at the start of a byte sequence in the final decomposition
 * table; see do_decomp() for how they are interpreted.
 */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/*
 * The indicator for a 16-bit table: a table id at or above this value
 * selects the 16-bit variant of a fourth byte table.
 */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)

/* The following are some convenience macros. */

/*
 * Assemble the payload bits of the three UTF-8 bytes b1, b2, and b3 into
 * the scalar value 'u'. No validation is done on the bytes.
 *
 * Since '&' binds tighter than '|', the last term is evaluated as
 * ((uint32_t)(b3) & 0x3F). The expansion already ends with a semicolon.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	(u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \
		(uint32_t)(b3) & 0x3F;

/*
 * Swap 'a' and 'b' using 't' as the temporary. This expands to three
 * statements, so it must not be used as an unbraced if/else/loop body.
 */
#define	U8_SIMPLE_SWAP(a, b, t) \
	(t) = (a); \
	(a) = (b); \
	(b) = (t);

/* ASCII-only case mappings; any other value is returned unchanged. */
#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

/* True if 'c', taken as an unsigned byte, is below 0x80. */
#define	U8_ISASCII(c)			(((uchar_t)(c)) < 0x80U)
/*
 * The following macro assumes that the two characters that are to be
 * swapped are adjacent to each other and that 'a' comes before 'b'.
 *
 * If the assumptions are not met, the macro will fail.
 *
 * The macro is not self-contained: it uses the variables k, u8s, u8t,
 * start, disp, comb_class, and tc of the function it is expanded in.
 * 'a' and 'b' are indices into start[], disp[], and comb_class[]. The
 * bytes of the two characters are exchanged inside u8s[] using u8t[] as
 * scratch space, start[(b)] is recomputed, and the combining class and
 * byte length entries are swapped. As a multi-statement macro it must not
 * be used as an unbraced if/else/loop body.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	for (k = 0; k < disp[(a)]; k++) \
		u8t[k] = u8s[start[(a)] + k]; \
	for (k = 0; k < disp[(b)]; k++) \
		u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
	start[(b)] = start[(a)] + disp[(b)]; \
	for (k = 0; k < disp[(a)]; k++) \
		u8s[start[(b)] + k] = u8t[k]; \
	U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
	U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
178
/*
 * The possible states during normalization.
 *
 * The enumerators are numbered consecutively starting from zero, i.e.,
 * U8_STATE_START is 0 and U8_STATE_COMBINING_MARK is 6.
 */
typedef enum {
	/* Nothing special has been seen; also the fallback state. */
	U8_STATE_START,
	/* A Hangul leading consonant. */
	U8_STATE_HANGUL_L,
	/* A Hangul leading consonant followed by a vowel. */
	U8_STATE_HANGUL_LV,
	/* A Hangul leading consonant, a vowel, and a trailing consonant. */
	U8_STATE_HANGUL_LVT,
	/* A Hangul vowel that did not follow a leading consonant. */
	U8_STATE_HANGUL_V,
	/* A Hangul trailing consonant that did not follow an LV pair. */
	U8_STATE_HANGUL_T,
	/* Presumably a combining mark; it is set outside of do_decomp(). */
	U8_STATE_COMBINING_MARK
} u8_normalization_states_t;
189
/*
 * The three vectors below are used to check that the bytes of a given
 * UTF-8 character are valid and do not contain any malformed byte values.
 *
 * UTF-8 used to have a quite relaxed binary representation, but that led
 * to security related issues, and so the Unicode Consortium defined and
 * announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it one
 * more time at Unicode 3.2. The following three tables are based on that.
 *
 * All three vectors are indexed by the first byte of a character:
 *
 *	u8_number_of_bytes[]	the total byte length of the character, or
 *				U8_ILLEGAL_CHAR / U8_OUT_OF_RANGE_CHAR if
 *				the byte cannot start a character
 *	u8_valid_min_2nd_byte[]	the smallest valid second byte
 *	u8_valid_max_2nd_byte[]	the largest valid second byte
 */

/*
 * True if 'c' is outside of 0x80 - 0xBF. u8_validate() applies this to
 * the third and the fourth bytes of a character; the second byte is
 * checked against the min/max vectors instead.
 */
#define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)

/* Short aliases to keep the table rows readable; undefined right after. */
#define	I_				U8_ILLEGAL_CHAR
#define	O_				U8_OUT_OF_RANGE_CHAR

const int8_t u8_number_of_bytes[0x100] = {
/*	00 - 7F: single byte characters */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

/*	80 - BF: these bytes cannot be the first byte of a character */
/*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	C0 and C1 are illegal; C2 - DF start two byte characters */
/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*	E0 - EF start three byte characters */
/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

/*	F0 - F4 start four byte characters; F5 - FF are out of range */
/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
};

#undef	I_
#undef	O_
243
/*
 * The smallest valid second byte, indexed by the first byte of a
 * character. Only the first bytes that start a multi-byte character,
 * 0xC2 - 0xF4, have an entry; every other element is left at zero.
 *
 * The lower bound is 0x80 except for the first bytes 0xE0 and 0xF0, which
 * have the raised lower bounds 0xa0 and 0x90, respectively.
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
	/* C2 - CF */
	[0xC2] = 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* D0 - DF */
	[0xD0] = 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* E0 - EF */
	[0xE0] = 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* F0 - F4 */
	[0xF0] = 0x90, 0x80, 0x80, 0x80, 0x80,
};
285
/*
 * The largest valid second byte, indexed by the first byte of a
 * character. Only the first bytes that start a multi-byte character,
 * 0xC2 - 0xF4, have an entry; every other element is left at zero.
 *
 * The upper bound is 0xbf except for the first bytes 0xED and 0xF4, which
 * have the lowered upper bounds 0x9f and 0x8f, respectively.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
	/* C2 - CF */
	[0xC2] = 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* D0 - DF */
	[0xD0] = 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* E0 - EF */
	[0xE0] = 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,

	/* F0 - F4 */
	[0xF0] = 0xbf, 0xbf, 0xbf, 0xbf, 0x8f,
};
327
328
329/*
330 * The u8_validate() validates on the given UTF-8 character string and
331 * calculate the byte length. It is quite similar to mblen(3C) except that
332 * this will validate against the list of characters if required and
333 * specific to UTF-8 and Unicode.
334 */
335int
336u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
337{
338	uchar_t *ib;
339	uchar_t *ibtail;
340	uchar_t **p;
341	uchar_t *s1;
342	uchar_t *s2;
343	uchar_t f;
344	int sz;
345	size_t i;
346	int ret_val;
347	boolean_t second;
348	boolean_t no_need_to_validate_entire;
349	boolean_t check_additional;
350	boolean_t validate_ucs2_range_only;
351
352	if (! u8str)
353		return (0);
354
355	ib = (uchar_t *)u8str;
356	ibtail = ib + n;
357
358	ret_val = 0;
359
360	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
361	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
362	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
363
364	while (ib < ibtail) {
365		/*
366		 * The first byte of a UTF-8 character tells how many
367		 * bytes will follow for the character. If the first byte
368		 * is an illegal byte value or out of range value, we just
369		 * return -1 with an appropriate error number.
370		 */
371		sz = u8_number_of_bytes[*ib];
372		if (sz == U8_ILLEGAL_CHAR) {
373			*errnum = EILSEQ;
374			return (-1);
375		}
376
377		if (sz == U8_OUT_OF_RANGE_CHAR ||
378		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
379			*errnum = ERANGE;
380			return (-1);
381		}
382
383		/*
384		 * If we don't have enough bytes to check on, that's also
385		 * an error. As you can see, we give illegal byte sequence
386		 * checking higher priority then EINVAL cases.
387		 */
388		if ((ibtail - ib) < sz) {
389			*errnum = EINVAL;
390			return (-1);
391		}
392
393		if (sz == 1) {
394			ib++;
395			ret_val++;
396		} else {
397			/*
398			 * Check on the multi-byte UTF-8 character. For more
399			 * details on this, see comment added for the used
400			 * data structures at the beginning of the file.
401			 */
402			f = *ib++;
403			ret_val++;
404			second = B_TRUE;
405			for (i = 1; i < sz; i++) {
406				if (second) {
407					if (*ib < u8_valid_min_2nd_byte[f] ||
408					    *ib > u8_valid_max_2nd_byte[f]) {
409						*errnum = EILSEQ;
410						return (-1);
411					}
412					second = B_FALSE;
413				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
414					*errnum = EILSEQ;
415					return (-1);
416				}
417				ib++;
418				ret_val++;
419			}
420		}
421
422		if (check_additional) {
423			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
424				s1 = ib - sz;
425				s2 = p[i];
426				while (s1 < ib) {
427					if (*s1 != *s2 || *s2 == '\0')
428						break;
429					s1++;
430					s2++;
431				}
432
433				if (s1 >= ib && *s2 == '\0') {
434					*errnum = EBADF;
435					return (-1);
436				}
437			}
438		}
439
440		if (no_need_to_validate_entire)
441			break;
442	}
443
444	return (ret_val);
445}
446
447/*
448 * The do_case_conv() looks at the mapping tables and returns found
449 * bytes if any. If not found, the input bytes are returned. The function
450 * always terminate the return bytes with a null character assuming that
451 * there are plenty of room to do so.
452 *
453 * The case conversions are simple case conversions mapping a character to
454 * another character as specified in the Unicode data. The byte size of
455 * the mapped character could be different from that of the input character.
456 *
457 * The return value is the byte length of the returned character excluding
458 * the terminating null byte.
459 */
460static size_t
461do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
462{
463	size_t i;
464	uint16_t b1 = 0;
465	uint16_t b2 = 0;
466	uint16_t b3 = 0;
467	uint16_t b3_tbl;
468	uint16_t b3_base;
469	uint16_t b4 = 0;
470	size_t start_id;
471	size_t end_id;
472
473	/*
474	 * At this point, the only possible values for sz are 2, 3, and 4.
475	 * The u8s should point to a vector that is well beyond the size of
476	 * 5 bytes.
477	 */
478	if (sz == 2) {
479		b3 = u8s[0] = s[0];
480		b4 = u8s[1] = s[1];
481	} else if (sz == 3) {
482		b2 = u8s[0] = s[0];
483		b3 = u8s[1] = s[1];
484		b4 = u8s[2] = s[2];
485	} else if (sz == 4) {
486		b1 = u8s[0] = s[0];
487		b2 = u8s[1] = s[1];
488		b3 = u8s[2] = s[2];
489		b4 = u8s[3] = s[3];
490	} else {
491		/* This is not possible but just in case as a fallback. */
492		if (is_it_toupper)
493			*u8s = U8_ASCII_TOUPPER(*s);
494		else
495			*u8s = U8_ASCII_TOLOWER(*s);
496		u8s[1] = '\0';
497
498		return (1);
499	}
500	u8s[sz] = '\0';
501
502	/*
503	 * Let's find out if we have a corresponding character.
504	 */
505	b1 = u8_common_b1_tbl[uv][b1];
506	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
507		return ((size_t)sz);
508
509	b2 = u8_case_common_b2_tbl[uv][b1][b2];
510	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
511		return ((size_t)sz);
512
513	if (is_it_toupper) {
514		b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
515		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
516			return ((size_t)sz);
517
518		start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
519		end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
520
521		/* Either there is no match or an error at the table. */
522		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
523			return ((size_t)sz);
524
525		b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
526
527		for (i = 0; start_id < end_id; start_id++)
528			u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
529	} else {
530		b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
531		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
532			return ((size_t)sz);
533
534		start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
535		end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
536
537		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
538			return ((size_t)sz);
539
540		b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
541
542		for (i = 0; start_id < end_id; start_id++)
543			u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
544	}
545
546	/*
547	 * If i is still zero, that means there is no corresponding character.
548	 */
549	if (i == 0)
550		return ((size_t)sz);
551
552	u8s[i] = '\0';
553
554	return (i);
555}
556
557/*
558 * The do_case_compare() function compares the two input strings, s1 and s2,
559 * one character at a time doing case conversions if applicable and return
560 * the comparison result as like strcmp().
561 *
562 * Since, in empirical sense, most of text data are 7-bit ASCII characters,
563 * we treat the 7-bit ASCII characters as a special case trying to yield
564 * faster processing time.
565 */
566static int
567do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
568	size_t n2, boolean_t is_it_toupper, int *errnum)
569{
570	int f;
571	int sz1;
572	int sz2;
573	size_t j;
574	size_t i1;
575	size_t i2;
576	uchar_t u8s1[U8_MB_CUR_MAX + 1];
577	uchar_t u8s2[U8_MB_CUR_MAX + 1];
578
579	i1 = i2 = 0;
580	while (i1 < n1 && i2 < n2) {
581		/*
582		 * Find out what would be the byte length for this UTF-8
583		 * character at string s1 and also find out if this is
584		 * an illegal start byte or not and if so, issue a proper
585		 * error number and yet treat this byte as a character.
586		 */
587		sz1 = u8_number_of_bytes[*s1];
588		if (sz1 < 0) {
589			*errnum = EILSEQ;
590			sz1 = 1;
591		}
592
593		/*
594		 * For 7-bit ASCII characters mainly, we do a quick case
595		 * conversion right at here.
596		 *
597		 * If we don't have enough bytes for this character, issue
598		 * an EINVAL error and use what are available.
599		 *
600		 * If we have enough bytes, find out if there is
601		 * a corresponding uppercase character and if so, copy over
602		 * the bytes for a comparison later. If there is no
603		 * corresponding uppercase character, then, use what we have
604		 * for the comparison.
605		 */
606		if (sz1 == 1) {
607			if (is_it_toupper)
608				u8s1[0] = U8_ASCII_TOUPPER(*s1);
609			else
610				u8s1[0] = U8_ASCII_TOLOWER(*s1);
611			s1++;
612			u8s1[1] = '\0';
613		} else if ((i1 + sz1) > n1) {
614			*errnum = EINVAL;
615			for (j = 0; (i1 + j) < n1; )
616				u8s1[j++] = *s1++;
617			u8s1[j] = '\0';
618		} else {
619			(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
620			s1 += sz1;
621		}
622
623		/* Do the same for the string s2. */
624		sz2 = u8_number_of_bytes[*s2];
625		if (sz2 < 0) {
626			*errnum = EILSEQ;
627			sz2 = 1;
628		}
629
630		if (sz2 == 1) {
631			if (is_it_toupper)
632				u8s2[0] = U8_ASCII_TOUPPER(*s2);
633			else
634				u8s2[0] = U8_ASCII_TOLOWER(*s2);
635			s2++;
636			u8s2[1] = '\0';
637		} else if ((i2 + sz2) > n2) {
638			*errnum = EINVAL;
639			for (j = 0; (i2 + j) < n2; )
640				u8s2[j++] = *s2++;
641			u8s2[j] = '\0';
642		} else {
643			(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
644			s2 += sz2;
645		}
646
647		/* Now compare the two characters. */
648		if (sz1 == 1 && sz2 == 1) {
649			if (*u8s1 > *u8s2)
650				return (1);
651			if (*u8s1 < *u8s2)
652				return (-1);
653		} else {
654			f = strcmp((const char *)u8s1, (const char *)u8s2);
655			if (f != 0)
656				return (f);
657		}
658
659		/*
660		 * They were the same. Let's move on to the next
661		 * characters then.
662		 */
663		i1 += sz1;
664		i2 += sz2;
665	}
666
667	/*
668	 * We compared until the end of either or both strings.
669	 *
670	 * If we reached to or went over the ends for the both, that means
671	 * they are the same.
672	 *
673	 * If we reached only one of the two ends, that means the other string
674	 * has something which then the fact can be used to determine
675	 * the return value.
676	 */
677	if (i1 >= n1) {
678		if (i2 >= n2)
679			return (0);
680		return (-1);
681	}
682	return (1);
683}
684
685/*
686 * The combining_class() function checks on the given bytes and find out
687 * the corresponding Unicode combining class value. The return value 0 means
688 * it is a Starter. Any illegal UTF-8 character will also be treated as
689 * a Starter.
690 */
691static uchar_t
692combining_class(size_t uv, uchar_t *s, size_t sz)
693{
694	uint16_t b1 = 0;
695	uint16_t b2 = 0;
696	uint16_t b3 = 0;
697	uint16_t b4 = 0;
698
699	if (sz == 1 || sz > 4)
700		return (0);
701
702	if (sz == 2) {
703		b3 = s[0];
704		b4 = s[1];
705	} else if (sz == 3) {
706		b2 = s[0];
707		b3 = s[1];
708		b4 = s[2];
709	} else if (sz == 4) {
710		b1 = s[0];
711		b2 = s[1];
712		b3 = s[2];
713		b4 = s[3];
714	}
715
716	b1 = u8_common_b1_tbl[uv][b1];
717	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
718		return (0);
719
720	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
721	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
722		return (0);
723
724	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
725	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
726		return (0);
727
728	return (u8_combining_class_b4_tbl[uv][b3][b4]);
729}
730
731/*
732 * The do_decomp() function finds out a matching decomposition if any
733 * and return. If there is no match, the input bytes are copied and returned.
734 * The function also checks if there is a Hangul, decomposes it if necessary
735 * and returns.
736 *
737 * To save time, a single byte 7-bit ASCII character should be handled by
738 * the caller.
739 *
740 * The function returns the number of bytes returned sans always terminating
741 * the null byte. It will also return a state that will tell if there was
742 * a Hangul character decomposed which then will be used by the caller.
743 */
744static size_t
745do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
746	boolean_t canonical_decomposition, u8_normalization_states_t *state)
747{
748	uint16_t b1 = 0;
749	uint16_t b2 = 0;
750	uint16_t b3 = 0;
751	uint16_t b3_tbl;
752	uint16_t b3_base;
753	uint16_t b4 = 0;
754	size_t start_id;
755	size_t end_id;
756	size_t i;
757	uint32_t u1;
758
759	if (sz == 2) {
760		b3 = u8s[0] = s[0];
761		b4 = u8s[1] = s[1];
762		u8s[2] = '\0';
763	} else if (sz == 3) {
764		/* Convert it to a Unicode scalar value. */
765		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
766
767		/*
768		 * If this is a Hangul syllable, we decompose it into
769		 * a leading consonant, a vowel, and an optional trailing
770		 * consonant and then return.
771		 */
772		if (U8_HANGUL_SYLLABLE(u1)) {
773			u1 -= U8_HANGUL_SYL_FIRST;
774
775			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
776			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
777			    / U8_HANGUL_T_COUNT;
778			b3 = u1 % U8_HANGUL_T_COUNT;
779
780			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
781			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
782			if (b3) {
783				b3 += U8_HANGUL_JAMO_T_FIRST;
784				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
785
786				u8s[9] = '\0';
787				*state = U8_STATE_HANGUL_LVT;
788				return (9);
789			}
790
791			u8s[6] = '\0';
792			*state = U8_STATE_HANGUL_LV;
793			return (6);
794		}
795
796		b2 = u8s[0] = s[0];
797		b3 = u8s[1] = s[1];
798		b4 = u8s[2] = s[2];
799		u8s[3] = '\0';
800
801		/*
802		 * If this is a Hangul Jamo, we know there is nothing
803		 * further that we can decompose.
804		 */
805		if (U8_HANGUL_JAMO_L(u1)) {
806			*state = U8_STATE_HANGUL_L;
807			return (3);
808		}
809
810		if (U8_HANGUL_JAMO_V(u1)) {
811			if (*state == U8_STATE_HANGUL_L)
812				*state = U8_STATE_HANGUL_LV;
813			else
814				*state = U8_STATE_HANGUL_V;
815			return (3);
816		}
817
818		if (U8_HANGUL_JAMO_T(u1)) {
819			if (*state == U8_STATE_HANGUL_LV)
820				*state = U8_STATE_HANGUL_LVT;
821			else
822				*state = U8_STATE_HANGUL_T;
823			return (3);
824		}
825	} else if (sz == 4) {
826		b1 = u8s[0] = s[0];
827		b2 = u8s[1] = s[1];
828		b3 = u8s[2] = s[2];
829		b4 = u8s[3] = s[3];
830		u8s[4] = '\0';
831	} else {
832		/*
833		 * This is a fallback and should not happen if the function
834		 * was called properly.
835		 */
836		u8s[0] = s[0];
837		u8s[1] = '\0';
838		*state = U8_STATE_START;
839		return (1);
840	}
841
842	/*
843	 * At this point, this rountine does not know what it would get.
844	 * The caller should sort it out if the state isn't a Hangul one.
845	 */
846	*state = U8_STATE_START;
847
848	/* Try to find matching decomposition mapping byte sequence. */
849	b1 = u8_common_b1_tbl[uv][b1];
850	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
851		return ((size_t)sz);
852
853	b2 = u8_decomp_b2_tbl[uv][b1][b2];
854	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
855		return ((size_t)sz);
856
857	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
858	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
859		return ((size_t)sz);
860
861	/*
862	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
863	 * which is 0x8000, this means we couldn't fit the mappings into
864	 * the cardinality of a unsigned byte.
865	 */
866	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
867		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
868		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
869		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
870	} else {
871		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
872		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
873	}
874
875	/* This also means there wasn't any matching decomposition. */
876	if (start_id >= end_id)
877		return ((size_t)sz);
878
879	/*
880	 * The final table for decomposition mappings has three types of
881	 * byte sequences depending on whether a mapping is for compatibility
882	 * decomposition, canonical decomposition, or both like the following:
883	 *
884	 * (1) Compatibility decomposition mappings:
885	 *
886	 *	+---+---+-...-+---+
887	 *	| B0| B1| ... | Bm|
888	 *	+---+---+-...-+---+
889	 *
890	 *	The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH).
891	 *
892	 * (2) Canonical decomposition mappings:
893	 *
894	 *	+---+---+---+-...-+---+
895	 *	| T | b0| b1| ... | bn|
896	 *	+---+---+---+-...-+---+
897	 *
898	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
899	 *
900	 * (3) Both mappings:
901	 *
902	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
903	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
904	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
905	 *
906	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
907	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
908	 *	compatibility mapping bytes.
909	 *
910	 * Note that compatibility decomposition means doing recursive
911	 * decompositions using both compatibility decomposition mappings and
912	 * canonical decomposition mappings. On the other hand, canonical
913	 * decomposition means doing recursive decompositions using only
914	 * canonical decomposition mappings. Since the table we have has gone
915	 * through the recursions already, we do not need to do so during
916	 * runtime, i.e., the table has been completely flattened out
917	 * already.
918	 */
919
920	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
921
922	/* Get the type, T, of the byte sequence. */
923	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
924
925	/*
926	 * If necessary, adjust start_id, end_id, or both. Note that if
927	 * this is compatibility decomposition mapping, there is no
928	 * adjustment.
929	 */
930	if (canonical_decomposition) {
931		/* Is the mapping only for compatibility decomposition? */
932		if (b1 < U8_DECOMP_BOTH)
933			return ((size_t)sz);
934
935		start_id++;
936
937		if (b1 == U8_DECOMP_BOTH) {
938			end_id = start_id +
939			    u8_decomp_final_tbl[uv][b3_base + start_id];
940			start_id++;
941		}
942	} else {
943		/*
944		 * Unless this is a compatibility decomposition mapping,
945		 * we adjust the start_id.
946		 */
947		if (b1 == U8_DECOMP_BOTH) {
948			start_id++;
949			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
950		} else if (b1 == U8_DECOMP_CANONICAL) {
951			start_id++;
952		}
953	}
954
955	for (i = 0; start_id < end_id; start_id++)
956		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
957	u8s[i] = '\0';
958
959	return (i);
960}
961
962/*
963 * The find_composition_start() function uses the character bytes given and
964 * find out the matching composition mappings if any and return the address
965 * to the composition mappings as explained in the do_composition().
966 */
967static uchar_t *
968find_composition_start(size_t uv, uchar_t *s, size_t sz)
969{
970	uint16_t b1 = 0;
971	uint16_t b2 = 0;
972	uint16_t b3 = 0;
973	uint16_t b3_tbl;
974	uint16_t b3_base;
975	uint16_t b4 = 0;
976	size_t start_id;
977	size_t end_id;
978
979	if (sz == 1) {
980		b4 = s[0];
981	} else if (sz == 2) {
982		b3 = s[0];
983		b4 = s[1];
984	} else if (sz == 3) {
985		b2 = s[0];
986		b3 = s[1];
987		b4 = s[2];
988	} else if (sz == 4) {
989		b1 = s[0];
990		b2 = s[1];
991		b3 = s[2];
992		b4 = s[3];
993	} else {
994		/*
995		 * This is a fallback and should not happen if the function
996		 * was called properly.
997		 */
998		return (NULL);
999	}
1000
1001	b1 = u8_composition_b1_tbl[uv][b1];
1002	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
1003		return (NULL);
1004
1005	b2 = u8_composition_b2_tbl[uv][b1][b2];
1006	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
1007		return (NULL);
1008
1009	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
1010	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
1011		return (NULL);
1012
1013	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
1014		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
1015		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
1016		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
1017	} else {
1018		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
1019		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
1020	}
1021
1022	if (start_id >= end_id)
1023		return (NULL);
1024
1025	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
1026
1027	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
1028}
1029
1030/*
1031 * The blocked() function checks on the combining class values of previous
1032 * characters in this sequence and return whether it is blocked or not.
1033 */
1034static boolean_t
1035blocked(uchar_t *comb_class, size_t last)
1036{
1037	uchar_t my_comb_class;
1038	size_t i;
1039
1040	my_comb_class = comb_class[last];
1041	for (i = 1; i < last; i++)
1042		if (comb_class[i] >= my_comb_class ||
1043		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
1044			return (B_TRUE);
1045
1046	return (B_FALSE);
1047}
1048
1049/*
1050 * The do_composition() reads the character string pointed by 's' and
1051 * do necessary canonical composition and then copy over the result back to
1052 * the 's'.
1053 *
1054 * The input argument 's' cannot contain more than 32 characters.
1055 */
1056static size_t
1057do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
1058	uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
1059{
1060	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
1061	uchar_t tc[U8_MB_CUR_MAX];
1062	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
1063	size_t saved_marks_count;
1064	uchar_t *p;
1065	uchar_t *saved_p;
1066	uchar_t *q;
1067	size_t i;
1068	size_t saved_i;
1069	size_t j;
1070	size_t k;
1071	size_t l;
1072	size_t C;
1073	size_t saved_l;
1074	size_t size;
1075	uint32_t u1;
1076	uint32_t u2;
1077	boolean_t match_not_found = B_TRUE;
1078
1079	/*
1080	 * This should never happen unless the callers are doing some strange
1081	 * and unexpected things.
1082	 *
1083	 * The "last" is the index pointing to the last character not last + 1.
1084	 */
1085	if (last >= U8_MAX_CHARS_A_SEQ)
1086		last = U8_UPPER_LIMIT_IN_A_SEQ;
1087
1088	for (i = l = 0; i <= last; i++) {
1089		/*
1090		 * The last or any non-Starters at the beginning, we don't
1091		 * have any chance to do composition and so we just copy them
1092		 * to the temporary buffer.
1093		 */
1094		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
1095SAVE_THE_CHAR:
1096			p = s + start[i];
1097			size = disp[i];
1098			for (k = 0; k < size; k++)
1099				t[l++] = *p++;
1100			continue;
1101		}
1102
1103		/*
1104		 * If this could be a start of Hangul Jamos, then, we try to
1105		 * conjoin them.
1106		 */
1107		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
1108			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
1109			    s[start[i] + 1], s[start[i] + 2]);
1110			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
1111			    s[start[i] + 4], s[start[i] + 5]);
1112
1113			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
1114				u1 -= U8_HANGUL_JAMO_L_FIRST;
1115				u2 -= U8_HANGUL_JAMO_V_FIRST;
1116				u1 = U8_HANGUL_SYL_FIRST +
1117				    (u1 * U8_HANGUL_V_COUNT + u2) *
1118				    U8_HANGUL_T_COUNT;
1119
1120				i += 2;
1121				if (i <= last) {
1122					U8_PUT_3BYTES_INTO_UTF32(u2,
1123					    s[start[i]], s[start[i] + 1],
1124					    s[start[i] + 2]);
1125
1126					if (U8_HANGUL_JAMO_T(u2)) {
1127						u1 += u2 -
1128						    U8_HANGUL_JAMO_T_FIRST;
1129						i++;
1130					}
1131				}
1132
1133				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
1134				i--;
1135				l += 3;
1136				continue;
1137			}
1138		}
1139
1140		/*
1141		 * Let's then find out if this Starter has composition
1142		 * mapping.
1143		 */
1144		p = find_composition_start(uv, s + start[i], disp[i]);
1145		if (p == NULL)
1146			goto SAVE_THE_CHAR;
1147
1148		/*
1149		 * We have a Starter with composition mapping and the next
1150		 * character is a non-Starter. Let's try to find out if
1151		 * we can do composition.
1152		 */
1153
1154		saved_p = p;
1155		saved_i = i;
1156		saved_l = l;
1157		saved_marks_count = 0;
1158
1159TRY_THE_NEXT_MARK:
1160		q = s + start[++i];
1161		size = disp[i];
1162
1163		/*
1164		 * The next for() loop compares the non-Starter pointed by
1165		 * 'q' with the possible (joinable) characters pointed by 'p'.
1166		 *
1167		 * The composition final table entry pointed by the 'p'
1168		 * looks like the following:
1169		 *
1170		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
1171		 * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
1172		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
1173		 *
1174		 * where C is the count byte indicating the number of
1175		 * mapping pairs where each pair would be look like
1176		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
1177		 * character of a canonical decomposition and the B0-Bm are
1178		 * the bytes of a matching composite character. The F is
1179		 * a filler byte after each character as the separator.
1180		 */
1181
1182		match_not_found = B_TRUE;
1183
1184		for (C = *p++; C > 0; C--) {
1185			for (k = 0; k < size; p++, k++)
1186				if (*p != q[k])
1187					break;
1188
1189			/* Have we found it? */
1190			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
1191				match_not_found = B_FALSE;
1192
1193				l = saved_l;
1194
1195				while (*++p != U8_TBL_ELEMENT_FILLER)
1196					t[l++] = *p;
1197
1198				break;
1199			}
1200
1201			/* We didn't find; skip to the next pair. */
1202			if (*p != U8_TBL_ELEMENT_FILLER)
1203				while (*++p != U8_TBL_ELEMENT_FILLER)
1204					;
1205			while (*++p != U8_TBL_ELEMENT_FILLER)
1206				;
1207			p++;
1208		}
1209
1210		/*
1211		 * If there was no match, we will need to save the combining
1212		 * mark for later appending. After that, if the next one
1213		 * is a non-Starter and not blocked, then, we try once
1214		 * again to do composition with the next non-Starter.
1215		 *
1216		 * If there was no match and this was a Starter, then,
1217		 * this is a new start.
1218		 *
1219		 * If there was a match and a composition done and we have
1220		 * more to check on, then, we retrieve a new composition final
1221		 * table entry for the composite and then try to do the
1222		 * composition again.
1223		 */
1224
1225		if (match_not_found) {
1226			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
1227				i--;
1228				goto SAVE_THE_CHAR;
1229			}
1230
1231			saved_marks[saved_marks_count++] = i;
1232		}
1233
1234		if (saved_l == l) {
1235			while (i < last) {
1236				if (blocked(comb_class, i + 1))
1237					saved_marks[saved_marks_count++] = ++i;
1238				else
1239					break;
1240			}
1241			if (i < last) {
1242				p = saved_p;
1243				goto TRY_THE_NEXT_MARK;
1244			}
1245		} else if (i < last) {
1246			p = find_composition_start(uv, t + saved_l,
1247			    l - saved_l);
1248			if (p != NULL) {
1249				saved_p = p;
1250				goto TRY_THE_NEXT_MARK;
1251			}
1252		}
1253
1254		/*
1255		 * There is no more composition possible.
1256		 *
1257		 * If there was no composition what so ever then we copy
1258		 * over the original Starter and then append any non-Starters
1259		 * remaining at the target string sequentially after that.
1260		 */
1261
1262		if (saved_l == l) {
1263			p = s + start[saved_i];
1264			size = disp[saved_i];
1265			for (j = 0; j < size; j++)
1266				t[l++] = *p++;
1267		}
1268
1269		for (k = 0; k < saved_marks_count; k++) {
1270			p = s + start[saved_marks[k]];
1271			size = disp[saved_marks[k]];
1272			for (j = 0; j < size; j++)
1273				t[l++] = *p++;
1274		}
1275	}
1276
1277	/*
1278	 * If the last character is a Starter and if we have a character
1279	 * (possibly another Starter) that can be turned into a composite,
1280	 * we do so and we do so until there is no more of composition
1281	 * possible.
1282	 */
1283	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
1284		p = *os;
1285		saved_l = l - disp[last];
1286
1287		while (p < oslast) {
1288			size = u8_number_of_bytes[*p];
1289			if (size <= 1 || (p + size) > oslast)
1290				break;
1291
1292			saved_p = p;
1293
1294			for (i = 0; i < size; i++)
1295				tc[i] = *p++;
1296
1297			q = find_composition_start(uv, t + saved_l,
1298			    l - saved_l);
1299			if (q == NULL) {
1300				p = saved_p;
1301				break;
1302			}
1303
1304			match_not_found = B_TRUE;
1305
1306			for (C = *q++; C > 0; C--) {
1307				for (k = 0; k < size; q++, k++)
1308					if (*q != tc[k])
1309						break;
1310
1311				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
1312					match_not_found = B_FALSE;
1313
1314					l = saved_l;
1315
1316					while (*++q != U8_TBL_ELEMENT_FILLER) {
1317						/*
1318						 * This is practically
1319						 * impossible but we don't
1320						 * want to take any chances.
1321						 */
1322						if (l >=
1323						    U8_STREAM_SAFE_TEXT_MAX) {
1324							p = saved_p;
1325							goto SAFE_RETURN;
1326						}
1327						t[l++] = *q;
1328					}
1329
1330					break;
1331				}
1332
1333				if (*q != U8_TBL_ELEMENT_FILLER)
1334					while (*++q != U8_TBL_ELEMENT_FILLER)
1335						;
1336				while (*++q != U8_TBL_ELEMENT_FILLER)
1337					;
1338				q++;
1339			}
1340
1341			if (match_not_found) {
1342				p = saved_p;
1343				break;
1344			}
1345		}
1346SAFE_RETURN:
1347		*os = p;
1348	}
1349
1350	/*
1351	 * Now we copy over the temporary string to the target string.
1352	 * Since composition always reduces the number of characters or
1353	 * the number of characters stay, we don't need to worry about
1354	 * the buffer overflow here.
1355	 */
1356	for (i = 0; i < l; i++)
1357		s[i] = t[i];
1358	s[l] = '\0';
1359
1360	return (l);
1361}
1362
/*
 * The collect_a_seq() function checks on the given string s, collects
 * a sequence of characters at u8s, and returns the sequence. While it
 * collects a sequence, it also applies case conversion, canonical or
 * compatibility decomposition, canonical composition, or some or all of
 * them and in that order.
 *
 * The collected sequence cannot be bigger than 32 characters since if
 * it is having more than 31 characters, the sequence will be terminated
 * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
 * a Stream-Safe Text. The collected sequence is always terminated with
 * a null byte and the return value is the byte length of the sequence,
 * which can be 0. The return value does not include the terminating
 * null byte.
 *
 * Arguments:
 *	uv		index selecting the Unicode version specific tables.
 *	u8s		output buffer that receives the collected sequence.
 *	source		in/out pointer to the current position in the input;
 *			updated to point past the bytes that were consumed.
 *	slast		end of the input (one past the last valid byte).
 *	is_it_toupper, is_it_tolower
 *			requested case conversion, if any.
 *	canonical_decomposition, compatibility_decomposition,
 *	canonical_composition
 *			requested normalization steps. Composition is done
 *			only if one of the decompositions is also requested.
 *	errnum		set to EILSEQ if the first byte is not a legal first
 *			byte of a character and to EINVAL if the first
 *			character is incomplete; not touched otherwise.
 *	state		normalization state that is updated as characters
 *			are collected.
 */
static size_t
collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
	boolean_t is_it_toupper,
	boolean_t is_it_tolower,
	boolean_t canonical_decomposition,
	boolean_t compatibility_decomposition,
	boolean_t canonical_composition,
	int *errnum, u8_normalization_states_t *state)
{
	uchar_t *s;
	int sz;
	int saved_sz;
	size_t i;
	size_t j;
	size_t k;
	size_t l;
	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
	uchar_t disp[U8_MAX_CHARS_A_SEQ];
	uchar_t start[U8_MAX_CHARS_A_SEQ];
	uchar_t u8t[U8_MB_CUR_MAX];
	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
	uchar_t tc;
	size_t last;
	size_t saved_last;
	uint32_t u1;

	/*
	 * Save the source string pointer which we will return a changed
	 * pointer if we do processing.
	 */
	s = *source;

	/*
	 * The following is a fallback for just in case callers are not
	 * checking the string boundaries before the calling.
	 */
	if (s >= slast) {
		u8s[0] = '\0';

		return (0);
	}

	/*
	 * As the first thing, let's collect a character and do case
	 * conversion if necessary.
	 */

	sz = u8_number_of_bytes[*s];

	/*
	 * A negative table entry marks a byte that cannot start a character;
	 * hand it back as a one byte sequence along with EILSEQ.
	 */
	if (sz < 0) {
		*errnum = EILSEQ;

		u8s[0] = *s++;
		u8s[1] = '\0';

		*source = s;

		return (1);
	}

	if (sz == 1) {
		if (is_it_toupper)
			u8s[0] = U8_ASCII_TOUPPER(*s);
		else if (is_it_tolower)
			u8s[0] = U8_ASCII_TOLOWER(*s);
		else
			u8s[0] = *s;
		s++;
		u8s[1] = '\0';
	} else if ((s + sz) > slast) {
		/*
		 * The character would run past the end of the input; return
		 * the remaining bytes as they are along with EINVAL.
		 */
		*errnum = EINVAL;

		for (i = 0; s < slast; )
			u8s[i++] = *s++;
		u8s[i] = '\0';

		*source = s;

		return (i);
	} else {
		if (is_it_toupper || is_it_tolower) {
			/* From here on, sz is the size of the converted one. */
			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
			s += sz;
			sz = i;
		} else {
			for (i = 0; i < sz; )
				u8s[i++] = *s++;
			u8s[i] = '\0';
		}
	}

	/*
	 * And then canonical/compatibility decomposition followed by
	 * an optional canonical composition. Please be noted that
	 * canonical composition is done only when a decomposition is
	 * done.
	 *
	 * While collecting, comb_class[], start[], and disp[] record
	 * the combining class, the byte offset in u8s, and the byte length
	 * of each collected character, 'last' counts the collected
	 * characters, and saved_sz is the number of bytes in u8s so far.
	 */
	if (canonical_decomposition || compatibility_decomposition) {
		if (sz == 1) {
			/* A single byte character is taken as it is. */
			*state = U8_STATE_START;

			saved_sz = 1;

			comb_class[0] = 0;
			start[0] = 0;
			disp[0] = 1;

			last = 1;
		} else {
			/* Decompose in place and index the resulting chars. */
			saved_sz = do_decomp(uv, u8s, u8s, sz,
			    canonical_decomposition, state);

			last = 0;

			for (i = 0; i < saved_sz; ) {
				sz = u8_number_of_bytes[u8s[i]];

				comb_class[last] = combining_class(uv,
				    u8s + i, sz);
				start[last] = i;
				disp[last] = sz;

				last++;
				i += sz;
			}

			/*
			 * Decomposition yields various Hangul related
			 * states but not on combining marks. We need to
			 * find out at here by checking on the last
			 * character.
			 */
			if (*state == U8_STATE_START) {
				if (comb_class[last - 1])
					*state = U8_STATE_COMBINING_MARK;
			}
		}

		/* Number of characters that came from the first character. */
		saved_last = last;

		while (s < slast) {
			sz = u8_number_of_bytes[*s];

			/*
			 * If this is an illegal character, an incomplete
			 * character, or an 7-bit ASCII Starter character,
			 * then we have collected a sequence; break and let
			 * the next call deal with the two cases.
			 *
			 * Note that this is okay only if you are using this
			 * function with a fixed length string, not on
			 * a buffer with multiple calls of one chunk at a time.
			 */
			if (sz <= 1) {
				break;
			} else if ((s + sz) > slast) {
				break;
			} else {
				/*
				 * If the previous character was a Hangul Jamo
				 * and this character is a Hangul Jamo that
				 * can be conjoined, we collect the Jamo.
				 */
				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
					U8_PUT_3BYTES_INTO_UTF32(u1,
					    *s, *(s + 1), *(s + 2));

					/*
					 * i is cleared since it is what gets
					 * recorded as the combining class
					 * of the collected Jamo below.
					 */
					if (U8_HANGUL_COMPOSABLE_L_V(*state,
					    u1)) {
						i = 0;
						*state = U8_STATE_HANGUL_LV;
						goto COLLECT_A_HANGUL;
					}

					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
					    u1)) {
						i = 0;
						*state = U8_STATE_HANGUL_LVT;
						goto COLLECT_A_HANGUL;
					}
				}

				/*
				 * Regardless of whatever it was, if this is
				 * a Starter, we don't collect the character
				 * since that's a new start and we will deal
				 * with it at the next time.
				 */
				i = combining_class(uv, s, sz);
				if (i == U8_COMBINING_CLASS_STARTER)
					break;

				/*
				 * We know the current character is a combining
				 * mark. If the previous character wasn't
				 * a Starter (not Hangul) or a combining mark,
				 * then, we don't collect this combining mark.
				 */
				if (*state != U8_STATE_START &&
				    *state != U8_STATE_COMBINING_MARK)
					break;

				*state = U8_STATE_COMBINING_MARK;
COLLECT_A_HANGUL:
				/*
				 * If we collected a Starter and combining
				 * marks up to 30, i.e., total 31 characters,
				 * then, we terminate this degenerately long
				 * combining sequence with a U+034F COMBINING
				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
				 * UTF-8 and turn this into a Stream-Safe
				 * Text. This will be extremely rare but
				 * possible.
				 *
				 * The following will also guarantee that
				 * we are not writing more than 32 characters
				 * plus a NULL at u8s[].
				 */
				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
TURN_STREAM_SAFE:
					*state = U8_STATE_START;
					comb_class[last] = 0;
					start[last] = saved_sz;
					disp[last] = 2;
					last++;

					u8s[saved_sz++] = 0xCD;
					u8s[saved_sz++] = 0x8F;

					break;
				}

				/*
				 * Some combining marks also do decompose into
				 * another combining mark or marks.
				 */
				if (*state == U8_STATE_COMBINING_MARK) {
					/*
					 * Remember the character count (k) so
					 * that it can be rolled back if the
					 * decomposition does not fit, and
					 * the source size (l) to advance s.
					 */
					k = last;
					l = sz;
					i = do_decomp(uv, uts, s, sz,
					    canonical_decomposition, state);
					for (j = 0; j < i; ) {
						sz = u8_number_of_bytes[uts[j]];

						comb_class[last] =
						    combining_class(uv,
						    uts + j, sz);
						start[last] = saved_sz + j;
						disp[last] = sz;

						last++;
						if (last >=
						    U8_UPPER_LIMIT_IN_A_SEQ) {
							last = k;
							goto TURN_STREAM_SAFE;
						}
						j += sz;
					}

					*state = U8_STATE_COMBINING_MARK;
					sz = i;
					s += l;

					for (i = 0; i < sz; i++)
						u8s[saved_sz++] = uts[i];
				} else {
					/* A conjoining Hangul Jamo; copy it. */
					comb_class[last] = i;
					start[last] = saved_sz;
					disp[last] = sz;
					last++;

					for (i = 0; i < sz; i++)
						u8s[saved_sz++] = *s++;
				}

				/*
				 * If this is U+0345 COMBINING GREEK
				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
				 * iota subscript, and need to be converted to
				 * uppercase letter, convert it to U+0399 GREEK
				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
				 * i.e., convert to capital adscript form as
				 * specified in the Unicode standard.
				 *
				 * This is the only special case of (ambiguous)
				 * case conversion at combining marks and
				 * probably the standard will never have
				 * anything similar like this in future.
				 */
				if (is_it_toupper && sz >= 2 &&
				    u8s[saved_sz - 2] == 0xCD &&
				    u8s[saved_sz - 1] == 0x85) {
					u8s[saved_sz - 2] = 0xCE;
					u8s[saved_sz - 1] = 0x99;
				}
			}
		}

		/*
		 * Let's try to ensure a canonical ordering for the collected
		 * combining marks. We do this only if we have collected
		 * at least one more non-Starter. (The decomposition mapping
		 * data tables have fully (and recursively) expanded and
		 * canonically ordered decompositions.)
		 *
		 * The U8_SWAP_COMB_MARKS() convenience macro has some
		 * assumptions and we are meeting the assumptions.
		 *
		 * From here on, 'last' is the index of the last character
		 * rather than the number of characters.
		 */
		last--;
		if (last >= saved_last) {
			for (i = 0; i < last; i++)
				for (j = last; j > i; j--)
					if (comb_class[j] &&
					    comb_class[j - 1] > comb_class[j]) {
						U8_SWAP_COMB_MARKS(j - 1, j);
					}
		}

		*source = s;

		if (! canonical_composition) {
			u8s[saved_sz] = '\0';
			return (saved_sz);
		}

		/*
		 * Now do the canonical composition. Note that we do this
		 * only after a canonical or compatibility decomposition to
		 * finish up NFC or NFKC.
		 *
		 * The source pointer is also passed since do_composition()
		 * may advance it.
		 */
		sz = do_composition(uv, u8s, comb_class, start, disp, last,
		    &s, slast);
	}

	*source = s;

	return ((size_t)sz);
}
1716
1717/*
1718 * The do_norm_compare() function does string comparion based on Unicode
1719 * simple case mappings and Unicode Normalization definitions.
1720 *
1721 * It does so by collecting a sequence of character at a time and comparing
1722 * the collected sequences from the strings.
1723 *
1724 * The meanings on the return values are the same as the usual strcmp().
1725 */
1726static int
1727do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
1728	int flag, int *errnum)
1729{
1730	int result;
1731	size_t sz1;
1732	size_t sz2;
1733	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
1734	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
1735	uchar_t *s1last;
1736	uchar_t *s2last;
1737	boolean_t is_it_toupper;
1738	boolean_t is_it_tolower;
1739	boolean_t canonical_decomposition;
1740	boolean_t compatibility_decomposition;
1741	boolean_t canonical_composition;
1742	u8_normalization_states_t state;
1743
1744	s1last = s1 + n1;
1745	s2last = s2 + n2;
1746
1747	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1748	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1749	canonical_decomposition = flag & U8_CANON_DECOMP;
1750	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
1751	canonical_composition = flag & U8_CANON_COMP;
1752
1753	while (s1 < s1last && s2 < s2last) {
1754		/*
1755		 * If the current character is a 7-bit ASCII and the last
1756		 * character, or, if the current character and the next
1757		 * character are both some 7-bit ASCII characters then
1758		 * we treat the current character as a sequence.
1759		 *
1760		 * In any other cases, we need to call collect_a_seq().
1761		 */
1762
1763		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
1764		    ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
1765			if (is_it_toupper)
1766				u8s1[0] = U8_ASCII_TOUPPER(*s1);
1767			else if (is_it_tolower)
1768				u8s1[0] = U8_ASCII_TOLOWER(*s1);
1769			else
1770				u8s1[0] = *s1;
1771			u8s1[1] = '\0';
1772			sz1 = 1;
1773			s1++;
1774		} else {
1775			state = U8_STATE_START;
1776			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
1777			    is_it_toupper, is_it_tolower,
1778			    canonical_decomposition,
1779			    compatibility_decomposition,
1780			    canonical_composition, errnum, &state);
1781		}
1782
1783		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
1784		    ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
1785			if (is_it_toupper)
1786				u8s2[0] = U8_ASCII_TOUPPER(*s2);
1787			else if (is_it_tolower)
1788				u8s2[0] = U8_ASCII_TOLOWER(*s2);
1789			else
1790				u8s2[0] = *s2;
1791			u8s2[1] = '\0';
1792			sz2 = 1;
1793			s2++;
1794		} else {
1795			state = U8_STATE_START;
1796			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
1797			    is_it_toupper, is_it_tolower,
1798			    canonical_decomposition,
1799			    compatibility_decomposition,
1800			    canonical_composition, errnum, &state);
1801		}
1802
1803		/*
1804		 * Now compare the two characters. If they are the same,
1805		 * we move on to the next character sequences.
1806		 */
1807		if (sz1 == 1 && sz2 == 1) {
1808			if (*u8s1 > *u8s2)
1809				return (1);
1810			if (*u8s1 < *u8s2)
1811				return (-1);
1812		} else {
1813			result = strcmp((const char *)u8s1, (const char *)u8s2);
1814			if (result != 0)
1815				return (result);
1816		}
1817	}
1818
1819	/*
1820	 * We compared until the end of either or both strings.
1821	 *
1822	 * If we reached to or went over the ends for the both, that means
1823	 * they are the same.
1824	 *
1825	 * If we reached only one end, that means the other string has
1826	 * something which then can be used to determine the return value.
1827	 */
1828	if (s1 >= s1last) {
1829		if (s2 >= s2last)
1830			return (0);
1831		return (-1);
1832	}
1833	return (1);
1834}
1835
1836/*
1837 * The u8_strcmp() function compares two UTF-8 strings quite similar to
1838 * the strcmp(). For the comparison, however, Unicode Normalization specific
1839 * equivalency and Unicode simple case conversion mappings based equivalency
1840 * can be requested and checked against.
1841 */
1842int
1843u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
1844		int *errnum)
1845{
1846	int f;
1847	size_t n1;
1848	size_t n2;
1849
1850	*errnum = 0;
1851
1852	/*
1853	 * Check on the requested Unicode version, case conversion, and
1854	 * normalization flag values.
1855	 */
1856
1857	if (uv > U8_UNICODE_LATEST) {
1858		*errnum = ERANGE;
1859		uv = U8_UNICODE_LATEST;
1860	}
1861
1862	if (flag == 0) {
1863		flag = U8_STRCMP_CS;
1864	} else {
1865		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
1866		    U8_STRCMP_CI_LOWER);
1867		if (f == 0) {
1868			flag |= U8_STRCMP_CS;
1869		} else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
1870		    f != U8_STRCMP_CI_LOWER) {
1871			*errnum = EBADF;
1872			flag = U8_STRCMP_CS;
1873		}
1874
1875		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1876		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
1877		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
1878			*errnum = EBADF;
1879			flag = U8_STRCMP_CS;
1880		}
1881	}
1882
1883	if (flag == U8_STRCMP_CS) {
1884		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
1885	}
1886
1887	n1 = strlen(s1);
1888	n2 = strlen(s2);
1889	if (n != 0) {
1890		if (n < n1)
1891			n1 = n;
1892		if (n < n2)
1893			n2 = n;
1894	}
1895
1896	/*
1897	 * Simple case conversion can be done much faster and so we do
1898	 * them separately here.
1899	 */
1900	if (flag == U8_STRCMP_CI_UPPER) {
1901		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1902		    n1, n2, B_TRUE, errnum));
1903	} else if (flag == U8_STRCMP_CI_LOWER) {
1904		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1905		    n1, n2, B_FALSE, errnum));
1906	}
1907
1908	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
1909	    flag, errnum));
1910}
1911
1912size_t
1913u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
1914	int flag, size_t unicode_version, int *errnum)
1915{
1916	int f;
1917	int sz;
1918	uchar_t *ib;
1919	uchar_t *ibtail;
1920	uchar_t *ob;
1921	uchar_t *obtail;
1922	boolean_t do_not_ignore_null;
1923	boolean_t do_not_ignore_invalid;
1924	boolean_t is_it_toupper;
1925	boolean_t is_it_tolower;
1926	boolean_t canonical_decomposition;
1927	boolean_t compatibility_decomposition;
1928	boolean_t canonical_composition;
1929	size_t ret_val;
1930	size_t i;
1931	size_t j;
1932	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
1933	u8_normalization_states_t state;
1934
1935	if (unicode_version > U8_UNICODE_LATEST) {
1936		*errnum = ERANGE;
1937		return ((size_t)-1);
1938	}
1939
1940	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
1941	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
1942		*errnum = EBADF;
1943		return ((size_t)-1);
1944	}
1945
1946	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1947	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
1948	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
1949		*errnum = EBADF;
1950		return ((size_t)-1);
1951	}
1952
1953	if (inarray == NULL || *inlen == 0)
1954		return (0);
1955
1956	if (outarray == NULL) {
1957		*errnum = E2BIG;
1958		return ((size_t)-1);
1959	}
1960
1961	ib = (uchar_t *)inarray;
1962	ob = (uchar_t *)outarray;
1963	ibtail = ib + *inlen;
1964	obtail = ob + *outlen;
1965
1966	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
1967	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
1968	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1969	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1970
1971	ret_val = 0;
1972
1973	/*
1974	 * If we don't have a normalization flag set, we do the simple case
1975	 * conversion based text preparation separately below. Text
1976	 * preparation involving Normalization will be done in the false task
1977	 * block, again, separately since it will take much more time and
1978	 * resource than doing simple case conversions.
1979	 */
1980	if (f == 0) {
1981		while (ib < ibtail) {
1982			if (*ib == '\0' && do_not_ignore_null)
1983				break;
1984
1985			sz = u8_number_of_bytes[*ib];
1986
1987			if (sz < 0) {
1988				if (do_not_ignore_invalid) {
1989					*errnum = EILSEQ;
1990					ret_val = (size_t)-1;
1991					break;
1992				}
1993
1994				sz = 1;
1995				ret_val++;
1996			}
1997
1998			if (sz == 1) {
1999				if (ob >= obtail) {
2000					*errnum = E2BIG;
2001					ret_val = (size_t)-1;
2002					break;
2003				}
2004
2005				if (is_it_toupper)
2006					*ob = U8_ASCII_TOUPPER(*ib);
2007				else if (is_it_tolower)
2008					*ob = U8_ASCII_TOLOWER(*ib);
2009				else
2010					*ob = *ib;
2011				ib++;
2012				ob++;
2013			} else if ((ib + sz) > ibtail) {
2014				if (do_not_ignore_invalid) {
2015					*errnum = EINVAL;
2016					ret_val = (size_t)-1;
2017					break;
2018				}
2019
2020				if ((obtail - ob) < (ibtail - ib)) {
2021					*errnum = E2BIG;
2022					ret_val = (size_t)-1;
2023					break;
2024				}
2025
2026				/*
2027				 * We treat the remaining incomplete character
2028				 * bytes as a character.
2029				 */
2030				ret_val++;
2031
2032				while (ib < ibtail)
2033					*ob++ = *ib++;
2034			} else {
2035				if (is_it_toupper || is_it_tolower) {
2036					i = do_case_conv(unicode_version, u8s,
2037					    ib, sz, is_it_toupper);
2038
2039					if ((obtail - ob) < i) {
2040						*errnum = E2BIG;
2041						ret_val = (size_t)-1;
2042						break;
2043					}
2044
2045					ib += sz;
2046
2047					for (sz = 0; sz < i; sz++)
2048						*ob++ = u8s[sz];
2049				} else {
2050					if ((obtail - ob) < sz) {
2051						*errnum = E2BIG;
2052						ret_val = (size_t)-1;
2053						break;
2054					}
2055
2056					for (i = 0; i < sz; i++)
2057						*ob++ = *ib++;
2058				}
2059			}
2060		}
2061	} else {
2062		canonical_decomposition = flag & U8_CANON_DECOMP;
2063		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
2064		canonical_composition = flag & U8_CANON_COMP;
2065
2066		while (ib < ibtail) {
2067			if (*ib == '\0' && do_not_ignore_null)
2068				break;
2069
2070			/*
2071			 * If the current character is a 7-bit ASCII
2072			 * character and it is the last character, or,
2073			 * if the current character is a 7-bit ASCII
2074			 * character and the next character is also a 7-bit
2075			 * ASCII character, then, we copy over this
2076			 * character without going through collect_a_seq().
2077			 *
2078			 * In any other cases, we need to look further with
2079			 * the collect_a_seq() function.
2080			 */
2081			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
2082			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
2083				if (ob >= obtail) {
2084					*errnum = E2BIG;
2085					ret_val = (size_t)-1;
2086					break;
2087				}
2088
2089				if (is_it_toupper)
2090					*ob = U8_ASCII_TOUPPER(*ib);
2091				else if (is_it_tolower)
2092					*ob = U8_ASCII_TOLOWER(*ib);
2093				else
2094					*ob = *ib;
2095				ib++;
2096				ob++;
2097			} else {
2098				*errnum = 0;
2099				state = U8_STATE_START;
2100
2101				j = collect_a_seq(unicode_version, u8s,
2102				    &ib, ibtail,
2103				    is_it_toupper,
2104				    is_it_tolower,
2105				    canonical_decomposition,
2106				    compatibility_decomposition,
2107				    canonical_composition,
2108				    errnum, &state);
2109
2110				if (*errnum && do_not_ignore_invalid) {
2111					ret_val = (size_t)-1;
2112					break;
2113				}
2114
2115				if ((obtail - ob) < j) {
2116					*errnum = E2BIG;
2117					ret_val = (size_t)-1;
2118					break;
2119				}
2120
2121				for (i = 0; i < j; i++)
2122					*ob++ = u8s[i];
2123			}
2124		}
2125	}
2126
2127	*inlen = ibtail - ib;
2128	*outlen = obtail - ob;
2129
2130	return (ret_val);
2131}
2132