1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26#pragma ident "%Z%%M% %I% %E% SMI" 27 28 29/* 30 * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458). 31 * 32 * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F), 33 * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also 34 * the section 3C man pages. 35 * Interface stability: Committed. 36 */ 37 38#include <sys/types.h> 39#ifdef _KERNEL 40#include <sys/param.h> 41#include <sys/sysmacros.h> 42#include <sys/systm.h> 43#include <sys/debug.h> 44#include <sys/kmem.h> 45#include <sys/sunddi.h> 46#else 47#include <strings.h> 48#endif /* _KERNEL */ 49#include <sys/byteorder.h> 50#include <sys/errno.h> 51#include <sys/u8_textprep.h> 52#include <sys/u8_textprep_data.h> 53 54 55/* The maximum possible number of bytes in a UTF-8 character. */ 56#define U8_MB_CUR_MAX (4) 57 58/* 59 * The maximum number of bytes needed for a UTF-8 character to cover 60 * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2. 
61 */ 62#define U8_MAX_BYTES_UCS2 (3) 63 64/* The maximum possible number of bytes in a Stream-Safe Text. */ 65#define U8_STREAM_SAFE_TEXT_MAX (128) 66 67/* 68 * The maximum number of characters in a combining/conjoining sequence and 69 * the actual upperbound limit of a combining/conjoining sequence. 70 */ 71#define U8_MAX_CHARS_A_SEQ (32) 72#define U8_UPPER_LIMIT_IN_A_SEQ (31) 73 74/* The combining class value for Starter. */ 75#define U8_COMBINING_CLASS_STARTER (0) 76 77/* 78 * Some Hangul related macros at below. 79 * 80 * The first and the last of Hangul syllables, Hangul Jamo Leading consonants, 81 * Vowels, and optional Trailing consonants in Unicode scalar values. 82 * 83 * Please be noted that the U8_HANGUL_JAMO_T_FIRST is 0x11A7 at below not 84 * the actual U+11A8. This is due to that the trailing consonant is optional 85 * and thus we are doing a pre-calculation of subtracting one. 86 * 87 * Each of 19 modern leading consonants has total 588 possible syllables since 88 * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for 89 * no trailing consonant case, i.e., 21 x 28 = 588. 90 * 91 * We also have bunch of Hangul related macros at below. Please bear in mind 92 * that the U8_HANGUL_JAMO_1ST_BYTE can be used to check whether it is 93 * a Hangul Jamo or not but the value does not guarantee that it is a Hangul 94 * Jamo; it just guarantee that it will be most likely. 
95 */ 96#define U8_HANGUL_SYL_FIRST (0xAC00U) 97#define U8_HANGUL_SYL_LAST (0xD7A3U) 98 99#define U8_HANGUL_JAMO_L_FIRST (0x1100U) 100#define U8_HANGUL_JAMO_L_LAST (0x1112U) 101#define U8_HANGUL_JAMO_V_FIRST (0x1161U) 102#define U8_HANGUL_JAMO_V_LAST (0x1175U) 103#define U8_HANGUL_JAMO_T_FIRST (0x11A7U) 104#define U8_HANGUL_JAMO_T_LAST (0x11C2U) 105 106#define U8_HANGUL_V_COUNT (21) 107#define U8_HANGUL_VT_COUNT (588) 108#define U8_HANGUL_T_COUNT (28) 109 110#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U) 111 112#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \ 113 (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \ 114 (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \ 115 (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); 116 117#define U8_HANGUL_JAMO_L(u) \ 118 ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST) 119 120#define U8_HANGUL_JAMO_V(u) \ 121 ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST) 122 123#define U8_HANGUL_JAMO_T(u) \ 124 ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) 125 126#define U8_HANGUL_JAMO(u) \ 127 ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) 128 129#define U8_HANGUL_SYLLABLE(u) \ 130 ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST) 131 132#define U8_HANGUL_COMPOSABLE_L_V(s, u) \ 133 ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u))) 134 135#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \ 136 ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u))) 137 138/* The types of decomposition mappings. */ 139#define U8_DECOMP_BOTH (0xF5U) 140#define U8_DECOMP_CANONICAL (0xF6U) 141 142/* The indicator for 16-bit table. */ 143#define U8_16BIT_TABLE_INDICATOR (0x8000U) 144 145/* The following are some convenience macros. 
*/ 146#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \ 147 (u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \ 148 (uint32_t)(b3) & 0x3F; 149 150#define U8_SIMPLE_SWAP(a, b, t) \ 151 (t) = (a); \ 152 (a) = (b); \ 153 (b) = (t); 154 155#define U8_ASCII_TOUPPER(c) \ 156 (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c)) 157 158#define U8_ASCII_TOLOWER(c) \ 159 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c)) 160 161#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U) 162/* 163 * The following macro assumes that the two characters that are to be 164 * swapped are adjacent to each other and 'a' comes before 'b'. 165 * 166 * If the assumptions are not met, then, the macro will fail. 167 */ 168#define U8_SWAP_COMB_MARKS(a, b) \ 169 for (k = 0; k < disp[(a)]; k++) \ 170 u8t[k] = u8s[start[(a)] + k]; \ 171 for (k = 0; k < disp[(b)]; k++) \ 172 u8s[start[(a)] + k] = u8s[start[(b)] + k]; \ 173 start[(b)] = start[(a)] + disp[(b)]; \ 174 for (k = 0; k < disp[(a)]; k++) \ 175 u8s[start[(b)] + k] = u8t[k]; \ 176 U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \ 177 U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); 178 179/* The possible states during normalization. */ 180typedef enum { 181 U8_STATE_START = 0, 182 U8_STATE_HANGUL_L = 1, 183 U8_STATE_HANGUL_LV = 2, 184 U8_STATE_HANGUL_LVT = 3, 185 U8_STATE_HANGUL_V = 4, 186 U8_STATE_HANGUL_T = 5, 187 U8_STATE_COMBINING_MARK = 6 188} u8_normalization_states_t; 189 190/* 191 * The three vectors at below are used to check bytes of a given UTF-8 192 * character are valid and not containing any malformed byte values. 193 * 194 * We used to have a quite relaxed UTF-8 binary representation but then there 195 * was some security related issues and so the Unicode Consortium defined 196 * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it 197 * one more time at the Unicode 3.2. The following three tables are based on 198 * that. 
199 */ 200 201#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF) 202 203#define I_ U8_ILLEGAL_CHAR 204#define O_ U8_OUT_OF_RANGE_CHAR 205 206const int8_t u8_number_of_bytes[0x100] = { 207 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 208 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 209 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 210 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 211 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 212 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 213 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 214 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 215 216/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */ 217 I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, 218 219/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */ 220 I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, 221 222/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */ 223 I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, 224 225/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */ 226 I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, 227 228/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ 229 I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 230 231/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ 232 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 233 234/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ 235 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 236 237/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ 238 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, 239}; 240 241#undef I_ 242#undef O_ 243 244const uint8_t u8_valid_min_2nd_byte[0x100] = { 245 0, 0, 0, 0, 0, 0, 0, 0, 246 0, 0, 0, 0, 0, 0, 0, 0, 247 0, 0, 0, 0, 0, 0, 0, 0, 248 0, 0, 0, 0, 0, 0, 0, 0, 249 0, 0, 0, 0, 0, 0, 0, 0, 250 0, 0, 0, 0, 0, 0, 0, 0, 251 0, 0, 0, 0, 0, 0, 0, 0, 252 0, 0, 0, 0, 0, 0, 0, 0, 253 0, 0, 0, 0, 0, 0, 0, 0, 254 0, 0, 0, 0, 0, 0, 0, 0, 255 0, 0, 0, 0, 0, 0, 0, 0, 256 
0, 0, 0, 0, 0, 0, 0, 0, 257 0, 0, 0, 0, 0, 0, 0, 0, 258 0, 0, 0, 0, 0, 0, 0, 0, 259 0, 0, 0, 0, 0, 0, 0, 0, 260 0, 0, 0, 0, 0, 0, 0, 0, 261 0, 0, 0, 0, 0, 0, 0, 0, 262 0, 0, 0, 0, 0, 0, 0, 0, 263 0, 0, 0, 0, 0, 0, 0, 0, 264 0, 0, 0, 0, 0, 0, 0, 0, 265 0, 0, 0, 0, 0, 0, 0, 0, 266 0, 0, 0, 0, 0, 0, 0, 0, 267 0, 0, 0, 0, 0, 0, 0, 0, 268 0, 0, 0, 0, 0, 0, 0, 0, 269/* C0 C1 C2 C3 C4 C5 C6 C7 */ 270 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 271/* C8 C9 CA CB CC CD CE CF */ 272 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 273/* D0 D1 D2 D3 D4 D5 D6 D7 */ 274 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 275/* D8 D9 DA DB DC DD DE DF */ 276 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 277/* E0 E1 E2 E3 E4 E5 E6 E7 */ 278 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 279/* E8 E9 EA EB EC ED EE EF */ 280 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 281/* F0 F1 F2 F3 F4 F5 F6 F7 */ 282 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, 283 0, 0, 0, 0, 0, 0, 0, 0, 284}; 285 286const uint8_t u8_valid_max_2nd_byte[0x100] = { 287 0, 0, 0, 0, 0, 0, 0, 0, 288 0, 0, 0, 0, 0, 0, 0, 0, 289 0, 0, 0, 0, 0, 0, 0, 0, 290 0, 0, 0, 0, 0, 0, 0, 0, 291 0, 0, 0, 0, 0, 0, 0, 0, 292 0, 0, 0, 0, 0, 0, 0, 0, 293 0, 0, 0, 0, 0, 0, 0, 0, 294 0, 0, 0, 0, 0, 0, 0, 0, 295 0, 0, 0, 0, 0, 0, 0, 0, 296 0, 0, 0, 0, 0, 0, 0, 0, 297 0, 0, 0, 0, 0, 0, 0, 0, 298 0, 0, 0, 0, 0, 0, 0, 0, 299 0, 0, 0, 0, 0, 0, 0, 0, 300 0, 0, 0, 0, 0, 0, 0, 0, 301 0, 0, 0, 0, 0, 0, 0, 0, 302 0, 0, 0, 0, 0, 0, 0, 0, 303 0, 0, 0, 0, 0, 0, 0, 0, 304 0, 0, 0, 0, 0, 0, 0, 0, 305 0, 0, 0, 0, 0, 0, 0, 0, 306 0, 0, 0, 0, 0, 0, 0, 0, 307 0, 0, 0, 0, 0, 0, 0, 0, 308 0, 0, 0, 0, 0, 0, 0, 0, 309 0, 0, 0, 0, 0, 0, 0, 0, 310 0, 0, 0, 0, 0, 0, 0, 0, 311/* C0 C1 C2 C3 C4 C5 C6 C7 */ 312 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 313/* C8 C9 CA CB CC CD CE CF */ 314 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 315/* D0 D1 D2 D3 D4 D5 D6 D7 */ 316 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 317/* D8 D9 DA DB DC DD DE DF */ 318 0xbf, 0xbf, 
0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 319/* E0 E1 E2 E3 E4 E5 E6 E7 */ 320 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 321/* E8 E9 EA EB EC ED EE EF */ 322 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, 323/* F0 F1 F2 F3 F4 F5 F6 F7 */ 324 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, 325 0, 0, 0, 0, 0, 0, 0, 0, 326}; 327 328 329/* 330 * The u8_validate() validates on the given UTF-8 character string and 331 * calculate the byte length. It is quite similar to mblen(3C) except that 332 * this will validate against the list of characters if required and 333 * specific to UTF-8 and Unicode. 334 */ 335int 336u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum) 337{ 338 uchar_t *ib; 339 uchar_t *ibtail; 340 uchar_t **p; 341 uchar_t *s1; 342 uchar_t *s2; 343 uchar_t f; 344 int sz; 345 size_t i; 346 int ret_val; 347 boolean_t second; 348 boolean_t no_need_to_validate_entire; 349 boolean_t check_additional; 350 boolean_t validate_ucs2_range_only; 351 352 if (! u8str) 353 return (0); 354 355 ib = (uchar_t *)u8str; 356 ibtail = ib + n; 357 358 ret_val = 0; 359 360 no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE); 361 check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL; 362 validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE; 363 364 while (ib < ibtail) { 365 /* 366 * The first byte of a UTF-8 character tells how many 367 * bytes will follow for the character. If the first byte 368 * is an illegal byte value or out of range value, we just 369 * return -1 with an appropriate error number. 370 */ 371 sz = u8_number_of_bytes[*ib]; 372 if (sz == U8_ILLEGAL_CHAR) { 373 *errnum = EILSEQ; 374 return (-1); 375 } 376 377 if (sz == U8_OUT_OF_RANGE_CHAR || 378 (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) { 379 *errnum = ERANGE; 380 return (-1); 381 } 382 383 /* 384 * If we don't have enough bytes to check on, that's also 385 * an error. As you can see, we give illegal byte sequence 386 * checking higher priority then EINVAL cases. 
387 */ 388 if ((ibtail - ib) < sz) { 389 *errnum = EINVAL; 390 return (-1); 391 } 392 393 if (sz == 1) { 394 ib++; 395 ret_val++; 396 } else { 397 /* 398 * Check on the multi-byte UTF-8 character. For more 399 * details on this, see comment added for the used 400 * data structures at the beginning of the file. 401 */ 402 f = *ib++; 403 ret_val++; 404 second = B_TRUE; 405 for (i = 1; i < sz; i++) { 406 if (second) { 407 if (*ib < u8_valid_min_2nd_byte[f] || 408 *ib > u8_valid_max_2nd_byte[f]) { 409 *errnum = EILSEQ; 410 return (-1); 411 } 412 second = B_FALSE; 413 } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) { 414 *errnum = EILSEQ; 415 return (-1); 416 } 417 ib++; 418 ret_val++; 419 } 420 } 421 422 if (check_additional) { 423 for (p = (uchar_t **)list, i = 0; p[i]; i++) { 424 s1 = ib - sz; 425 s2 = p[i]; 426 while (s1 < ib) { 427 if (*s1 != *s2 || *s2 == '\0') 428 break; 429 s1++; 430 s2++; 431 } 432 433 if (s1 >= ib && *s2 == '\0') { 434 *errnum = EBADF; 435 return (-1); 436 } 437 } 438 } 439 440 if (no_need_to_validate_entire) 441 break; 442 } 443 444 return (ret_val); 445} 446 447/* 448 * The do_case_conv() looks at the mapping tables and returns found 449 * bytes if any. If not found, the input bytes are returned. The function 450 * always terminate the return bytes with a null character assuming that 451 * there are plenty of room to do so. 452 * 453 * The case conversions are simple case conversions mapping a character to 454 * another character as specified in the Unicode data. The byte size of 455 * the mapped character could be different from that of the input character. 456 * 457 * The return value is the byte length of the returned character excluding 458 * the terminating null byte. 
459 */ 460static size_t 461do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper) 462{ 463 size_t i; 464 uint16_t b1 = 0; 465 uint16_t b2 = 0; 466 uint16_t b3 = 0; 467 uint16_t b3_tbl; 468 uint16_t b3_base; 469 uint16_t b4 = 0; 470 size_t start_id; 471 size_t end_id; 472 473 /* 474 * At this point, the only possible values for sz are 2, 3, and 4. 475 * The u8s should point to a vector that is well beyond the size of 476 * 5 bytes. 477 */ 478 if (sz == 2) { 479 b3 = u8s[0] = s[0]; 480 b4 = u8s[1] = s[1]; 481 } else if (sz == 3) { 482 b2 = u8s[0] = s[0]; 483 b3 = u8s[1] = s[1]; 484 b4 = u8s[2] = s[2]; 485 } else if (sz == 4) { 486 b1 = u8s[0] = s[0]; 487 b2 = u8s[1] = s[1]; 488 b3 = u8s[2] = s[2]; 489 b4 = u8s[3] = s[3]; 490 } else { 491 /* This is not possible but just in case as a fallback. */ 492 if (is_it_toupper) 493 *u8s = U8_ASCII_TOUPPER(*s); 494 else 495 *u8s = U8_ASCII_TOLOWER(*s); 496 u8s[1] = '\0'; 497 498 return (1); 499 } 500 u8s[sz] = '\0'; 501 502 /* 503 * Let's find out if we have a corresponding character. 504 */ 505 b1 = u8_common_b1_tbl[uv][b1]; 506 if (b1 == U8_TBL_ELEMENT_NOT_DEF) 507 return ((size_t)sz); 508 509 b2 = u8_case_common_b2_tbl[uv][b1][b2]; 510 if (b2 == U8_TBL_ELEMENT_NOT_DEF) 511 return ((size_t)sz); 512 513 if (is_it_toupper) { 514 b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id; 515 if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 516 return ((size_t)sz); 517 518 start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4]; 519 end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1]; 520 521 /* Either there is no match or an error at the table. 
*/ 522 if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) 523 return ((size_t)sz); 524 525 b3_base = u8_toupper_b3_tbl[uv][b2][b3].base; 526 527 for (i = 0; start_id < end_id; start_id++) 528 u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id]; 529 } else { 530 b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id; 531 if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 532 return ((size_t)sz); 533 534 start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4]; 535 end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1]; 536 537 if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) 538 return ((size_t)sz); 539 540 b3_base = u8_tolower_b3_tbl[uv][b2][b3].base; 541 542 for (i = 0; start_id < end_id; start_id++) 543 u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id]; 544 } 545 546 /* 547 * If i is still zero, that means there is no corresponding character. 548 */ 549 if (i == 0) 550 return ((size_t)sz); 551 552 u8s[i] = '\0'; 553 554 return (i); 555} 556 557/* 558 * The do_case_compare() function compares the two input strings, s1 and s2, 559 * one character at a time doing case conversions if applicable and return 560 * the comparison result as like strcmp(). 561 * 562 * Since, in empirical sense, most of text data are 7-bit ASCII characters, 563 * we treat the 7-bit ASCII characters as a special case trying to yield 564 * faster processing time. 565 */ 566static int 567do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, 568 size_t n2, boolean_t is_it_toupper, int *errnum) 569{ 570 int f; 571 int sz1; 572 int sz2; 573 size_t j; 574 size_t i1; 575 size_t i2; 576 uchar_t u8s1[U8_MB_CUR_MAX + 1]; 577 uchar_t u8s2[U8_MB_CUR_MAX + 1]; 578 579 i1 = i2 = 0; 580 while (i1 < n1 && i2 < n2) { 581 /* 582 * Find out what would be the byte length for this UTF-8 583 * character at string s1 and also find out if this is 584 * an illegal start byte or not and if so, issue a proper 585 * error number and yet treat this byte as a character. 
586 */ 587 sz1 = u8_number_of_bytes[*s1]; 588 if (sz1 < 0) { 589 *errnum = EILSEQ; 590 sz1 = 1; 591 } 592 593 /* 594 * For 7-bit ASCII characters mainly, we do a quick case 595 * conversion right at here. 596 * 597 * If we don't have enough bytes for this character, issue 598 * an EINVAL error and use what are available. 599 * 600 * If we have enough bytes, find out if there is 601 * a corresponding uppercase character and if so, copy over 602 * the bytes for a comparison later. If there is no 603 * corresponding uppercase character, then, use what we have 604 * for the comparison. 605 */ 606 if (sz1 == 1) { 607 if (is_it_toupper) 608 u8s1[0] = U8_ASCII_TOUPPER(*s1); 609 else 610 u8s1[0] = U8_ASCII_TOLOWER(*s1); 611 s1++; 612 u8s1[1] = '\0'; 613 } else if ((i1 + sz1) > n1) { 614 *errnum = EINVAL; 615 for (j = 0; (i1 + j) < n1; ) 616 u8s1[j++] = *s1++; 617 u8s1[j] = '\0'; 618 } else { 619 (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper); 620 s1 += sz1; 621 } 622 623 /* Do the same for the string s2. */ 624 sz2 = u8_number_of_bytes[*s2]; 625 if (sz2 < 0) { 626 *errnum = EILSEQ; 627 sz2 = 1; 628 } 629 630 if (sz2 == 1) { 631 if (is_it_toupper) 632 u8s2[0] = U8_ASCII_TOUPPER(*s2); 633 else 634 u8s2[0] = U8_ASCII_TOLOWER(*s2); 635 s2++; 636 u8s2[1] = '\0'; 637 } else if ((i2 + sz2) > n2) { 638 *errnum = EINVAL; 639 for (j = 0; (i2 + j) < n2; ) 640 u8s2[j++] = *s2++; 641 u8s2[j] = '\0'; 642 } else { 643 (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper); 644 s2 += sz2; 645 } 646 647 /* Now compare the two characters. */ 648 if (sz1 == 1 && sz2 == 1) { 649 if (*u8s1 > *u8s2) 650 return (1); 651 if (*u8s1 < *u8s2) 652 return (-1); 653 } else { 654 f = strcmp((const char *)u8s1, (const char *)u8s2); 655 if (f != 0) 656 return (f); 657 } 658 659 /* 660 * They were the same. Let's move on to the next 661 * characters then. 662 */ 663 i1 += sz1; 664 i2 += sz2; 665 } 666 667 /* 668 * We compared until the end of either or both strings. 
669 * 670 * If we reached to or went over the ends for the both, that means 671 * they are the same. 672 * 673 * If we reached only one of the two ends, that means the other string 674 * has something which then the fact can be used to determine 675 * the return value. 676 */ 677 if (i1 >= n1) { 678 if (i2 >= n2) 679 return (0); 680 return (-1); 681 } 682 return (1); 683} 684 685/* 686 * The combining_class() function checks on the given bytes and find out 687 * the corresponding Unicode combining class value. The return value 0 means 688 * it is a Starter. Any illegal UTF-8 character will also be treated as 689 * a Starter. 690 */ 691static uchar_t 692combining_class(size_t uv, uchar_t *s, size_t sz) 693{ 694 uint16_t b1 = 0; 695 uint16_t b2 = 0; 696 uint16_t b3 = 0; 697 uint16_t b4 = 0; 698 699 if (sz == 1 || sz > 4) 700 return (0); 701 702 if (sz == 2) { 703 b3 = s[0]; 704 b4 = s[1]; 705 } else if (sz == 3) { 706 b2 = s[0]; 707 b3 = s[1]; 708 b4 = s[2]; 709 } else if (sz == 4) { 710 b1 = s[0]; 711 b2 = s[1]; 712 b3 = s[2]; 713 b4 = s[3]; 714 } 715 716 b1 = u8_common_b1_tbl[uv][b1]; 717 if (b1 == U8_TBL_ELEMENT_NOT_DEF) 718 return (0); 719 720 b2 = u8_combining_class_b2_tbl[uv][b1][b2]; 721 if (b2 == U8_TBL_ELEMENT_NOT_DEF) 722 return (0); 723 724 b3 = u8_combining_class_b3_tbl[uv][b2][b3]; 725 if (b3 == U8_TBL_ELEMENT_NOT_DEF) 726 return (0); 727 728 return (u8_combining_class_b4_tbl[uv][b3][b4]); 729} 730 731/* 732 * The do_decomp() function finds out a matching decomposition if any 733 * and return. If there is no match, the input bytes are copied and returned. 734 * The function also checks if there is a Hangul, decomposes it if necessary 735 * and returns. 736 * 737 * To save time, a single byte 7-bit ASCII character should be handled by 738 * the caller. 739 * 740 * The function returns the number of bytes returned sans always terminating 741 * the null byte. 
It will also return a state that will tell if there was 742 * a Hangul character decomposed which then will be used by the caller. 743 */ 744static size_t 745do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, 746 boolean_t canonical_decomposition, u8_normalization_states_t *state) 747{ 748 uint16_t b1 = 0; 749 uint16_t b2 = 0; 750 uint16_t b3 = 0; 751 uint16_t b3_tbl; 752 uint16_t b3_base; 753 uint16_t b4 = 0; 754 size_t start_id; 755 size_t end_id; 756 size_t i; 757 uint32_t u1; 758 759 if (sz == 2) { 760 b3 = u8s[0] = s[0]; 761 b4 = u8s[1] = s[1]; 762 u8s[2] = '\0'; 763 } else if (sz == 3) { 764 /* Convert it to a Unicode scalar value. */ 765 U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]); 766 767 /* 768 * If this is a Hangul syllable, we decompose it into 769 * a leading consonant, a vowel, and an optional trailing 770 * consonant and then return. 771 */ 772 if (U8_HANGUL_SYLLABLE(u1)) { 773 u1 -= U8_HANGUL_SYL_FIRST; 774 775 b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT; 776 b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT) 777 / U8_HANGUL_T_COUNT; 778 b3 = u1 % U8_HANGUL_T_COUNT; 779 780 U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1); 781 U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2); 782 if (b3) { 783 b3 += U8_HANGUL_JAMO_T_FIRST; 784 U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3); 785 786 u8s[9] = '\0'; 787 *state = U8_STATE_HANGUL_LVT; 788 return (9); 789 } 790 791 u8s[6] = '\0'; 792 *state = U8_STATE_HANGUL_LV; 793 return (6); 794 } 795 796 b2 = u8s[0] = s[0]; 797 b3 = u8s[1] = s[1]; 798 b4 = u8s[2] = s[2]; 799 u8s[3] = '\0'; 800 801 /* 802 * If this is a Hangul Jamo, we know there is nothing 803 * further that we can decompose. 
804 */ 805 if (U8_HANGUL_JAMO_L(u1)) { 806 *state = U8_STATE_HANGUL_L; 807 return (3); 808 } 809 810 if (U8_HANGUL_JAMO_V(u1)) { 811 if (*state == U8_STATE_HANGUL_L) 812 *state = U8_STATE_HANGUL_LV; 813 else 814 *state = U8_STATE_HANGUL_V; 815 return (3); 816 } 817 818 if (U8_HANGUL_JAMO_T(u1)) { 819 if (*state == U8_STATE_HANGUL_LV) 820 *state = U8_STATE_HANGUL_LVT; 821 else 822 *state = U8_STATE_HANGUL_T; 823 return (3); 824 } 825 } else if (sz == 4) { 826 b1 = u8s[0] = s[0]; 827 b2 = u8s[1] = s[1]; 828 b3 = u8s[2] = s[2]; 829 b4 = u8s[3] = s[3]; 830 u8s[4] = '\0'; 831 } else { 832 /* 833 * This is a fallback and should not happen if the function 834 * was called properly. 835 */ 836 u8s[0] = s[0]; 837 u8s[1] = '\0'; 838 *state = U8_STATE_START; 839 return (1); 840 } 841 842 /* 843 * At this point, this rountine does not know what it would get. 844 * The caller should sort it out if the state isn't a Hangul one. 845 */ 846 *state = U8_STATE_START; 847 848 /* Try to find matching decomposition mapping byte sequence. */ 849 b1 = u8_common_b1_tbl[uv][b1]; 850 if (b1 == U8_TBL_ELEMENT_NOT_DEF) 851 return ((size_t)sz); 852 853 b2 = u8_decomp_b2_tbl[uv][b1][b2]; 854 if (b2 == U8_TBL_ELEMENT_NOT_DEF) 855 return ((size_t)sz); 856 857 b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id; 858 if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 859 return ((size_t)sz); 860 861 /* 862 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR 863 * which is 0x8000, this means we couldn't fit the mappings into 864 * the cardinality of a unsigned byte. 865 */ 866 if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { 867 b3_tbl -= U8_16BIT_TABLE_INDICATOR; 868 start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; 869 end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; 870 } else { 871 start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; 872 end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; 873 } 874 875 /* This also means there wasn't any matching decomposition. 
*/ 876 if (start_id >= end_id) 877 return ((size_t)sz); 878 879 /* 880 * The final table for decomposition mappings has three types of 881 * byte sequences depending on whether a mapping is for compatibility 882 * decomposition, canonical decomposition, or both like the following: 883 * 884 * (1) Compatibility decomposition mappings: 885 * 886 * +---+---+-...-+---+ 887 * | B0| B1| ... | Bm| 888 * +---+---+-...-+---+ 889 * 890 * The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH). 891 * 892 * (2) Canonical decomposition mappings: 893 * 894 * +---+---+---+-...-+---+ 895 * | T | b0| b1| ... | bn| 896 * +---+---+---+-...-+---+ 897 * 898 * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL). 899 * 900 * (3) Both mappings: 901 * 902 * +---+---+---+---+-...-+---+---+---+-...-+---+ 903 * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm| 904 * +---+---+---+---+-...-+---+---+---+-...-+---+ 905 * 906 * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement 907 * byte, b0 to bn are canonical mapping bytes and B0 to Bm are 908 * compatibility mapping bytes. 909 * 910 * Note that compatibility decomposition means doing recursive 911 * decompositions using both compatibility decomposition mappings and 912 * canonical decomposition mappings. On the other hand, canonical 913 * decomposition means doing recursive decompositions using only 914 * canonical decomposition mappings. Since the table we have has gone 915 * through the recursions already, we do not need to do so during 916 * runtime, i.e., the table has been completely flattened out 917 * already. 918 */ 919 920 b3_base = u8_decomp_b3_tbl[uv][b2][b3].base; 921 922 /* Get the type, T, of the byte sequence. */ 923 b1 = u8_decomp_final_tbl[uv][b3_base + start_id]; 924 925 /* 926 * If necessary, adjust start_id, end_id, or both. Note that if 927 * this is compatibility decomposition mapping, there is no 928 * adjustment. 
929 */ 930 if (canonical_decomposition) { 931 /* Is the mapping only for compatibility decomposition? */ 932 if (b1 < U8_DECOMP_BOTH) 933 return ((size_t)sz); 934 935 start_id++; 936 937 if (b1 == U8_DECOMP_BOTH) { 938 end_id = start_id + 939 u8_decomp_final_tbl[uv][b3_base + start_id]; 940 start_id++; 941 } 942 } else { 943 /* 944 * Unless this is a compatibility decomposition mapping, 945 * we adjust the start_id. 946 */ 947 if (b1 == U8_DECOMP_BOTH) { 948 start_id++; 949 start_id += u8_decomp_final_tbl[uv][b3_base + start_id]; 950 } else if (b1 == U8_DECOMP_CANONICAL) { 951 start_id++; 952 } 953 } 954 955 for (i = 0; start_id < end_id; start_id++) 956 u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id]; 957 u8s[i] = '\0'; 958 959 return (i); 960} 961 962/* 963 * The find_composition_start() function uses the character bytes given and 964 * find out the matching composition mappings if any and return the address 965 * to the composition mappings as explained in the do_composition(). 966 */ 967static uchar_t * 968find_composition_start(size_t uv, uchar_t *s, size_t sz) 969{ 970 uint16_t b1 = 0; 971 uint16_t b2 = 0; 972 uint16_t b3 = 0; 973 uint16_t b3_tbl; 974 uint16_t b3_base; 975 uint16_t b4 = 0; 976 size_t start_id; 977 size_t end_id; 978 979 if (sz == 1) { 980 b4 = s[0]; 981 } else if (sz == 2) { 982 b3 = s[0]; 983 b4 = s[1]; 984 } else if (sz == 3) { 985 b2 = s[0]; 986 b3 = s[1]; 987 b4 = s[2]; 988 } else if (sz == 4) { 989 b1 = s[0]; 990 b2 = s[1]; 991 b3 = s[2]; 992 b4 = s[3]; 993 } else { 994 /* 995 * This is a fallback and should not happen if the function 996 * was called properly. 
997 */ 998 return (NULL); 999 } 1000 1001 b1 = u8_composition_b1_tbl[uv][b1]; 1002 if (b1 == U8_TBL_ELEMENT_NOT_DEF) 1003 return (NULL); 1004 1005 b2 = u8_composition_b2_tbl[uv][b1][b2]; 1006 if (b2 == U8_TBL_ELEMENT_NOT_DEF) 1007 return (NULL); 1008 1009 b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id; 1010 if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 1011 return (NULL); 1012 1013 if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { 1014 b3_tbl -= U8_16BIT_TABLE_INDICATOR; 1015 start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; 1016 end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; 1017 } else { 1018 start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; 1019 end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; 1020 } 1021 1022 if (start_id >= end_id) 1023 return (NULL); 1024 1025 b3_base = u8_composition_b3_tbl[uv][b2][b3].base; 1026 1027 return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id])); 1028} 1029 1030/* 1031 * The blocked() function checks on the combining class values of previous 1032 * characters in this sequence and return whether it is blocked or not. 1033 */ 1034static boolean_t 1035blocked(uchar_t *comb_class, size_t last) 1036{ 1037 uchar_t my_comb_class; 1038 size_t i; 1039 1040 my_comb_class = comb_class[last]; 1041 for (i = 1; i < last; i++) 1042 if (comb_class[i] >= my_comb_class || 1043 comb_class[i] == U8_COMBINING_CLASS_STARTER) 1044 return (B_TRUE); 1045 1046 return (B_FALSE); 1047} 1048 1049/* 1050 * The do_composition() reads the character string pointed by 's' and 1051 * do necessary canonical composition and then copy over the result back to 1052 * the 's'. 1053 * 1054 * The input argument 's' cannot contain more than 32 characters. 
1055 */ 1056static size_t 1057do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start, 1058 uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) 1059{ 1060 uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1]; 1061 uchar_t tc[U8_MB_CUR_MAX]; 1062 uint8_t saved_marks[U8_MAX_CHARS_A_SEQ]; 1063 size_t saved_marks_count; 1064 uchar_t *p; 1065 uchar_t *saved_p; 1066 uchar_t *q; 1067 size_t i; 1068 size_t saved_i; 1069 size_t j; 1070 size_t k; 1071 size_t l; 1072 size_t C; 1073 size_t saved_l; 1074 size_t size; 1075 uint32_t u1; 1076 uint32_t u2; 1077 boolean_t match_not_found = B_TRUE; 1078 1079 /* 1080 * This should never happen unless the callers are doing some strange 1081 * and unexpected things. 1082 * 1083 * The "last" is the index pointing to the last character not last + 1. 1084 */ 1085 if (last >= U8_MAX_CHARS_A_SEQ) 1086 last = U8_UPPER_LIMIT_IN_A_SEQ; 1087 1088 for (i = l = 0; i <= last; i++) { 1089 /* 1090 * The last or any non-Starters at the beginning, we don't 1091 * have any chance to do composition and so we just copy them 1092 * to the temporary buffer. 1093 */ 1094 if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) { 1095SAVE_THE_CHAR: 1096 p = s + start[i]; 1097 size = disp[i]; 1098 for (k = 0; k < size; k++) 1099 t[l++] = *p++; 1100 continue; 1101 } 1102 1103 /* 1104 * If this could be a start of Hangul Jamos, then, we try to 1105 * conjoin them. 
1106 */ 1107 if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) { 1108 U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]], 1109 s[start[i] + 1], s[start[i] + 2]); 1110 U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3], 1111 s[start[i] + 4], s[start[i] + 5]); 1112 1113 if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) { 1114 u1 -= U8_HANGUL_JAMO_L_FIRST; 1115 u2 -= U8_HANGUL_JAMO_V_FIRST; 1116 u1 = U8_HANGUL_SYL_FIRST + 1117 (u1 * U8_HANGUL_V_COUNT + u2) * 1118 U8_HANGUL_T_COUNT; 1119 1120 i += 2; 1121 if (i <= last) { 1122 U8_PUT_3BYTES_INTO_UTF32(u2, 1123 s[start[i]], s[start[i] + 1], 1124 s[start[i] + 2]); 1125 1126 if (U8_HANGUL_JAMO_T(u2)) { 1127 u1 += u2 - 1128 U8_HANGUL_JAMO_T_FIRST; 1129 i++; 1130 } 1131 } 1132 1133 U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1); 1134 i--; 1135 l += 3; 1136 continue; 1137 } 1138 } 1139 1140 /* 1141 * Let's then find out if this Starter has composition 1142 * mapping. 1143 */ 1144 p = find_composition_start(uv, s + start[i], disp[i]); 1145 if (p == NULL) 1146 goto SAVE_THE_CHAR; 1147 1148 /* 1149 * We have a Starter with composition mapping and the next 1150 * character is a non-Starter. Let's try to find out if 1151 * we can do composition. 1152 */ 1153 1154 saved_p = p; 1155 saved_i = i; 1156 saved_l = l; 1157 saved_marks_count = 0; 1158 1159TRY_THE_NEXT_MARK: 1160 q = s + start[++i]; 1161 size = disp[i]; 1162 1163 /* 1164 * The next for() loop compares the non-Starter pointed by 1165 * 'q' with the possible (joinable) characters pointed by 'p'. 1166 * 1167 * The composition final table entry pointed by the 'p' 1168 * looks like the following: 1169 * 1170 * +---+---+---+-...-+---+---+---+---+-...-+---+---+ 1171 * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F | 1172 * +---+---+---+-...-+---+---+---+---+-...-+---+---+ 1173 * 1174 * where C is the count byte indicating the number of 1175 * mapping pairs where each pair would be look like 1176 * (b0-bn F, B0-Bm F). 
The b0-bn are the bytes of the second 1177 * character of a canonical decomposition and the B0-Bm are 1178 * the bytes of a matching composite character. The F is 1179 * a filler byte after each character as the separator. 1180 */ 1181 1182 match_not_found = B_TRUE; 1183 1184 for (C = *p++; C > 0; C--) { 1185 for (k = 0; k < size; p++, k++) 1186 if (*p != q[k]) 1187 break; 1188 1189 /* Have we found it? */ 1190 if (k >= size && *p == U8_TBL_ELEMENT_FILLER) { 1191 match_not_found = B_FALSE; 1192 1193 l = saved_l; 1194 1195 while (*++p != U8_TBL_ELEMENT_FILLER) 1196 t[l++] = *p; 1197 1198 break; 1199 } 1200 1201 /* We didn't find; skip to the next pair. */ 1202 if (*p != U8_TBL_ELEMENT_FILLER) 1203 while (*++p != U8_TBL_ELEMENT_FILLER) 1204 ; 1205 while (*++p != U8_TBL_ELEMENT_FILLER) 1206 ; 1207 p++; 1208 } 1209 1210 /* 1211 * If there was no match, we will need to save the combining 1212 * mark for later appending. After that, if the next one 1213 * is a non-Starter and not blocked, then, we try once 1214 * again to do composition with the next non-Starter. 1215 * 1216 * If there was no match and this was a Starter, then, 1217 * this is a new start. 1218 * 1219 * If there was a match and a composition done and we have 1220 * more to check on, then, we retrieve a new composition final 1221 * table entry for the composite and then try to do the 1222 * composition again. 
1223 */ 1224 1225 if (match_not_found) { 1226 if (comb_class[i] == U8_COMBINING_CLASS_STARTER) { 1227 i--; 1228 goto SAVE_THE_CHAR; 1229 } 1230 1231 saved_marks[saved_marks_count++] = i; 1232 } 1233 1234 if (saved_l == l) { 1235 while (i < last) { 1236 if (blocked(comb_class, i + 1)) 1237 saved_marks[saved_marks_count++] = ++i; 1238 else 1239 break; 1240 } 1241 if (i < last) { 1242 p = saved_p; 1243 goto TRY_THE_NEXT_MARK; 1244 } 1245 } else if (i < last) { 1246 p = find_composition_start(uv, t + saved_l, 1247 l - saved_l); 1248 if (p != NULL) { 1249 saved_p = p; 1250 goto TRY_THE_NEXT_MARK; 1251 } 1252 } 1253 1254 /* 1255 * There is no more composition possible. 1256 * 1257 * If there was no composition what so ever then we copy 1258 * over the original Starter and then append any non-Starters 1259 * remaining at the target string sequentially after that. 1260 */ 1261 1262 if (saved_l == l) { 1263 p = s + start[saved_i]; 1264 size = disp[saved_i]; 1265 for (j = 0; j < size; j++) 1266 t[l++] = *p++; 1267 } 1268 1269 for (k = 0; k < saved_marks_count; k++) { 1270 p = s + start[saved_marks[k]]; 1271 size = disp[saved_marks[k]]; 1272 for (j = 0; j < size; j++) 1273 t[l++] = *p++; 1274 } 1275 } 1276 1277 /* 1278 * If the last character is a Starter and if we have a character 1279 * (possibly another Starter) that can be turned into a composite, 1280 * we do so and we do so until there is no more of composition 1281 * possible. 
1282 */ 1283 if (comb_class[last] == U8_COMBINING_CLASS_STARTER) { 1284 p = *os; 1285 saved_l = l - disp[last]; 1286 1287 while (p < oslast) { 1288 size = u8_number_of_bytes[*p]; 1289 if (size <= 1 || (p + size) > oslast) 1290 break; 1291 1292 saved_p = p; 1293 1294 for (i = 0; i < size; i++) 1295 tc[i] = *p++; 1296 1297 q = find_composition_start(uv, t + saved_l, 1298 l - saved_l); 1299 if (q == NULL) { 1300 p = saved_p; 1301 break; 1302 } 1303 1304 match_not_found = B_TRUE; 1305 1306 for (C = *q++; C > 0; C--) { 1307 for (k = 0; k < size; q++, k++) 1308 if (*q != tc[k]) 1309 break; 1310 1311 if (k >= size && *q == U8_TBL_ELEMENT_FILLER) { 1312 match_not_found = B_FALSE; 1313 1314 l = saved_l; 1315 1316 while (*++q != U8_TBL_ELEMENT_FILLER) { 1317 /* 1318 * This is practically 1319 * impossible but we don't 1320 * want to take any chances. 1321 */ 1322 if (l >= 1323 U8_STREAM_SAFE_TEXT_MAX) { 1324 p = saved_p; 1325 goto SAFE_RETURN; 1326 } 1327 t[l++] = *q; 1328 } 1329 1330 break; 1331 } 1332 1333 if (*q != U8_TBL_ELEMENT_FILLER) 1334 while (*++q != U8_TBL_ELEMENT_FILLER) 1335 ; 1336 while (*++q != U8_TBL_ELEMENT_FILLER) 1337 ; 1338 q++; 1339 } 1340 1341 if (match_not_found) { 1342 p = saved_p; 1343 break; 1344 } 1345 } 1346SAFE_RETURN: 1347 *os = p; 1348 } 1349 1350 /* 1351 * Now we copy over the temporary string to the target string. 1352 * Since composition always reduces the number of characters or 1353 * the number of characters stay, we don't need to worry about 1354 * the buffer overflow here. 1355 */ 1356 for (i = 0; i < l; i++) 1357 s[i] = t[i]; 1358 s[l] = '\0'; 1359 1360 return (l); 1361} 1362 1363/* 1364 * The collect_a_seq() function checks on the given string s, collect 1365 * a sequence of characters at u8s, and return the sequence. While it collects 1366 * a sequence, it also applies case conversion, canonical or compatibility 1367 * decomposition, canonical decomposition, or some or all of them and 1368 * in that order. 
 *
 * The collected sequence cannot be bigger than 32 characters since if
 * it is having more than 31 characters, the sequence will be terminated
 * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
 * a Stream-Safe Text. The collected sequence is always terminated with
 * a null byte and the return value is the byte length of the sequence,
 * which can be 0. The return value does not include the terminating
 * null byte.
 *
 * Arguments, as they are used by the code below:
 *
 *	uv		table selector passed through to do_case_conv(),
 *			do_decomp(), combining_class() and do_composition().
 *	u8s		output buffer that receives the collected sequence.
 *	source		in/out; '*source' is where collecting starts and it is
 *			updated to point just past the consumed input.
 *	slast		end of the input; nothing at or past it is read.
 *	errnum		set to EILSEQ when the first byte is an illegal byte
 *			and to EINVAL when the first character is cut short by
 *			'slast'. It is not touched otherwise.
 *	state		in/out normalization state used while collecting.
 *
 * The remaining boolean arguments select which conversions are applied.
 */
static size_t
collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
    boolean_t is_it_toupper,
    boolean_t is_it_tolower,
    boolean_t canonical_decomposition,
    boolean_t compatibility_decomposition,
    boolean_t canonical_composition,
    int *errnum, u8_normalization_states_t *state)
{
	uchar_t *s;
	int sz;
	int saved_sz;
	size_t i;
	size_t j;
	size_t k;
	size_t l;
	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
	uchar_t disp[U8_MAX_CHARS_A_SEQ];
	uchar_t start[U8_MAX_CHARS_A_SEQ];
	/*
	 * NOTE(review): u8t and tc are not referenced directly in this
	 * function; presumably they are scratch storage that the
	 * U8_SWAP_COMB_MARKS() macro relies on -- confirm against the macro.
	 */
	uchar_t u8t[U8_MB_CUR_MAX];
	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
	uchar_t tc;
	size_t last;
	size_t saved_last;
	uint32_t u1;

	/*
	 * Save the source string pointer which we will return a changed
	 * pointer if we do processing.
	 */
	s = *source;

	/*
	 * The following is a fallback for just in case callers are not
	 * checking the string boundaries before the calling.
	 */
	if (s >= slast) {
		u8s[0] = '\0';

		return (0);
	}

	/*
	 * As the first thing, let's collect a character and do case
	 * conversion if necessary.
	 */

	sz = u8_number_of_bytes[*s];

	/* An illegal byte is returned by itself as a one byte sequence. */
	if (sz < 0) {
		*errnum = EILSEQ;

		u8s[0] = *s++;
		u8s[1] = '\0';

		*source = s;

		return (1);
	}

	if (sz == 1) {
		if (is_it_toupper)
			u8s[0] = U8_ASCII_TOUPPER(*s);
		else if (is_it_tolower)
			u8s[0] = U8_ASCII_TOLOWER(*s);
		else
			u8s[0] = *s;
		s++;
		u8s[1] = '\0';
	} else if ((s + sz) > slast) {
		/* Incomplete character: hand back whatever bytes remain. */
		*errnum = EINVAL;

		for (i = 0; s < slast; )
			u8s[i++] = *s++;
		u8s[i] = '\0';

		*source = s;

		return (i);
	} else {
		if (is_it_toupper || is_it_tolower) {
			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
			s += sz;
			sz = i;
		} else {
			for (i = 0; i < sz; )
				u8s[i++] = *s++;
			u8s[i] = '\0';
		}
	}

	/*
	 * And then canonical/compatibility decomposition followed by
	 * an optional canonical composition. Please note that
	 * canonical composition is done only when a decomposition is
	 * done.
	 */
	if (canonical_decomposition || compatibility_decomposition) {
		if (sz == 1) {
			*state = U8_STATE_START;

			saved_sz = 1;

			comb_class[0] = 0;
			start[0] = 0;
			disp[0] = 1;

			last = 1;
		} else {
			saved_sz = do_decomp(uv, u8s, u8s, sz,
			    canonical_decomposition, state);

			last = 0;

			/*
			 * Record combining class, byte offset, and byte
			 * length of each character of the decomposition.
			 */
			for (i = 0; i < saved_sz; ) {
				sz = u8_number_of_bytes[u8s[i]];

				comb_class[last] = combining_class(uv,
				    u8s + i, sz);
				start[last] = i;
				disp[last] = sz;

				last++;
				i += sz;
			}

			/*
			 * Decomposition yields various Hangul related
			 * states but not on combining marks. We need to
			 * find out at here by checking on the last
			 * character.
			 */
			if (*state == U8_STATE_START) {
				if (comb_class[last - 1])
					*state = U8_STATE_COMBINING_MARK;
			}
		}

		/* Number of characters that came from the first character. */
		saved_last = last;

		while (s < slast) {
			sz = u8_number_of_bytes[*s];

			/*
			 * If this is an illegal character, an incomplete
			 * character, or an 7-bit ASCII Starter character,
			 * then we have collected a sequence; break and let
			 * the next call deal with the two cases.
			 *
			 * Note that this is okay only if you are using this
			 * function with a fixed length string, not on
			 * a buffer with multiple calls of one chunk at a time.
			 */
			if (sz <= 1) {
				break;
			} else if ((s + sz) > slast) {
				break;
			} else {
				/*
				 * If the previous character was a Hangul Jamo
				 * and this character is a Hangul Jamo that
				 * can be conjoined, we collect the Jamo.
				 */
				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
					U8_PUT_3BYTES_INTO_UTF32(u1,
					    *s, *(s + 1), *(s + 2));

					if (U8_HANGUL_COMPOSABLE_L_V(*state,
					    u1)) {
						i = 0;
						*state = U8_STATE_HANGUL_LV;
						goto COLLECT_A_HANGUL;
					}

					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
					    u1)) {
						i = 0;
						*state = U8_STATE_HANGUL_LVT;
						goto COLLECT_A_HANGUL;
					}
				}

				/*
				 * Regardless of whatever it was, if this is
				 * a Starter, we don't collect the character
				 * since that's a new start and we will deal
				 * with it at the next time.
				 */
				i = combining_class(uv, s, sz);
				if (i == U8_COMBINING_CLASS_STARTER)
					break;

				/*
				 * We know the current character is a combining
				 * mark. If the previous character wasn't
				 * a Starter (not Hangul) or a combining mark,
				 * then, we don't collect this combining mark.
				 */
				if (*state != U8_STATE_START &&
				    *state != U8_STATE_COMBINING_MARK)
					break;

				*state = U8_STATE_COMBINING_MARK;
COLLECT_A_HANGUL:
				/*
				 * If we collected a Starter and combining
				 * marks up to 30, i.e., total 31 characters,
				 * then, we terminate this degenerately long
				 * combining sequence with a U+034F COMBINING
				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
				 * UTF-8 and turn this into a Stream-Safe
				 * Text. This will be extremely rare but
				 * possible.
				 *
				 * The following will also guarantee that
				 * we are not writing more than 32 characters
				 * plus a NULL at u8s[].
				 */
				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
TURN_STREAM_SAFE:
					*state = U8_STATE_START;
					comb_class[last] = 0;
					start[last] = saved_sz;
					disp[last] = 2;
					last++;

					u8s[saved_sz++] = 0xCD;
					u8s[saved_sz++] = 0x8F;

					break;
				}

				/*
				 * Some combining marks also do decompose into
				 * another combining mark or marks.
				 */
				if (*state == U8_STATE_COMBINING_MARK) {
					/*
					 * Remember the current character
					 * count (k) and the input byte
					 * length of this mark (l).
					 */
					k = last;
					l = sz;
					i = do_decomp(uv, uts, s, sz,
					    canonical_decomposition, state);
					for (j = 0; j < i; ) {
						sz = u8_number_of_bytes[uts[j]];

						comb_class[last] =
						    combining_class(uv,
						    uts + j, sz);
						start[last] = saved_sz + j;
						disp[last] = sz;

						last++;
						/*
						 * Too long: drop this mark
						 * (restore the count) and
						 * terminate with a CGJ.
						 */
						if (last >=
						    U8_UPPER_LIMIT_IN_A_SEQ) {
							last = k;
							goto TURN_STREAM_SAFE;
						}
						j += sz;
					}

					*state = U8_STATE_COMBINING_MARK;
					sz = i;
					s += l;

					for (i = 0; i < sz; i++)
						u8s[saved_sz++] = uts[i];
				} else {
					comb_class[last] = i;
					start[last] = saved_sz;
					disp[last] = sz;
					last++;

					for (i = 0; i < sz; i++)
						u8s[saved_sz++] = *s++;
				}

				/*
				 * If this is U+0345 COMBINING GREEK
				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
				 * iota subscript, and need to be converted to
				 * uppercase letter, convert it to U+0399 GREEK
				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
				 * i.e., convert to capital adscript form as
				 * specified in the Unicode standard.
				 *
				 * This is the only special case of (ambiguous)
				 * case conversion at combining marks and
				 * probably the standard will never have
				 * anything similar like this in future.
				 */
				if (is_it_toupper && sz >= 2 &&
				    u8s[saved_sz - 2] == 0xCD &&
				    u8s[saved_sz - 1] == 0x85) {
					u8s[saved_sz - 2] = 0xCE;
					u8s[saved_sz - 1] = 0x99;
				}
			}
		}

		/*
		 * Let's try to ensure a canonical ordering for the collected
		 * combining marks. We do this only if we have collected
		 * at least one more non-Starter. (The decomposition mapping
		 * data tables have fully (and recursively) expanded and
		 * canonically ordered decompositions.)
		 *
		 * The U8_SWAP_COMB_MARKS() convenience macro has some
		 * assumptions and we are meeting the assumptions.
		 */
		/* From here on 'last' is the index of the last character. */
		last--;
		if (last >= saved_last) {
			for (i = 0; i < last; i++)
				for (j = last; j > i; j--)
					if (comb_class[j] &&
					    comb_class[j - 1] > comb_class[j]) {
						U8_SWAP_COMB_MARKS(j - 1, j);
					}
		}

		*source = s;

		if (! canonical_composition) {
			u8s[saved_sz] = '\0';
			return (saved_sz);
		}

		/*
		 * Now do the canonical composition. Note that we do this
		 * only after a canonical or compatibility decomposition to
		 * finish up NFC or NFKC.
		 */
		sz = do_composition(uv, u8s, comb_class, start, disp, last,
		    &s, slast);
	}

	*source = s;

	return ((size_t)sz);
}

/*
 * The do_norm_compare() function does string comparison based on Unicode
 * simple case mappings and Unicode Normalization definitions.
 *
 * It does so by collecting a sequence of character at a time and comparing
 * the collected sequences from the strings.
 *
 * The meanings on the return values are the same as the usual strcmp().
 */
static int
do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
    int flag, int *errnum)
{
	int result;
	size_t sz1;
	size_t sz2;
	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
	uchar_t *s1last;
	uchar_t *s2last;
	boolean_t is_it_toupper;
	boolean_t is_it_tolower;
	boolean_t canonical_decomposition;
	boolean_t compatibility_decomposition;
	boolean_t canonical_composition;
	u8_normalization_states_t state;

	s1last = s1 + n1;
	s2last = s2 + n2;

	/* These are only ever tested for zero versus non-zero. */
	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
	canonical_decomposition = flag & U8_CANON_DECOMP;
	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
	canonical_composition = flag & U8_CANON_COMP;

	while (s1 < s1last && s2 < s2last) {
		/*
		 * If the current character is a 7-bit ASCII and the last
		 * character, or, if the current character and the next
		 * character are both some 7-bit ASCII characters then
		 * we treat the current character as a sequence.
		 *
		 * In any other cases, we need to call collect_a_seq().
		 */

		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
		    ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
			if (is_it_toupper)
				u8s1[0] = U8_ASCII_TOUPPER(*s1);
			else if (is_it_tolower)
				u8s1[0] = U8_ASCII_TOLOWER(*s1);
			else
				u8s1[0] = *s1;
			u8s1[1] = '\0';
			sz1 = 1;
			s1++;
		} else {
			state = U8_STATE_START;
			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
			    is_it_toupper, is_it_tolower,
			    canonical_decomposition,
			    compatibility_decomposition,
			    canonical_composition, errnum, &state);
		}

		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
		    ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
			if (is_it_toupper)
				u8s2[0] = U8_ASCII_TOUPPER(*s2);
			else if (is_it_tolower)
				u8s2[0] = U8_ASCII_TOLOWER(*s2);
			else
				u8s2[0] = *s2;
			u8s2[1] = '\0';
			sz2 = 1;
			s2++;
		} else {
			state = U8_STATE_START;
			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
			    is_it_toupper, is_it_tolower,
			    canonical_decomposition,
			    compatibility_decomposition,
			    canonical_composition, errnum, &state);
		}

		/*
		 * Now compare the two characters. If they are the same,
		 * we move on to the next character sequences.
		 */
		if (sz1 == 1 && sz2 == 1) {
			if (*u8s1 > *u8s2)
				return (1);
			if (*u8s1 < *u8s2)
				return (-1);
		} else {
			result = strcmp((const char *)u8s1, (const char *)u8s2);
			if (result != 0)
				return (result);
		}
	}

	/*
	 * We compared until the end of either or both strings.
	 *
	 * If we reached to or went over the ends for the both, that means
	 * they are the same.
	 *
	 * If we reached only one end, that means the other string has
	 * something which then can be used to determine the return value.
	 */
	if (s1 >= s1last) {
		if (s2 >= s2last)
			return (0);
		return (-1);
	}
	return (1);
}

/*
 * The u8_strcmp() function compares two UTF-8 strings quite similar to
 * the strcmp(). For the comparison, however, Unicode Normalization specific
 * equivalency and Unicode simple case conversion mappings based equivalency
 * can be requested and checked against.
 *
 * If 'n' is non-zero, at most 'n' bytes of each string are compared.
 *
 * '*errnum' is reset to 0 on entry. It is set to ERANGE when 'uv' is larger
 * than U8_UNICODE_LATEST (the comparison then proceeds with
 * U8_UNICODE_LATEST) and to EBADF when 'flag' carries a conflicting
 * combination of case or normalization bits (the comparison then falls back
 * to the plain case-sensitive one). The sequence collecting code may also
 * set it when it runs into illegal or incomplete characters.
 */
int
u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
    int *errnum)
{
	int f;
	size_t n1;
	size_t n2;

	*errnum = 0;

	/*
	 * Check on the requested Unicode version, case conversion, and
	 * normalization flag values.
	 */

	if (uv > U8_UNICODE_LATEST) {
		*errnum = ERANGE;
		uv = U8_UNICODE_LATEST;
	}

	if (flag == 0) {
		flag = U8_STRCMP_CS;
	} else {
		/* At most one of the three case options may be requested. */
		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
		    U8_STRCMP_CI_LOWER);
		if (f == 0) {
			flag |= U8_STRCMP_CS;
		} else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
		    f != U8_STRCMP_CI_LOWER) {
			*errnum = EBADF;
			flag = U8_STRCMP_CS;
		}

		/* The normalization bits must form one of the known forms. */
		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
			*errnum = EBADF;
			flag = U8_STRCMP_CS;
		}
	}

	/* Plain case-sensitive comparison needs no text preparation. */
	if (flag == U8_STRCMP_CS) {
		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
	}

	n1 = strlen(s1);
	n2 = strlen(s2);
	if (n != 0) {
		if (n < n1)
			n1 = n;
		if (n < n2)
			n2 = n;
	}

	/*
	 * Simple case conversion can be done much faster and so we do
	 * them separately here.
	 */
	if (flag == U8_STRCMP_CI_UPPER) {
		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
		    n1, n2, B_TRUE, errnum));
	} else if (flag == U8_STRCMP_CI_LOWER) {
		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
		    n1, n2, B_FALSE, errnum));
	}

	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
	    flag, errnum));
}

/*
 * The u8_textprep_str() function reads up to '*inlen' bytes from 'inarray',
 * applies the case conversion and/or normalization requested in 'flag', and
 * writes the result into 'outarray', which has room for '*outlen' bytes.
 *
 * Once past the argument checks, '*inlen' and '*outlen' are updated to
 * the number of input bytes not consumed and the number of output bytes
 * still unused, whether the conversion loop ran to completion or stopped
 * on an error.
 *
 * The function returns (size_t)-1 and sets '*errnum' on failure:
 *
 *	ERANGE	'unicode_version' is larger than U8_UNICODE_LATEST.
 *	EBADF	both case conversions were requested, or the normalization
 *		bits do not form one of NFD, NFC, NFKD, or NFKC.
 *	E2BIG	'outarray' is NULL or too small for the next piece of output.
 *	EILSEQ	an illegal byte was found and U8_TEXTPREP_IGNORE_INVALID
 *		was not given.
 *	EINVAL	an incomplete character was found at the end of the input
 *		and U8_TEXTPREP_IGNORE_INVALID was not given.
 *
 * It returns 0 right away when 'inarray' is NULL or '*inlen' is 0.
 * Otherwise, in the case-conversion-only path, the return value counts
 * the illegal bytes and incomplete trailing characters that were copied
 * through because U8_TEXTPREP_IGNORE_INVALID was given; the normalization
 * path never increments it.
 *
 * Unless U8_TEXTPREP_IGNORE_NULL is given, processing stops at the first
 * null byte of the input.
 */
size_t
u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
    int flag, size_t unicode_version, int *errnum)
{
	int f;
	int sz;
	uchar_t *ib;
	uchar_t *ibtail;
	uchar_t *ob;
	uchar_t *obtail;
	boolean_t do_not_ignore_null;
	boolean_t do_not_ignore_invalid;
	boolean_t is_it_toupper;
	boolean_t is_it_tolower;
	boolean_t canonical_decomposition;
	boolean_t compatibility_decomposition;
	boolean_t canonical_composition;
	size_t ret_val;
	size_t i;
	size_t j;
	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
	u8_normalization_states_t state;

	if (unicode_version > U8_UNICODE_LATEST) {
		*errnum = ERANGE;
		return ((size_t)-1);
	}

	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
		*errnum = EBADF;
		return ((size_t)-1);
	}

	/* After this, 'f' is non-zero only if normalization is requested. */
	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
		*errnum = EBADF;
		return ((size_t)-1);
	}

	if (inarray == NULL || *inlen == 0)
		return (0);

	if (outarray == NULL) {
		*errnum = E2BIG;
		return ((size_t)-1);
	}

	ib = (uchar_t *)inarray;
	ob = (uchar_t *)outarray;
	ibtail = ib + *inlen;
	obtail = ob + *outlen;

	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;

	ret_val = 0;

	/*
	 * If we don't have a normalization flag set, we do the simple case
	 * conversion based text preparation separately below. Text
	 * preparation involving Normalization will be done in the else
	 * block, again, separately since it will take much more time and
	 * resource than doing simple case conversions.
	 */
	if (f == 0) {
		while (ib < ibtail) {
			if (*ib == '\0' && do_not_ignore_null)
				break;

			sz = u8_number_of_bytes[*ib];

			if (sz < 0) {
				if (do_not_ignore_invalid) {
					*errnum = EILSEQ;
					ret_val = (size_t)-1;
					break;
				}

				/* Pass the illegal byte through as is. */
				sz = 1;
				ret_val++;
			}

			if (sz == 1) {
				if (ob >= obtail) {
					*errnum = E2BIG;
					ret_val = (size_t)-1;
					break;
				}

				if (is_it_toupper)
					*ob = U8_ASCII_TOUPPER(*ib);
				else if (is_it_tolower)
					*ob = U8_ASCII_TOLOWER(*ib);
				else
					*ob = *ib;
				ib++;
				ob++;
			} else if ((ib + sz) > ibtail) {
				if (do_not_ignore_invalid) {
					*errnum = EINVAL;
					ret_val = (size_t)-1;
					break;
				}

				if ((obtail - ob) < (ibtail - ib)) {
					*errnum = E2BIG;
					ret_val = (size_t)-1;
					break;
				}

				/*
				 * We treat the remaining incomplete character
				 * bytes as a character.
				 */
				ret_val++;

				while (ib < ibtail)
					*ob++ = *ib++;
			} else {
				if (is_it_toupper || is_it_tolower) {
					i = do_case_conv(unicode_version, u8s,
					    ib, sz, is_it_toupper);

					if ((obtail - ob) < i) {
						*errnum = E2BIG;
						ret_val = (size_t)-1;
						break;
					}

					ib += sz;

					/* 'sz' is reused as the copy index. */
					for (sz = 0; sz < i; sz++)
						*ob++ = u8s[sz];
				} else {
					if ((obtail - ob) < sz) {
						*errnum = E2BIG;
						ret_val = (size_t)-1;
						break;
					}

					for (i = 0; i < sz; i++)
						*ob++ = *ib++;
				}
			}
		}
	} else {
		canonical_decomposition = flag & U8_CANON_DECOMP;
		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
		canonical_composition = flag & U8_CANON_COMP;

		while (ib < ibtail) {
			if (*ib == '\0' && do_not_ignore_null)
				break;

			/*
			 * If the current character is a 7-bit ASCII
			 * character and it is the last character, or,
			 * if the current character is a 7-bit ASCII
			 * character and the next character is also a 7-bit
			 * ASCII character, then, we copy over this
			 * character without going through collect_a_seq().
			 *
			 * In any other cases, we need to look further with
			 * the collect_a_seq() function.
			 */
			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
				if (ob >= obtail) {
					*errnum = E2BIG;
					ret_val = (size_t)-1;
					break;
				}

				if (is_it_toupper)
					*ob = U8_ASCII_TOUPPER(*ib);
				else if (is_it_tolower)
					*ob = U8_ASCII_TOLOWER(*ib);
				else
					*ob = *ib;
				ib++;
				ob++;
			} else {
				/*
				 * Clear the error first so that a non-zero
				 * value afterwards is known to come from
				 * this collect_a_seq() call.
				 */
				*errnum = 0;
				state = U8_STATE_START;

				j = collect_a_seq(unicode_version, u8s,
				    &ib, ibtail,
				    is_it_toupper,
				    is_it_tolower,
				    canonical_decomposition,
				    compatibility_decomposition,
				    canonical_composition,
				    errnum, &state);

				if (*errnum && do_not_ignore_invalid) {
					ret_val = (size_t)-1;
					break;
				}

				if ((obtail - ob) < j) {
					*errnum = E2BIG;
					ret_val = (size_t)-1;
					break;
				}

				for (i = 0; i < j; i++)
					*ob++ = u8s[i];
			}
		}
	}

	/* Report what is left unconsumed and unused to the caller. */
	*inlen = ibtail - ib;
	*outlen = obtail - ob;

	return (ret_val);
}