1/* 2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29 /* 30 Includes Unicode 3.2 decomposition code derived from Core Foundation 31 */ 32 33#include <sys/param.h> 34#include <sys/utfconv.h> 35#include <sys/errno.h> 36#include <sys/malloc.h> 37#include <libkern/OSByteOrder.h> 38 39/* 40 * UTF-8 (Unicode Transformation Format) 41 * 42 * UTF-8 is the Unicode Transformation Format that serializes a Unicode 43 * character as a sequence of one to four bytes. Only the shortest form 44 * required to represent the significant Unicode bits is legal. 45 * 46 * UTF-8 Multibyte Codes 47 * 48 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary) 49 * ----------------------------------------------------------------------------- 50 * 1 7 0x0000 0x007F 0xxxxxxx 51 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx 52 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx 53 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 54 * ----------------------------------------------------------------------------- 55 */ 56 57 58#define UNICODE_TO_UTF8_LEN(c) \ 59 ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3))) 60 61#define UCS_ALT_NULL 0x2400 62 63/* Surrogate Pair Constants */ 64#define SP_HALF_SHIFT 10 65#define SP_HALF_BASE 0x0010000UL 66#define SP_HALF_MASK 0x3FFUL 67 68#define SP_HIGH_FIRST 0xD800UL 69#define SP_HIGH_LAST 0xDBFFUL 70#define SP_LOW_FIRST 0xDC00UL 71#define SP_LOW_LAST 0xDFFFUL 72 73 74#include "vfs_utfconvdata.h" 75 76 77/* 78 * Test for a combining character. 79 * 80 * Similar to __CFUniCharIsNonBaseCharacter except that 81 * unicode_combinable also includes Hangul Jamo characters. 82 */ 83int 84unicode_combinable(u_int16_t character) 85{ 86 const u_int8_t *bitmap = __CFUniCharCombiningBitmap; 87 u_int8_t value; 88 89 if (character < 0x0300) 90 return (0); 91 92 value = bitmap[(character >> 8) & 0xFF]; 93 94 if (value == 0xFF) { 95 return (1); 96 } else if (value) { 97 bitmap = bitmap + ((value - 1) * 32) + 256; 98 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0); 99 } 100 return (0); 101} 102 103/* 104 * Test for a precomposed character. 105 * 106 * Similar to __CFUniCharIsDecomposableCharacter. 107 */ 108int 109unicode_decomposeable(u_int16_t character) { 110 const u_int8_t *bitmap = __CFUniCharDecomposableBitmap; 111 u_int8_t value; 112 113 if (character < 0x00C0) 114 return (0); 115 116 value = bitmap[(character >> 8) & 0xFF]; 117 118 if (value == 0xFF) { 119 return (1); 120 } else if (value) { 121 bitmap = bitmap + ((value - 1) * 32) + 256; 122 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0); 123 } 124 return (0); 125} 126 127 128/* 129 * Get the combing class. 130 * 131 * Similar to CFUniCharGetCombiningPropertyForCharacter. 132 */ 133static inline u_int8_t 134get_combining_class(u_int16_t character) { 135 const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap; 136 137 u_int8_t value = bitmap[(character >> 8)]; 138 139 if (value) { 140 bitmap = bitmap + (value * 256); 141 return bitmap[character % 256]; 142 } 143 return (0); 144} 145 146 147static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars); 148 149static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining); 150 151static void priortysort(u_int16_t* characters, int count); 152 153static u_int16_t ucs_to_sfm(u_int16_t ucs_ch, int lastchar); 154 155static u_int16_t sfm_to_ucs(u_int16_t ucs_ch); 156 157 158char utf_extrabytes[32] = { 159 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 160 -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 2, 2, 3, -1 161}; 162 163const char hexdigits[16] = { 164 '0', '1', '2', '3', '4', '5', '6', '7', 165 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' 166}; 167 168/* 169 * utf8_encodelen - Calculate the UTF-8 encoding length 170 * 171 * This function takes a Unicode input string, ucsp, of ucslen bytes 172 * and calculates the size of the UTF-8 output in bytes (not including 173 * a NULL termination byte). The string must reside in kernel memory. 174 * 175 * If '/' chars are possible in the Unicode input then an alternate 176 * (replacement) char should be provided in altslash. 177 * 178 * FLAGS 179 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime 180 * 181 * UTF_BIG_ENDIAN: Unicode byte order is always big endian 182 * 183 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian 184 * 185 * UTF_DECOMPOSED: generate fully decomposed output 186 * 187 * UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it 188 * 189 * ERRORS 190 * None 191 */ 192size_t 193utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags) 194{ 195 u_int16_t ucs_ch; 196 u_int16_t * chp = NULL; 197 u_int16_t sequence[8]; 198 int extra = 0; 199 int charcnt; 200 int swapbytes = (flags & UTF_REVERSE_ENDIAN); 201 int decompose = (flags & UTF_DECOMPOSED); 202 size_t len; 203 204 charcnt = ucslen / 2; 205 len = 0; 206 207 while (charcnt-- > 0) { 208 if (extra > 0) { 209 --extra; 210 ucs_ch = *chp++; 211 } else { 212 ucs_ch = *ucsp++; 213 if (swapbytes) { 214 ucs_ch = OSSwapInt16(ucs_ch); 215 } 216 if (ucs_ch == '/') { 217 ucs_ch = altslash ? altslash : '_'; 218 } else if (ucs_ch == '\0') { 219 ucs_ch = UCS_ALT_NULL; 220 } else if (decompose && unicode_decomposeable(ucs_ch)) { 221 extra = unicode_decompose(ucs_ch, sequence) - 1; 222 charcnt += extra; 223 ucs_ch = sequence[0]; 224 chp = &sequence[1]; 225 } 226 } 227 len += UNICODE_TO_UTF8_LEN(ucs_ch); 228 } 229 230 return (len); 231} 232 233 234/* 235 * utf8_encodestr - Encodes a Unicode string to UTF-8 236 * 237 * NOTES: 238 * The resulting UTF-8 string is NULL terminated. 239 * 240 * If '/' chars are allowed on disk then an alternate 241 * (replacement) char must be provided in altslash. 242 * 243 * input flags: 244 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime 245 * 246 * UTF_BIG_ENDIAN: Unicode byte order is always big endian 247 * 248 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian 249 * 250 * UTF_DECOMPOSED: generate fully decomposed output 251 * 252 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output 253 * 254 * result: 255 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded 256 * 257 * EINVAL: Illegal char found; char was replaced by an '_'. 258 */ 259int 260utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p, 261 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags) 262{ 263 u_int8_t * bufstart; 264 u_int8_t * bufend; 265 u_int16_t ucs_ch; 266 u_int16_t * chp = NULL; 267 u_int16_t sequence[8]; 268 int extra = 0; 269 int charcnt; 270 int swapbytes = (flags & UTF_REVERSE_ENDIAN); 271 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0); 272 int decompose = (flags & UTF_DECOMPOSED); 273 int sfmconv = (flags & UTF_SFM_CONVERSIONS); 274 int result = 0; 275 276 bufstart = utf8p; 277 bufend = bufstart + buflen; 278 if (nullterm) 279 --bufend; 280 charcnt = ucslen / 2; 281 282 while (charcnt-- > 0) { 283 if (extra > 0) { 284 --extra; 285 ucs_ch = *chp++; 286 } else { 287 ucs_ch = swapbytes ? OSSwapInt16(*ucsp++) : *ucsp++; 288 289 if (decompose && unicode_decomposeable(ucs_ch)) { 290 extra = unicode_decompose(ucs_ch, sequence) - 1; 291 charcnt += extra; 292 ucs_ch = sequence[0]; 293 chp = &sequence[1]; 294 } 295 } 296 297 /* Slash and NULL are not permitted */ 298 if (ucs_ch == '/') { 299 if (altslash) 300 ucs_ch = altslash; 301 else { 302 ucs_ch = '_'; 303 result = EINVAL; 304 } 305 } else if (ucs_ch == '\0') { 306 ucs_ch = UCS_ALT_NULL; 307 } 308 309 if (ucs_ch < 0x0080) { 310 if (utf8p >= bufend) { 311 result = ENAMETOOLONG; 312 break; 313 } 314 *utf8p++ = ucs_ch; 315 316 } else if (ucs_ch < 0x800) { 317 if ((utf8p + 1) >= bufend) { 318 result = ENAMETOOLONG; 319 break; 320 } 321 *utf8p++ = 0xc0 | (ucs_ch >> 6); 322 *utf8p++ = 0x80 | (0x3f & ucs_ch); 323 324 } else { 325 /* These chars never valid Unicode. */ 326 if (ucs_ch == 0xFFFE || ucs_ch == 0xFFFF) { 327 result = EINVAL; 328 break; 329 } 330 331 /* Combine valid surrogate pairs */ 332 if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST 333 && charcnt > 0) { 334 u_int16_t ch2; 335 u_int32_t pair; 336 337 ch2 = swapbytes ? OSSwapInt16(*ucsp) : *ucsp; 338 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) { 339 pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT) 340 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE; 341 if ((utf8p + 3) >= bufend) { 342 result = ENAMETOOLONG; 343 break; 344 } 345 --charcnt; 346 ++ucsp; 347 *utf8p++ = 0xf0 | (pair >> 18); 348 *utf8p++ = 0x80 | (0x3f & (pair >> 12)); 349 *utf8p++ = 0x80 | (0x3f & (pair >> 6)); 350 *utf8p++ = 0x80 | (0x3f & pair); 351 continue; 352 } 353 } else if (sfmconv) { 354 ucs_ch = sfm_to_ucs(ucs_ch); 355 if (ucs_ch < 0x0080) { 356 if (utf8p >= bufend) { 357 result = ENAMETOOLONG; 358 break; 359 } 360 *utf8p++ = ucs_ch; 361 continue; 362 } 363 } 364 if ((utf8p + 2) >= bufend) { 365 result = ENAMETOOLONG; 366 break; 367 } 368 *utf8p++ = 0xe0 | (ucs_ch >> 12); 369 *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6)); 370 *utf8p++ = 0x80 | (0x3f & ucs_ch); 371 } 372 } 373 374 *utf8len = utf8p - bufstart; 375 if (nullterm) 376 *utf8p++ = '\0'; 377 378 return (result); 379} 380 381 382/* 383 * utf8_decodestr - Decodes a UTF-8 string back to Unicode 384 * 385 * NOTES: 386 * The input UTF-8 string does not need to be null terminated 387 * if utf8len is set. 388 * 389 * If '/' chars are allowed on disk then an alternate 390 * (replacement) char must be provided in altslash. 391 * 392 * input flags: 393 * UTF_REV_ENDIAN: Unicode byte order is opposite current runtime 394 * 395 * UTF_BIG_ENDIAN: Unicode byte order is always big endian 396 * 397 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian 398 * 399 * UTF_DECOMPOSED: generate fully decomposed output (NFD) 400 * 401 * UTF_PRECOMPOSED: generate precomposed output (NFC) 402 * 403 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input 404 * 405 * result: 406 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded. 407 * 408 * EINVAL: Illegal UTF-8 sequence found. 409 */ 410int 411utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp, 412 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags) 413{ 414 u_int16_t* bufstart; 415 u_int16_t* bufend; 416 unsigned int ucs_ch; 417 unsigned int byte; 418 int combcharcnt = 0; 419 int result = 0; 420 int decompose, precompose, swapbytes, escaping; 421 int sfmconv; 422 int extrabytes; 423 424 decompose = (flags & UTF_DECOMPOSED); 425 precompose = (flags & UTF_PRECOMPOSED); 426 swapbytes = (flags & UTF_REVERSE_ENDIAN); 427 escaping = (flags & UTF_ESCAPE_ILLEGAL); 428 sfmconv = (flags & UTF_SFM_CONVERSIONS); 429 430 bufstart = ucsp; 431 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen); 432 433 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') { 434 if (ucsp >= bufend) 435 goto toolong; 436 437 /* check for ascii */ 438 if (byte < 0x80) { 439 ucs_ch = sfmconv ? ucs_to_sfm(byte, utf8len == 0) : byte; 440 } else { 441 u_int32_t ch; 442 443 extrabytes = utf_extrabytes[byte >> 3]; 444 if ((extrabytes < 0) || ((int)utf8len < extrabytes)) { 445 goto escape; 446 } 447 utf8len -= extrabytes; 448 449 switch (extrabytes) { 450 case 1: 451 ch = byte; ch <<= 6; /* 1st byte */ 452 byte = *utf8p++; /* 2nd byte */ 453 if ((byte >> 6) != 2) 454 goto escape2; 455 ch += byte; 456 ch -= 0x00003080UL; 457 if (ch < 0x0080) 458 goto escape2; 459 ucs_ch = ch; 460 break; 461 case 2: 462 ch = byte; ch <<= 6; /* 1st byte */ 463 byte = *utf8p++; /* 2nd byte */ 464 if ((byte >> 6) != 2) 465 goto escape2; 466 ch += byte; ch <<= 6; 467 byte = *utf8p++; /* 3rd byte */ 468 if ((byte >> 6) != 2) 469 goto escape3; 470 ch += byte; 471 ch -= 0x000E2080UL; 472 if (ch < 0x0800) 473 goto escape3; 474 if (ch >= 0xD800) { 475 if (ch <= 0xDFFF) 476 goto escape3; 477 if (ch == 0xFFFE || ch == 0xFFFF) 478 goto escape3; 479 } 480 ucs_ch = ch; 481 break; 482 case 3: 483 ch = byte; ch <<= 6; /* 1st byte */ 484 byte = *utf8p++; /* 2nd byte */ 485 if ((byte >> 6) != 2) 486 goto escape2; 487 ch += byte; ch <<= 6; 488 byte = *utf8p++; /* 3rd byte */ 489 if ((byte >> 6) != 2) 490 goto escape3; 491 ch += byte; ch <<= 6; 492 byte = *utf8p++; /* 4th byte */ 493 if ((byte >> 6) != 2) 494 goto escape4; 495 ch += byte; 496 ch -= 0x03C82080UL + SP_HALF_BASE; 497 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST; 498 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST) 499 goto escape4; 500 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch; 501 if (ucsp >= bufend) 502 goto toolong; 503 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST; 504 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST) { 505 --ucsp; 506 goto escape4; 507 } 508 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch; 509 continue; 510 default: 511 result = EINVAL; 512 goto exit; 513 } 514 if (decompose) { 515 if (unicode_decomposeable(ucs_ch)) { 516 u_int16_t sequence[8]; 517 int count, i; 518 519 /* Before decomposing a new unicode character, sort 520 * previous combining characters, if any, and reset 521 * the counter. 522 */ 523 if (combcharcnt > 1) { 524 priortysort(ucsp - combcharcnt, combcharcnt); 525 } 526 combcharcnt = 0; 527 528 count = unicode_decompose(ucs_ch, sequence); 529 for (i = 0; i < count; ++i) { 530 ucs_ch = sequence[i]; 531 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch; 532 if (ucsp >= bufend) 533 goto toolong; 534 } 535 combcharcnt += count - 1; 536 continue; 537 } 538 } else if (precompose && (ucsp != bufstart)) { 539 u_int16_t composite, base; 540 541 if (unicode_combinable(ucs_ch)) { 542 base = swapbytes ? OSSwapInt16(*(ucsp - 1)) : *(ucsp - 1); 543 composite = unicode_combine(base, ucs_ch); 544 if (composite) { 545 --ucsp; 546 ucs_ch = composite; 547 } 548 } 549 } 550 if (ucs_ch == UCS_ALT_NULL) 551 ucs_ch = '\0'; 552 } 553 if (ucs_ch == altslash) 554 ucs_ch = '/'; 555 556 /* 557 * Make multiple combining character sequences canonical 558 */ 559 if (unicode_combinable(ucs_ch)) { 560 ++combcharcnt; /* start tracking a run */ 561 } else if (combcharcnt) { 562 if (combcharcnt > 1) { 563 priortysort(ucsp - combcharcnt, combcharcnt); 564 } 565 combcharcnt = 0; /* start over */ 566 } 567 568 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch; 569 continue; 570 571 /* 572 * Escape illegal UTF-8 into something legal. 573 */ 574escape4: 575 utf8p -= 3; 576 goto escape; 577escape3: 578 utf8p -= 2; 579 goto escape; 580escape2: 581 utf8p -= 1; 582escape: 583 if (!escaping) { 584 result = EINVAL; 585 goto exit; 586 } 587 if (extrabytes > 0) 588 utf8len += extrabytes; 589 byte = *(utf8p - 1); 590 591 if ((ucsp + 2) >= bufend) 592 goto toolong; 593 594 /* Make a previous combining sequence canonical. */ 595 if (combcharcnt > 1) { 596 priortysort(ucsp - combcharcnt, combcharcnt); 597 } 598 combcharcnt = 0; 599 600 ucs_ch = '%'; 601 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch; 602 ucs_ch = hexdigits[byte >> 4]; 603 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch; 604 ucs_ch = hexdigits[byte & 0x0F]; 605 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch; 606 } 607 /* 608 * Make a previous combining sequence canonical 609 */ 610 if (combcharcnt > 1) { 611 priortysort(ucsp - combcharcnt, combcharcnt); 612 } 613exit: 614 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart; 615 616 return (result); 617 618toolong: 619 result = ENAMETOOLONG; 620 goto exit; 621} 622 623 624/* 625 * utf8_validatestr - Check for a valid UTF-8 string. 626 */ 627int 628utf8_validatestr(const u_int8_t* utf8p, size_t utf8len) 629{ 630 unsigned int byte; 631 u_int32_t ch; 632 unsigned int ucs_ch; 633 size_t extrabytes; 634 635 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') { 636 if (byte < 0x80) 637 continue; /* plain ascii */ 638 639 extrabytes = utf_extrabytes[byte >> 3]; 640 641 if (utf8len < extrabytes) 642 goto invalid; 643 utf8len -= extrabytes; 644 645 switch (extrabytes) { 646 case 1: 647 ch = byte; ch <<= 6; /* 1st byte */ 648 byte = *utf8p++; /* 2nd byte */ 649 if ((byte >> 6) != 2) 650 goto invalid; 651 ch += byte; 652 ch -= 0x00003080UL; 653 if (ch < 0x0080) 654 goto invalid; 655 break; 656 case 2: 657 ch = byte; ch <<= 6; /* 1st byte */ 658 byte = *utf8p++; /* 2nd byte */ 659 if ((byte >> 6) != 2) 660 goto invalid; 661 ch += byte; ch <<= 6; 662 byte = *utf8p++; /* 3rd byte */ 663 if ((byte >> 6) != 2) 664 goto invalid; 665 ch += byte; 666 ch -= 0x000E2080UL; 667 if (ch < 0x0800) 668 goto invalid; 669 if (ch >= 0xD800) { 670 if (ch <= 0xDFFF) 671 goto invalid; 672 if (ch == 0xFFFE || ch == 0xFFFF) 673 goto invalid; 674 } 675 break; 676 case 3: 677 ch = byte; ch <<= 6; /* 1st byte */ 678 byte = *utf8p++; /* 2nd byte */ 679 if ((byte >> 6) != 2) 680 goto invalid; 681 ch += byte; ch <<= 6; 682 byte = *utf8p++; /* 3rd byte */ 683 if ((byte >> 6) != 2) 684 goto invalid; 685 ch += byte; ch <<= 6; 686 byte = *utf8p++; /* 4th byte */ 687 if ((byte >> 6) != 2) 688 goto invalid; 689 ch += byte; 690 ch -= 0x03C82080UL + SP_HALF_BASE; 691 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST; 692 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST) 693 goto invalid; 694 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST; 695 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST) 696 goto invalid; 697 break; 698 default: 699 goto invalid; 700 } 701 702 } 703 return (0); 704invalid: 705 return (EINVAL); 706} 707 708/* 709 * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD) 710 * 711 * This function takes an UTF-8 input string, instr, of inlen bytes 712 * and produces normalized UTF-8 output into a buffer of buflen bytes 713 * pointed to by outstr. The size of the output in bytes (not including 714 * a NULL termination byte) is returned in outlen. In-place conversions 715 * are not supported (i.e. instr != outstr).] 716 717 * FLAGS 718 * UTF_DECOMPOSED: output string will be fully decomposed (NFD) 719 * 720 * UTF_PRECOMPOSED: output string will be precomposed (NFC) 721 * 722 * UTF_NO_NULL_TERM: do not add null termination to output string 723 * 724 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input 725 * 726 * ERRORS 727 * ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes 728 * 729 * EINVAL: illegal UTF-8 sequence encountered or invalid flags 730 */ 731int 732utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr, 733 size_t *outlen, size_t buflen, int flags) 734{ 735 u_int16_t unicodebuf[32]; 736 u_int16_t* unistr = NULL; 737 size_t unicode_bytes; 738 size_t uft8_bytes; 739 size_t inbuflen; 740 u_int8_t *outbufstart, *outbufend; 741 const u_int8_t *inbufstart; 742 unsigned int byte; 743 int decompose, precompose; 744 int result = 0; 745 746 if (flags & ~(UTF_DECOMPOSED | UTF_PRECOMPOSED | UTF_NO_NULL_TERM | UTF_ESCAPE_ILLEGAL)) { 747 return (EINVAL); 748 } 749 decompose = (flags & UTF_DECOMPOSED); 750 precompose = (flags & UTF_PRECOMPOSED); 751 if ((decompose && precompose) || (!decompose && !precompose)) { 752 return (EINVAL); 753 } 754 outbufstart = outstr; 755 outbufend = outbufstart + buflen; 756 inbufstart = instr; 757 inbuflen = inlen; 758 759 while (inlen-- > 0 && (byte = *instr++) != '\0') { 760 if (outstr >= outbufend) { 761 result = ENAMETOOLONG; 762 goto exit; 763 } 764 if (byte >= 0x80) { 765 goto nonASCII; 766 } 767 /* ASCII is already normalized. */ 768 *outstr++ = byte; 769 } 770exit: 771 *outlen = outstr - outbufstart; 772 if (((flags & UTF_NO_NULL_TERM) == 0)) { 773 if (outstr < outbufend) 774 *outstr++ = '\0'; 775 else 776 result = ENAMETOOLONG; 777 } 778 return (result); 779 780 781 /* 782 * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr 783 * functions to perform the normalization. Since this will 784 * presumably be used to normalize filenames in the back-end 785 * (on disk or over-the-wire), it should be fast enough. 786 */ 787nonASCII: 788 789 /* Make sure the input size is reasonable. */ 790 if (inbuflen > MAXPATHLEN) { 791 result = ENAMETOOLONG; 792 goto exit; 793 } 794 /* 795 * Compute worst case Unicode buffer size. 796 * 797 * For pre-composed output, every UTF-8 input byte will be at 798 * most 2 Unicode bytes. For decomposed output, 2 UTF-8 bytes 799 * (smallest composite char sequence) may yield 6 Unicode bytes 800 * (1 base char + 2 combining chars). 801 */ 802 unicode_bytes = precompose ? (inbuflen * 2) : (inbuflen * 3); 803 804 if (unicode_bytes <= sizeof(unicodebuf)) 805 unistr = &unicodebuf[0]; 806 else 807 MALLOC(unistr, u_int16_t *, unicode_bytes, M_TEMP, M_WAITOK); 808 809 /* Normalize the string. */ 810 result = utf8_decodestr(inbufstart, inbuflen, unistr, &unicode_bytes, 811 unicode_bytes, 0, flags & ~UTF_NO_NULL_TERM); 812 if (result == 0) { 813 /* Put results back into UTF-8. */ 814 result = utf8_encodestr(unistr, unicode_bytes, outbufstart, 815 &uft8_bytes, buflen, 0, UTF_NO_NULL_TERM); 816 outstr = outbufstart + uft8_bytes; 817 } 818 if (unistr && unistr != &unicodebuf[0]) { 819 FREE(unistr, M_TEMP); 820 } 821 goto exit; 822} 823 824 825 /* 826 * Unicode 3.2 decomposition code (derived from Core Foundation) 827 */ 828 829typedef struct { 830 u_int32_t _key; 831 u_int32_t _value; 832} unicode_mappings32; 833 834static inline u_int32_t 835getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem, 836 u_int16_t character) 837{ 838 const unicode_mappings32 *p, *q, *divider; 839 840 if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key)) 841 return (0); 842 843 p = theTable; 844 q = p + (numElem-1); 845 while (p <= q) { 846 divider = p + ((q - p) >> 1); /* divide by 2 */ 847 if (character < divider->_key) { q = divider - 1; } 848 else if (character > divider->_key) { p = divider + 1; } 849 else { return (divider->_value); } 850 } 851 return (0); 852} 853 854#define RECURSIVE_DECOMPOSITION (1 << 15) 855#define EXTRACT_COUNT(value) (((value) >> 12) & 0x0007) 856 857typedef struct { 858 u_int16_t _key; 859 u_int16_t _value; 860} unicode_mappings16; 861 862static inline u_int16_t 863getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem, 864 u_int16_t character) 865{ 866 const unicode_mappings16 *p, *q, *divider; 867 868 if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key)) 869 return (0); 870 871 p = theTable; 872 q = p + (numElem-1); 873 while (p <= q) { 874 divider = p + ((q - p) >> 1); /* divide by 2 */ 875 if (character < divider->_key) 876 q = divider - 1; 877 else if (character > divider->_key) 878 p = divider + 1; 879 else 880 return (divider->_value); 881 } 882 return (0); 883} 884 885 886static u_int32_t 887unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars) 888{ 889 u_int16_t value; 890 u_int32_t length; 891 u_int16_t firstChar; 892 u_int16_t theChar; 893 const u_int16_t *bmpMappings; 894 u_int32_t usedLength; 895 896 value = getmappedvalue16( 897 (const unicode_mappings16 *)__CFUniCharDecompositionTable, 898 __UniCharDecompositionTableLength, character); 899 length = EXTRACT_COUNT(value); 900 firstChar = value & 0x0FFF; 901 theChar = firstChar; 902 bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar); 903 usedLength = 0; 904 905 if (value & RECURSIVE_DECOMPOSITION) { 906 usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars); 907 908 --length; /* Decrement for the first char */ 909 if (!usedLength) 910 return 0; 911 ++bmpMappings; 912 convertedChars += usedLength; 913 } 914 915 usedLength += length; 916 917 while (length--) 918 *(convertedChars++) = *(bmpMappings++); 919 920 return (usedLength); 921} 922 923#define HANGUL_SBASE 0xAC00 924#define HANGUL_LBASE 0x1100 925#define HANGUL_VBASE 0x1161 926#define HANGUL_TBASE 0x11A7 927 928#define HANGUL_SCOUNT 11172 929#define HANGUL_LCOUNT 19 930#define HANGUL_VCOUNT 21 931#define HANGUL_TCOUNT 28 932#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT) 933 934/* 935 * unicode_decompose - decompose a composed Unicode char 936 * 937 * Composed Unicode characters are forbidden on 938 * HFS Plus volumes. ucs_decompose will convert a 939 * composed character into its correct decomposed 940 * sequence. 941 * 942 * Similar to CFUniCharDecomposeCharacter 943 */ 944static int 945unicode_decompose(u_int16_t character, u_int16_t *convertedChars) 946{ 947 if ((character >= HANGUL_SBASE) && 948 (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) { 949 u_int32_t length; 950 951 character -= HANGUL_SBASE; 952 length = (character % HANGUL_TCOUNT ? 3 : 2); 953 954 *(convertedChars++) = 955 character / HANGUL_NCOUNT + HANGUL_LBASE; 956 *(convertedChars++) = 957 (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE; 958 if (length > 2) 959 *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE; 960 return (length); 961 } else { 962 return (unicode_recursive_decompose(character, convertedChars)); 963 } 964} 965 966/* 967 * unicode_combine - generate a precomposed Unicode char 968 * 969 * Precomposed Unicode characters are required for some volume 970 * formats and network protocols. unicode_combine will combine 971 * a decomposed character sequence into a single precomposed 972 * (composite) character. 973 * 974 * Similar toCFUniCharPrecomposeCharacter but unicode_combine 975 * also handles Hangul Jamo characters. 976 */ 977static u_int16_t 978unicode_combine(u_int16_t base, u_int16_t combining) 979{ 980 u_int32_t value; 981 982 /* Check HANGUL */ 983 if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) { 984 /* 2 char Hangul sequences */ 985 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) && 986 (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) { 987 return (HANGUL_SBASE + 988 ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) + 989 ((combining - HANGUL_VBASE)*HANGUL_TCOUNT)); 990 } 991 992 /* 3 char Hangul sequences */ 993 if ((combining > HANGUL_TBASE) && 994 (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) { 995 if ((base - HANGUL_SBASE) % HANGUL_TCOUNT) 996 return (0); 997 else 998 return (base + (combining - HANGUL_TBASE)); 999 } 1000 } 1001 1002 value = getmappedvalue32( 1003 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable, 1004 __CFUniCharPrecompositionTableLength, combining); 1005 1006 if (value) { 1007 value = getmappedvalue16( 1008 (const unicode_mappings16 *) 1009 ((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)), 1010 (value >> 16), base); 1011 } 1012 return (value); 1013} 1014 1015 1016/* 1017 * priortysort - order combining chars into canonical order 1018 * 1019 * Similar to CFUniCharPrioritySort 1020 */ 1021static void 1022priortysort(u_int16_t* characters, int count) 1023{ 1024 u_int32_t p1, p2; 1025 u_int16_t *ch1, *ch2; 1026 u_int16_t *end; 1027 int changes = 0; 1028 1029 end = characters + count; 1030 do { 1031 changes = 0; 1032 ch1 = characters; 1033 ch2 = characters + 1; 1034 p2 = get_combining_class(*ch1); 1035 while (ch2 < end) { 1036 p1 = p2; 1037 p2 = get_combining_class(*ch2); 1038 if (p1 > p2 && p2 != 0) { 1039 u_int32_t tmp; 1040 1041 tmp = *ch1; 1042 *ch1 = *ch2; 1043 *ch2 = tmp; 1044 changes = 1; 1045 1046 /* 1047 * Make sure that p2 contains the combining class for the 1048 * character now stored at *ch2. This isn't required for 1049 * correctness, but it will be more efficient if a character 1050 * with a large combining class has to "bubble past" several 1051 * characters with lower combining classes. 1052 */ 1053 p2 = p1; 1054 } 1055 ++ch1; 1056 ++ch2; 1057 } 1058 } while (changes); 1059} 1060 1061 1062/* 1063 * Invalid NTFS filename characters are encodeded using the 1064 * SFM (Services for Macintosh) private use Unicode characters. 1065 * 1066 * These should only be used for SMB, MSDOS or NTFS. 1067 * 1068 * Illegal NTFS Char SFM Unicode Char 1069 * ---------------------------------------- 1070 * 0x01-0x1f 0xf001-0xf01f 1071 * '"' 0xf020 1072 * '*' 0xf021 1073 * '/' 0xf022 1074 * '<' 0xf023 1075 * '>' 0xf024 1076 * '?' 0xf025 1077 * '\' 0xf026 1078 * '|' 0xf027 1079 * ' ' 0xf028 (Only if last char of the name) 1080 * '.' 0xf029 (Only if last char of the name) 1081 * ---------------------------------------- 1082 * 1083 * Reference: http://support.microsoft.com/kb/q117258/ 1084 */ 1085 1086#define MAX_SFM2MAC 0x29 1087#define SFMCODE_PREFIX_MASK 0xf000 1088 1089/* 1090 * In the Mac OS 9 days the colon was illegal in a file name. For that reason 1091 * SFM had no conversion for the colon. There is a conversion for the 1092 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon 1093 * is a slash and a slash is a colon. So we can just replace the slash with the 1094 * colon in our tables and everything will just work. 1095 */ 1096static u_int8_t 1097sfm2mac[42] = { 1098 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 00 - 07 */ 1099 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 08 - 0F */ 1100 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 10 - 17 */ 1101 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 18 - 1F */ 1102 0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c, /* 20 - 27 */ 1103 0x20, 0x2e /* 28 - 29 */ 1104}; 1105 1106static u_int8_t 1107mac2sfm[112] = { 1108 0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27, /* 20 - 27 */ 1109 0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22, /* 28 - 2f */ 1110 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 30 - 37 */ 1111 0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25, /* 38 - 3f */ 1112 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 40 - 47 */ 1113 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 48 - 4f */ 1114 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 50 - 57 */ 1115 0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f, /* 58 - 5f */ 1116 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 60 - 67 */ 1117 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 68 - 6f */ 1118 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 70 - 77 */ 1119 0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f /* 78 - 7f */ 1120}; 1121 1122 1123/* 1124 * Encode illegal NTFS filename characters into SFM Private Unicode characters 1125 * 1126 * Assumes non-zero ASCII input. 1127 */ 1128static u_int16_t 1129ucs_to_sfm(u_int16_t ucs_ch, int lastchar) 1130{ 1131 /* The last character of filename cannot be a space or period. */ 1132 if (lastchar) { 1133 if (ucs_ch == 0x20) 1134 return (0xf028); 1135 else if (ucs_ch == 0x2e) 1136 return (0xf029); 1137 } 1138 /* 0x01 - 0x1f is simple transformation. */ 1139 if (ucs_ch <= 0x1f) { 1140 return (ucs_ch | 0xf000); 1141 } else /* 0x20 - 0x7f */ { 1142 u_int16_t lsb; 1143 1144 lsb = mac2sfm[ucs_ch - 0x0020]; 1145 if (lsb != ucs_ch) 1146 return(0xf000 | lsb); 1147 } 1148 return (ucs_ch); 1149} 1150 1151/* 1152 * Decode any SFM Private Unicode characters 1153 */ 1154static u_int16_t 1155sfm_to_ucs(u_int16_t ucs_ch) 1156{ 1157 if (((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) && 1158 ((ucs_ch & 0x003f) <= MAX_SFM2MAC)) { 1159 ucs_ch = sfm2mac[ucs_ch & 0x003f]; 1160 } 1161 return (ucs_ch); 1162} 1163 1164 1165