1/* 2 * Copyright (c) 2000-2013 Apple Computer, Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 File: UnicodeWrappers.c 30 31 Contains: Wrapper routines for Unicode conversion and comparison. 32 33*/ 34 35#if HFS 36#include <sys/param.h> 37#include <sys/utfconv.h> 38 39#include "../../hfs_macos_defs.h" 40#include "UCStringCompareData.h" 41 42#include "../headers/FileMgrInternal.h" 43#include "../headers/HFSUnicodeWrappers.h" 44 45enum { 46 kMinFileExtensionChars = 1, /* does not include dot */ 47 kMaxFileExtensionChars = 5 /* does not include dot */ 48}; 49 50 51#define EXTENSIONCHAR(c) (((c) >= 0x61 && (c) <= 0x7A) || \ 52 ((c) >= 0x41 && (c) <= 0x5A) || \ 53 ((c) >= 0x30 && (c) <= 0x39)) 54 55 56#define IsHexDigit(c) (((c) >= (u_int8_t) '0' && (c) <= (u_int8_t) '9') || \ 57 ((c) >= (u_int8_t) 'A' && (c) <= (u_int8_t) 'F')) 58 59 60static void GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr ); 61 62 63static u_int32_t HexStringToInteger( u_int32_t length, const u_int8_t *hexStr ); 64 65 66/* 67 * Get filename extension (if any) as a C string 68 */ 69static void 70GetFilenameExtension(ItemCount length, ConstUniCharArrayPtr unicodeStr, char * extStr) 71{ 72 u_int32_t i; 73 UniChar c; 74 u_int16_t extChars; /* number of extension chars (excluding dot) */ 75 u_int16_t maxExtChars; 76 Boolean foundExtension; 77 78 extStr[0] = '\0'; /* assume there's no extension */ 79 80 if ( length < 3 ) 81 return; /* "x.y" is smallest possible extension */ 82 83 if ( length < (kMaxFileExtensionChars + 2) ) 84 maxExtChars = length - 2; /* save room for prefix + dot */ 85 else 86 maxExtChars = kMaxFileExtensionChars; 87 88 i = length; 89 extChars = 0; 90 foundExtension = false; 91 92 while ( extChars <= maxExtChars ) { 93 c = unicodeStr[--i]; 94 95 /* look for leading dot */ 96 if ( c == (UniChar) '.' ) { 97 if ( extChars > 0 ) /* cannot end with a dot */ 98 foundExtension = true; 99 break; 100 } 101 102 if ( EXTENSIONCHAR(c) ) 103 ++extChars; 104 else 105 break; 106 } 107 108 /* if we found one then copy it */ 109 if ( foundExtension ) { 110 u_int8_t *extStrPtr = (u_int8_t *)extStr; 111 const UniChar *unicodeStrPtr = &unicodeStr[i]; 112 113 for ( i = 0; i <= extChars; ++i ) 114 *(extStrPtr++) = (u_int8_t) *(unicodeStrPtr++); 115 extStr[extChars + 1] = '\0'; /* terminate extension + dot */ 116 } 117} 118 119 120 121/* 122 * Count filename extension characters (if any) 123 */ 124__private_extern__ u_int32_t 125CountFilenameExtensionChars( const unsigned char * filename, u_int32_t length ) 126{ 127 u_int32_t i; 128 UniChar c; 129 u_int32_t extChars; /* number of extension chars (excluding dot) */ 130 u_int16_t maxExtChars; 131 Boolean foundExtension; 132 133 if ( length < 3 ) 134 return 0; /* "x.y" is smallest possible extension */ 135 136 if ( length < (kMaxFileExtensionChars + 2) ) 137 maxExtChars = length - 2; /* save room for prefix + dot */ 138 else 139 maxExtChars = kMaxFileExtensionChars; 140 141 extChars = 0; /* assume there's no extension */ 142 i = length - 1; /* index to last ascii character */ 143 foundExtension = false; 144 145 while ( extChars <= maxExtChars ) { 146 c = filename[i--]; 147 148 /* look for leading dot */ 149 if ( c == (u_int8_t) '.' ) { 150 if ( extChars > 0 ) /* cannot end with a dot */ 151 return (extChars); 152 153 break; 154 } 155 156 if ( EXTENSIONCHAR(c) ) 157 ++extChars; 158 else 159 break; 160 } 161 162 return 0; 163} 164 165 166/* 167 * extract the file id from a mangled name 168 */ 169HFSCatalogNodeID 170GetEmbeddedFileID(const unsigned char * filename, u_int32_t length, u_int32_t *prefixLength) 171{ 172 short extChars; 173 short i; 174 u_int8_t c; 175 176 *prefixLength = 0; 177 178 if ( filename == NULL ) 179 return 0; 180 181 if ( length < 28 ) 182 return 0; /* too small to have been mangled */ 183 184 /* big enough for a file ID (#10) and an extension (.x) ? */ 185 if ( length > 5 ) 186 extChars = CountFilenameExtensionChars(filename, length); 187 else 188 extChars = 0; 189 190 /* skip over dot plus extension characters */ 191 if ( extChars > 0 ) 192 length -= (extChars + 1); 193 194 /* scan for file id digits */ 195 for ( i = length - 1; i >= 0; --i) { 196 c = filename[i]; 197 198 /* look for file ID marker */ 199 if ( c == '#' ) { 200 if ( (length - i) < 3 ) 201 break; /* too small to be a file ID */ 202 203 *prefixLength = i; 204 return HexStringToInteger(length - i - 1, &filename[i+1]); 205 } 206 207 if ( !IsHexDigit(c) ) 208 break; /* file ID string must have hex digits */ 209 } 210 211 return 0; 212} 213 214 215 216static u_int32_t 217HexStringToInteger(u_int32_t length, const u_int8_t *hexStr) 218{ 219 u_int32_t value; 220 u_int32_t i; 221 u_int8_t c; 222 const u_int8_t *p; 223 224 value = 0; 225 p = hexStr; 226 227 for ( i = 0; i < length; ++i ) { 228 c = *p++; 229 230 if (c >= '0' && c <= '9') { 231 value = value << 4; 232 value += (u_int32_t) c - (u_int32_t) '0'; 233 } else if (c >= 'A' && c <= 'F') { 234 value = value << 4; 235 value += 10 + ((unsigned int) c - (unsigned int) 'A'); 236 } else { 237 return 0; /* bad character */ 238 } 239 } 240 241 return value; 242} 243 244 245/* 246 * Routine: FastRelString 247 * 248 * Output: returns -1 if str1 < str2 249 * returns 1 if str1 > str2 250 * return 0 if equal 251 * 252 */ 253int32_t FastRelString( ConstStr255Param str1, ConstStr255Param str2 ) 254{ 255 u_int16_t* compareTable; 256 int32_t bestGuess; 257 u_int8_t length, length2; 258 u_int8_t delta; 259 260 delta = 0; 261 length = *(str1++); 262 length2 = *(str2++); 263 264 if (length == length2) 265 bestGuess = 0; 266 else if (length < length2) 267 { 268 bestGuess = -1; 269 delta = length2 - length; 270 } 271 else 272 { 273 bestGuess = 1; 274 length = length2; 275 } 276 277 compareTable = (u_int16_t*) gCompareTable; 278 279 while (length--) 280 { 281 u_int8_t aChar, bChar; 282 283 aChar = *(str1++); 284 bChar = *(str2++); 285 286 if (aChar != bChar) // If they don't match exacly, do case conversion 287 { 288 u_int16_t aSortWord, bSortWord; 289 290 aSortWord = compareTable[aChar]; 291 bSortWord = compareTable[bChar]; 292 293 if (aSortWord > bSortWord) 294 return 1; 295 296 if (aSortWord < bSortWord) 297 return -1; 298 } 299 300 // If characters match exactly, then go on to next character immediately without 301 // doing any extra work. 302 } 303 304 // if you got to here, then return bestGuess 305 return bestGuess; 306} 307 308 309 310// 311// FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering 312// 313// IF RESULT 314// -------------------------- 315// str1 < str2 => -1 316// str1 = str2 => 0 317// str1 > str2 => +1 318// 319// The lower case table starts with 256 entries (one for each of the upper bytes 320// of the original Unicode char). If that entry is zero, then all characters with 321// that upper byte are already case folded. If the entry is non-zero, then it is 322// the _index_ (not byte offset) of the start of the sub-table for the characters 323// with that upper byte. All ignorable characters are folded to the value zero. 324// 325// In pseudocode: 326// 327// Let c = source Unicode character 328// Let table[] = lower case table 329// 330// lower = table[highbyte(c)] 331// if (lower == 0) 332// lower = c 333// else 334// lower = table[lower+lowbyte(c)] 335// 336// if (lower == 0) 337// ignore this character 338// 339// To handle ignorable characters, we now need a loop to find the next valid character. 340// Also, we can't pre-compute the number of characters to compare; the string length might 341// be larger than the number of non-ignorable characters. Further, we must be able to handle 342// ignorable characters at any point in the string, including as the first or last characters. 343// We use a zero value as a sentinel to detect both end-of-string and ignorable characters. 344// Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename, 345// the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is 346// an invalid Unicode character). 347// 348// Pseudocode: 349// 350// while (1) { 351// c1 = GetNextValidChar(str1) // returns zero if at end of string 352// c2 = GetNextValidChar(str2) 353// 354// if (c1 != c2) break // found a difference 355// 356// if (c1 == 0) // reached end of string on both strings at once? 357// return 0; // yes, so strings are equal 358// } 359// 360// // When we get here, c1 != c2. So, we just need to determine which one is less. 361// if (c1 < c2) 362// return -1; 363// else 364// return 1; 365// 366 367int32_t FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1, 368 register ConstUniCharArrayPtr str2, register ItemCount length2) 369{ 370 register u_int16_t c1,c2; 371 register u_int16_t temp; 372 register u_int16_t* lowerCaseTable; 373 374 lowerCaseTable = (u_int16_t*) gLowerCaseTable; 375 376 while (1) { 377 /* Set default values for c1, c2 in case there are no more valid chars */ 378 c1 = 0; 379 c2 = 0; 380 381 /* Find next non-ignorable char from str1, or zero if no more */ 382 while (length1 && c1 == 0) { 383 c1 = *(str1++); 384 --length1; 385 /* check for basic latin first */ 386 if (c1 < 0x0100) { 387 c1 = gLatinCaseFold[c1]; 388 break; 389 } 390 /* case fold if neccessary */ 391 if ((temp = lowerCaseTable[c1>>8]) != 0) 392 c1 = lowerCaseTable[temp + (c1 & 0x00FF)]; 393 } 394 395 396 /* Find next non-ignorable char from str2, or zero if no more */ 397 while (length2 && c2 == 0) { 398 c2 = *(str2++); 399 --length2; 400 /* check for basic latin first */ 401 if (c2 < 0x0100) { 402 c2 = gLatinCaseFold[c2]; 403 break; 404 } 405 /* case fold if neccessary */ 406 if ((temp = lowerCaseTable[c2>>8]) != 0) 407 c2 = lowerCaseTable[temp + (c2 & 0x00FF)]; 408 } 409 410 if (c1 != c2) // found a difference, so stop looping 411 break; 412 413 if (c1 == 0) // did we reach the end of both strings at the same time? 414 return 0; // yes, so strings are equal 415 } 416 417 if (c1 < c2) 418 return -1; 419 else 420 return 1; 421} 422 423/* 424 * UnicodeBinaryCompare 425 * Compare two UTF-16 strings and perform case-sensitive (binary) matching against them. 426 * 427 * Results are emitted like FastUnicodeCompare: 428 * 429 * 430 * IF RESULT 431 * -------------------------- 432 * str1 < str2 => -1 433 * str1 = str2 => 0 434 * str1 > str2 => +1 435 * 436 * The case matching source code is greatly simplified due to the lack of case-folding 437 * in this comparison routine. We compare, in order: the lengths, then do character-by- 438 * character comparisons. 439 * 440 */ 441int32_t UnicodeBinaryCompare (register ConstUniCharArrayPtr str1, register ItemCount len1, 442 register ConstUniCharArrayPtr str2, register ItemCount len2) { 443 uint16_t c1; 444 uint16_t c2; 445 int string_length; 446 int32_t result = 0; 447 448 /* Set default values for the two character pointers */ 449 c1 = 0; 450 c2 = 0; 451 452 /* First generate the string length (for comparison purposes) */ 453 if (len1 < len2) { 454 string_length = len1; 455 --result; 456 } 457 else if (len1 > len2) { 458 string_length = len2; 459 ++result; 460 } 461 else { 462 string_length = len1; 463 } 464 465 /* now compare the two string pointers */ 466 while (string_length--) { 467 c1 = *(str1++); 468 c2 = *(str2++); 469 470 if (c1 > c2) { 471 result = 1; 472 break; 473 } 474 475 if (c1 < c2) { 476 result = -1; 477 break; 478 } 479 /* If equal, iterate to the next two respective chars */ 480 } 481 482 return result; 483} 484 485 486OSErr 487ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen, 488 ByteCount *actualDstLen, unsigned char* dstStr, HFSCatalogNodeID cnid) 489{ 490 ByteCount subMaxLen; 491 size_t utf8len; 492 char fileIDStr[15]; 493 char extStr[15]; 494 495 snprintf(fileIDStr, sizeof(fileIDStr), "#%X", cnid); 496 GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extStr); 497 498 /* remove extension chars from source */ 499 srcLen -= strlen(extStr) * sizeof(UniChar); 500 subMaxLen = maxDstLen - (strlen(extStr) + strlen(fileIDStr)); 501 502 (void) utf8_encodestr(srcStr, srcLen, dstStr, &utf8len, subMaxLen, ':', 0); 503 504 strlcat((char *)dstStr, fileIDStr, maxDstLen); 505 strlcat((char *)dstStr, extStr, maxDstLen); 506 *actualDstLen = utf8len + (strlen(extStr) + strlen(fileIDStr)); 507 508 return noErr; 509} 510 511#else /* not HFS - temp workaround until 4277828 is fixed */ 512/* stubs for exported routines that aren't present when we build kernel without HFS */ 513 514#include <sys/types.h> 515 516int32_t FastUnicodeCompare( void * str1, u_int32_t length1, void * str2, u_int32_t length2 ); 517 518 519int32_t FastUnicodeCompare( __unused void * str1, 520 __unused u_int32_t length1, 521 __unused void * str2, 522 __unused u_int32_t length2 ) 523{ 524 return( 0 ); 525} 526 527 528#endif /* HFS */ 529 530