1/* 2 * Copyright (c) 2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. Please obtain a copy of the License at 10 * http://www.opensource.apple.com/apsl/ and read it before using this 11 * file. 12 * 13 * The Original Code and all software distributed under the License are 14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 18 * Please see the License for the specific language governing rights and 19 * limitations under the License. 20 * 21 * @APPLE_LICENSE_HEADER_END@ 22 */ 23 24/* CFStringUtilities.c 25 Copyright (c) 1999-2013, Apple Inc. All rights reserved. 26 Responsibility: Aki Inoue 27*/ 28 29#include "CFInternal.h" 30#include <CoreFoundation/CFStringEncodingConverterExt.h> 31#include <CoreFoundation/CFUniChar.h> 32#include <CoreFoundation/CFStringEncodingExt.h> 33#include "CFStringEncodingDatabase.h" 34#include "CFICUConverters.h" 35#include <limits.h> 36#include <stdlib.h> 37#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 38#include <unicode/ucol.h> 39#include <unicode/ucoleitr.h> 40#endif 41#include <string.h> 42 43#if DEPLOYMENT_TARGET_WINDOWS 44#include <tchar.h> 45#endif 46 47 48Boolean CFStringIsEncodingAvailable(CFStringEncoding theEncoding) { 49 switch (theEncoding) { 50 case kCFStringEncodingASCII: // Built-in encodings 51 case kCFStringEncodingMacRoman: 52 case kCFStringEncodingUTF8: 53 case kCFStringEncodingNonLossyASCII: 54 case kCFStringEncodingWindowsLatin1: 55 case kCFStringEncodingNextStepLatin: 56 case kCFStringEncodingUTF16: 57 case kCFStringEncodingUTF16BE: 58 case kCFStringEncodingUTF16LE: 59 case kCFStringEncodingUTF32: 60 case kCFStringEncodingUTF32BE: 61 case kCFStringEncodingUTF32LE: 62 return true; 63 64 default: 65 return CFStringEncodingIsValidEncoding(theEncoding); 66 } 67} 68 69const CFStringEncoding* CFStringGetListOfAvailableEncodings() { 70 return (const CFStringEncoding *)CFStringEncodingListOfAvailableEncodings(); 71} 72 73CFStringRef CFStringGetNameOfEncoding(CFStringEncoding theEncoding) { 74 static CFMutableDictionaryRef mappingTable = NULL; 75 CFStringRef theName = mappingTable ? (CFStringRef)CFDictionaryGetValue(mappingTable, (const void*)(uintptr_t)theEncoding) : NULL; 76 77 if (!theName) { 78 const char *encodingName = __CFStringEncodingGetName(theEncoding); 79 80 if (encodingName) { 81 theName = CFStringCreateWithCString(kCFAllocatorSystemDefault, encodingName, kCFStringEncodingASCII); 82 } 83 84 if (theName) { 85 if (!mappingTable) mappingTable = CFDictionaryCreateMutable(kCFAllocatorSystemDefault, 0, (const CFDictionaryKeyCallBacks *)NULL, &kCFTypeDictionaryValueCallBacks); 86 87 CFDictionaryAddValue(mappingTable, (const void*)(uintptr_t)theEncoding, (const void*)theName); 88 CFRelease(theName); 89 } 90 } 91 92 return theName; 93} 94 95CFStringEncoding CFStringConvertIANACharSetNameToEncoding(CFStringRef charsetName) { 96 CFStringEncoding encoding = kCFStringEncodingInvalidId; 97#define BUFFER_SIZE (100) 98 char buffer[BUFFER_SIZE]; 99 const char *name = CFStringGetCStringPtr(charsetName, __CFStringGetEightBitStringEncoding()); 100 101 if (NULL == name) { 102 if (false == CFStringGetCString(charsetName, buffer, BUFFER_SIZE, __CFStringGetEightBitStringEncoding())) return kCFStringEncodingInvalidId; 103 104 name = buffer; 105 } 106 107 encoding = __CFStringEncodingGetFromCanonicalName(name); 108 109#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 110 if (kCFStringEncodingInvalidId == encoding) encoding = __CFStringEncodingGetFromICUName(name); 111#endif 112 113 114 // handling Java name variant for MS codepages 115 if ((kCFStringEncodingInvalidId == encoding) && !strncasecmp(name, "ms950", strlen("ms950"))) { // <rdar://problem/12903398> “MS950” is not recognized 116 encoding = __CFStringEncodingGetFromCanonicalName("cp950"); 117 } 118 119 return encoding; 120} 121 122CFStringRef CFStringConvertEncodingToIANACharSetName(CFStringEncoding encoding) { 123 CFStringRef name = NULL; 124 CFIndex value = encoding; 125 static CFMutableDictionaryRef mappingTable = NULL; 126 static CFSpinLock_t lock = CFSpinLockInit; 127 128 __CFSpinLock(&lock); 129 name = ((NULL == mappingTable) ? NULL : (CFStringRef)CFDictionaryGetValue(mappingTable, (const void*)value)); 130 131 if (NULL == name) { 132#define STACK_BUFFER_SIZE (100) 133 char buffer[STACK_BUFFER_SIZE]; 134 135 if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) name = CFStringCreateWithCString(NULL, buffer, kCFStringEncodingASCII); 136 137 138 if (NULL != name) { 139 CFIndex value = encoding; 140 141 if (NULL == mappingTable) mappingTable = CFDictionaryCreateMutable(NULL, 0, NULL, &kCFTypeDictionaryValueCallBacks); 142 143 CFDictionaryAddValue(mappingTable, (const void*)value, (const void*)name); 144 CFRelease(name); 145 } 146 } 147 __CFSpinUnlock(&lock); 148 149 return name; 150} 151 152enum { 153 NSASCIIStringEncoding = 1, /* 0..127 only */ 154 NSNEXTSTEPStringEncoding = 2, 155 NSJapaneseEUCStringEncoding = 3, 156 NSUTF8StringEncoding = 4, 157 NSISOLatin1StringEncoding = 5, 158 NSSymbolStringEncoding = 6, 159 NSNonLossyASCIIStringEncoding = 7, 160 NSShiftJISStringEncoding = 8, 161 NSISOLatin2StringEncoding = 9, 162 NSUnicodeStringEncoding = 10, 163 NSWindowsCP1251StringEncoding = 11, /* Cyrillic; same as AdobeStandardCyrillic */ 164 NSWindowsCP1252StringEncoding = 12, /* WinLatin1 */ 165 NSWindowsCP1253StringEncoding = 13, /* Greek */ 166 NSWindowsCP1254StringEncoding = 14, /* Turkish */ 167 NSWindowsCP1250StringEncoding = 15, /* WinLatin2 */ 168 NSISO2022JPStringEncoding = 21, /* ISO 2022 Japanese encoding for e-mail */ 169 NSMacOSRomanStringEncoding = 30, 170 171 NSProprietaryStringEncoding = 65536 /* Installation-specific encoding */ 172}; 173 174#define NSENCODING_MASK (1 << 31) 175 176unsigned long CFStringConvertEncodingToNSStringEncoding(CFStringEncoding theEncoding) { 177 switch (theEncoding & 0xFFF) { 178 case kCFStringEncodingUnicode: 179 if (theEncoding == kCFStringEncodingUTF16) return NSUnicodeStringEncoding; 180 else if (theEncoding == kCFStringEncodingUTF8) return NSUTF8StringEncoding; 181 break; 182 183 case kCFStringEncodingWindowsLatin1: return NSWindowsCP1252StringEncoding; 184 case kCFStringEncodingMacRoman: return NSMacOSRomanStringEncoding; 185 186 case kCFStringEncodingASCII: return NSASCIIStringEncoding; 187 188 case kCFStringEncodingDOSJapanese: return NSShiftJISStringEncoding; 189 case kCFStringEncodingWindowsCyrillic: return NSWindowsCP1251StringEncoding; 190 case kCFStringEncodingWindowsGreek: return NSWindowsCP1253StringEncoding; 191 case kCFStringEncodingWindowsLatin5: return NSWindowsCP1254StringEncoding; 192 case kCFStringEncodingWindowsLatin2: return NSWindowsCP1250StringEncoding; 193 case kCFStringEncodingISOLatin1: return NSISOLatin1StringEncoding; 194 195 case kCFStringEncodingNonLossyASCII: return NSNonLossyASCIIStringEncoding; 196 case kCFStringEncodingEUC_JP: return NSJapaneseEUCStringEncoding; 197 case kCFStringEncodingMacSymbol: return NSSymbolStringEncoding; 198 case kCFStringEncodingISOLatin2: return NSISOLatin2StringEncoding; 199 case kCFStringEncodingISO_2022_JP: return NSISO2022JPStringEncoding; 200 case kCFStringEncodingNextStepLatin: return NSNEXTSTEPStringEncoding; 201 } 202 203 return NSENCODING_MASK | theEncoding; 204} 205 206CFStringEncoding CFStringConvertNSStringEncodingToEncoding(unsigned long theEncoding) { 207 const uint16_t encodings[] = { 208 kCFStringEncodingASCII, 209 kCFStringEncodingNextStepLatin, 210 kCFStringEncodingEUC_JP, 211 0, 212 kCFStringEncodingISOLatin1, 213 kCFStringEncodingMacSymbol, 214 kCFStringEncodingNonLossyASCII, 215 kCFStringEncodingDOSJapanese, 216 kCFStringEncodingISOLatin2, 217 kCFStringEncodingUTF16, 218 kCFStringEncodingWindowsCyrillic, 219 kCFStringEncodingWindowsLatin1, 220 kCFStringEncodingWindowsGreek, 221 kCFStringEncodingWindowsLatin5, 222 kCFStringEncodingWindowsLatin2 223 }; 224 225 if (NSUTF8StringEncoding == theEncoding) return kCFStringEncodingUTF8; 226 227 if ((theEncoding > 0) && (theEncoding <= NSWindowsCP1250StringEncoding)) return encodings[theEncoding - 1]; 228 229 switch (theEncoding) { 230 case NSMacOSRomanStringEncoding: return kCFStringEncodingMacRoman; 231 case NSISO2022JPStringEncoding: return kCFStringEncodingISO_2022_JP; 232 233 default: 234 return ((theEncoding & NSENCODING_MASK) ? theEncoding & ~NSENCODING_MASK : kCFStringEncodingInvalidId); 235 } 236} 237 238UInt32 CFStringConvertEncodingToWindowsCodepage(CFStringEncoding theEncoding) { 239 uint16_t codepage = __CFStringEncodingGetWindowsCodePage(theEncoding); 240 241 return ((0 == codepage) ? kCFStringEncodingInvalidId : codepage); 242} 243 244CFStringEncoding CFStringConvertWindowsCodepageToEncoding(UInt32 theEncoding) { 245 return __CFStringEncodingGetFromWindowsCodePage(theEncoding); 246} 247 248CFStringEncoding CFStringGetMostCompatibleMacStringEncoding(CFStringEncoding encoding) { 249 CFStringEncoding macEncoding = __CFStringEncodingGetMostCompatibleMacScript(encoding); 250 251 252 return macEncoding; 253} 254 255#define kCFStringCompareAllocationIncrement (128) 256 257#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 258 259// ------------------------------------------------------------------------------------------------- 260// CompareSpecials - ignore case & diacritic differences 261// 262// Decomposed have 2nd-4th chars of type Mn or Mc, or in range 1160-11FF (jamo) 263// Fullwidth & halfwidth are in range FF00-FFEF 264// Parenthesized & circled are in range 3200-32FF 265// ------------------------------------------------------------------------------------------------- 266 267enum { 268 kUpperCaseWeightMin = 0x80 | 0x0F, 269 kUpperCaseWeightMax = 0x80 | 0x17, 270 kUpperToLowerDelta = 0x80 | 0x0A, // 0x0A = 0x0F - 0x05 271 kMaskPrimarySecondary = 0xFFFFFF00, 272 kMaskPrimaryOnly = 0xFFFF0000, 273 kMaskSecondaryOnly = 0x0000FF00, 274 kMaskCaseTertiary = 0x000000FF // 2 hi bits case, 6 lo bits tertiary 275}; 276 277static SInt32 __CompareSpecials(const UCollator *collator, CFOptionFlags options, const UniChar *text1Ptr, UniCharCount text1Length, const UniChar *text2Ptr, UniCharCount text2Length) { 278 UErrorCode icuStatus = U_ZERO_ERROR; 279 SInt32 orderWidth = 0; 280 SInt32 orderCompos = 0; 281 282 UCollationElements * collElems1 = ucol_openElements(collator, (const UChar *)text1Ptr, text1Length, &icuStatus); 283 UCollationElements * collElems2 = ucol_openElements(collator, (const UChar *)text2Ptr, text2Length, &icuStatus); 284 if (U_SUCCESS(icuStatus)) { 285 int32_t startOffset1 = 0; 286 int32_t startOffset2 = 0; 287 288 while (true) { 289 int32_t elemOrder1, elemOrder2; 290 int32_t offset1, offset2; 291 292 elemOrder1 = ucol_next(collElems1, &icuStatus); 293 elemOrder2 = ucol_next(collElems2, &icuStatus); 294 if ( U_FAILURE(icuStatus) || elemOrder1 == (int32_t)UCOL_NULLORDER || elemOrder2 == (int32_t)UCOL_NULLORDER ) { 295 break; 296 } 297 298 offset1 = ucol_getOffset(collElems1); 299 offset2 = ucol_getOffset(collElems2); 300 if ( (elemOrder1 & kMaskPrimarySecondary) == (elemOrder2 & kMaskPrimarySecondary) ) { 301 if ( (elemOrder1 & kMaskPrimaryOnly) != 0 ) { 302 // keys may differ in case, width, circling, etc. 303 304 int32_t tertiary1 = (elemOrder1 & kMaskCaseTertiary); 305 int32_t tertiary2 = (elemOrder2 & kMaskCaseTertiary); 306 // fold upper to lower case 307 if (tertiary1 >= kUpperCaseWeightMin && tertiary1 <= kUpperCaseWeightMax) { 308 tertiary1 -= kUpperToLowerDelta; 309 } 310 if (tertiary2 >= kUpperCaseWeightMin && tertiary2 <= kUpperCaseWeightMax) { 311 tertiary2 -= kUpperToLowerDelta; 312 } 313 // now compare 314 if (tertiary1 != tertiary2) { 315 orderWidth = (tertiary1 < tertiary2)? -1: 1; 316 break; 317 } 318 319 } else if ( (elemOrder1 & kMaskSecondaryOnly) != 0 ) { 320 // primary weights are both zero, but secondaries are not. 321 if ( orderCompos == 0 && (options & kCFCompareNonliteral) == 0 ) { 322 // We have a code element which is a diacritic. 323 // It may have come from a composed char or a combining char. 324 // If it came from a combining char (longer element length) it sorts first. 325 // This is only an approximation to what the Mac OS 9 code did, but this is an 326 // unusual case anyway. 327 int32_t elem1Length = offset1 - startOffset1; 328 int32_t elem2Length = offset2 - startOffset2; 329 if (elem1Length != elem2Length) { 330 orderCompos = (elem1Length > elem2Length)? -1: 1; 331 } 332 } 333 } 334 } 335 336 startOffset1 = offset1; 337 startOffset2 = offset2; 338 } 339 ucol_closeElements(collElems1); 340 ucol_closeElements(collElems2); 341 } 342 343 return (orderWidth != 0)? orderWidth: orderCompos; 344} 345 346static SInt32 __CompareCodePoints(const UniChar *text1Ptr, UniCharCount text1Length, const UniChar *text2Ptr, UniCharCount text2Length ) { 347 const UniChar * text1P = text1Ptr; 348 const UniChar * text2P = text2Ptr; 349 UInt32 textLimit = (text1Length <= text2Length)? text1Length: text2Length; 350 UInt32 textCounter; 351 SInt32 orderResult = 0; 352 353 // Loop through either string...the first difference differentiates this. 354 for (textCounter = 0; textCounter < textLimit && *text1P == *text2P; textCounter++) { 355 text1P++; 356 text2P++; 357 } 358 if (textCounter < textLimit) { 359 // code point difference 360 orderResult = (*text1P < *text2P) ? -1 : 1; 361 } else if (text1Length != text2Length) { 362 // one string has extra stuff at end 363 orderResult = (text1Length < text2Length) ? -1 : 1; 364 } 365 return orderResult; 366} 367 368 369extern const CFStringRef __kCFLocaleCollatorID; 370 371static UCollator *__CFStringCreateCollator(CFLocaleRef compareLocale) { 372 CFStringRef canonLocaleCFStr = (CFStringRef)CFLocaleGetValue(compareLocale, __kCFLocaleCollatorID); 373 char icuLocaleStr[128] = {0}; 374 CFStringGetCString(canonLocaleCFStr, icuLocaleStr, sizeof(icuLocaleStr), kCFStringEncodingASCII); 375 UErrorCode icuStatus = U_ZERO_ERROR; 376 UCollator * collator = ucol_open(icuLocaleStr, &icuStatus); 377 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus); 378 ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, UCOL_NON_IGNORABLE, &icuStatus); 379 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus); 380 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus); 381 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus); 382 return collator; 383} 384 385#define kCFMaxCachedDefaultCollators (8) 386static UCollator *__CFDefaultCollators[kCFMaxCachedDefaultCollators]; 387static CFIndex __CFDefaultCollatorsCount = 0; 388static const void *__CFDefaultCollatorLocale = NULL; 389static CFSpinLock_t __CFDefaultCollatorLock = CFSpinLockInit; 390 391static UCollator *__CFStringCopyDefaultCollator(CFLocaleRef compareLocale) { 392 CFLocaleRef currentLocale = NULL; 393 UCollator * collator = NULL; 394 395 if (compareLocale != __CFDefaultCollatorLocale) { 396 currentLocale = CFLocaleCopyCurrent(); 397 if (compareLocale != currentLocale) { 398 CFRelease(currentLocale); 399 return NULL; 400 } 401 } 402 403 __CFSpinLock(&__CFDefaultCollatorLock); 404 if ((NULL != currentLocale) && (__CFDefaultCollatorLocale != currentLocale)) { 405 while (__CFDefaultCollatorsCount > 0) ucol_close(__CFDefaultCollators[--__CFDefaultCollatorsCount]); 406 __CFDefaultCollatorLocale = CFRetain(currentLocale); 407 } 408 409 if (__CFDefaultCollatorsCount > 0) collator = __CFDefaultCollators[--__CFDefaultCollatorsCount]; 410 __CFSpinUnlock(&__CFDefaultCollatorLock); 411 412 if (NULL == collator) { 413 collator = __CFStringCreateCollator(compareLocale); 414 } 415 416 if (NULL != currentLocale) CFRelease(currentLocale); 417 418 return collator; 419} 420 421#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED 422static void __collatorFinalize(UCollator *collator) { 423 CFLocaleRef locale = _CFGetTSD(__CFTSDKeyCollatorLocale); 424 _CFSetTSD(__CFTSDKeyCollatorUCollator, NULL, NULL); 425 _CFSetTSD(__CFTSDKeyCollatorLocale, NULL, NULL); 426 __CFSpinLock(&__CFDefaultCollatorLock); 427 if ((__CFDefaultCollatorLocale == locale) && (__CFDefaultCollatorsCount < kCFMaxCachedDefaultCollators)) { 428 __CFDefaultCollators[__CFDefaultCollatorsCount++] = collator; 429 collator = NULL; 430 } 431 __CFSpinUnlock(&__CFDefaultCollatorLock); 432 if (NULL != collator) ucol_close(collator); 433 if (locale) CFRelease(locale); 434} 435#endif 436 437// ------------------------------------------------------------------------------------------------- 438// __CompareTextDefault 439// 440// A primary difference is denoted by values 2/-2 in orderP. Other differences are indicated with a -1/1. 441// A negative value indicates that text1 sorts before text2. 442// ------------------------------------------------------------------------------------------------- 443static OSStatus __CompareTextDefault(UCollator *collator, CFOptionFlags options, const UniChar *text1Ptr, UniCharCount text1Length, const UniChar *text2Ptr, UniCharCount text2Length, Boolean *equivalentP, SInt32 *orderP) { 444 445 // collator must have default settings restored on exit from this function 446 447 *equivalentP = true; 448 *orderP = 0; 449 450 if (options & kCFCompareNumerically) { 451 UErrorCode icuStatus = U_ZERO_ERROR; 452 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_ON, &icuStatus); 453 } 454 455 // Most string differences are Primary. Do a primary check first, then if there 456 // are no differences do a comparison with the options in the collator. 457 UCollationResult icuResult = ucol_strcoll(collator, (const UChar *)text1Ptr, text1Length, (const UChar *)text2Ptr, text2Length); 458 if (icuResult != UCOL_EQUAL) { 459 *orderP = (icuResult == UCOL_LESS) ? -2 : 2; 460 } 461 if (*orderP == 0) { 462 UErrorCode icuStatus = U_ZERO_ERROR; 463 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_ON, &icuStatus); 464 ucol_setAttribute(collator, UCOL_STRENGTH, (options & kCFCompareDiacriticInsensitive) ? UCOL_PRIMARY : UCOL_SECONDARY, &icuStatus); 465 ucol_setAttribute(collator, UCOL_CASE_LEVEL, (options & kCFCompareCaseInsensitive) ? UCOL_OFF : UCOL_ON, &icuStatus); 466 if (!U_SUCCESS(icuStatus)) { 467 icuStatus = U_ZERO_ERROR; 468 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus); 469 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus); 470 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus); 471 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus); 472 return 666; 473 } 474 475 // We don't have a primary difference. Recompare with standard collator. 476 icuResult = ucol_strcoll(collator, (const UChar *)text1Ptr, text1Length, (const UChar *)text2Ptr, text2Length); 477 if (icuResult != UCOL_EQUAL) { 478 *orderP = (icuResult == UCOL_LESS) ? -1 : 1; 479 } 480 icuStatus = U_ZERO_ERROR; 481 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus); 482 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus); 483 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus); 484 } 485 if (*orderP == 0 && (options & kCFCompareNonliteral) == 0) { 486 *orderP = __CompareSpecials(collator, options, text1Ptr, text1Length, text2Ptr, text2Length); 487 } 488 489 *equivalentP = (*orderP == 0); 490 491 // If strings are equivalent but we care about order and have not yet checked 492 // to the level of code point order, then do some more checks for order 493 if (*orderP == 0) { 494 UErrorCode icuStatus = U_ZERO_ERROR; 495 // First try to see if ICU can find any differences above code point level 496 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_ON, &icuStatus); 497 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_TERTIARY, &icuStatus); 498 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_ON, &icuStatus); 499 if (!U_SUCCESS(icuStatus)) { 500 icuStatus = U_ZERO_ERROR; 501 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus); 502 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus); 503 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus); 504 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus); 505 return 666; 506 } 507 icuResult = ucol_strcoll(collator, (const UChar *)text1Ptr, text1Length, (const UChar *)text2Ptr, text2Length); 508 if (icuResult != UCOL_EQUAL) { 509 *orderP = (icuResult == UCOL_LESS) ? -1 : 1; 510 } else { 511 // no ICU differences above code point level, compare code points 512 *orderP = __CompareCodePoints( text1Ptr, text1Length, text2Ptr, text2Length ); 513 } 514 icuStatus = U_ZERO_ERROR; 515 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus); 516 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus); 517 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus); 518 } 519 520 if (options & kCFCompareNumerically) { 521 UErrorCode icuStatus = U_ZERO_ERROR; 522 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus); 523 } 524 return 0; // noErr 525} 526 527#endif // DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 528 529static inline CFIndex __extendLocationBackward(CFIndex location, CFStringInlineBuffer *str, const uint8_t *nonBaseBMP, const uint8_t *punctBMP) { 530 while (location > 0) { 531 UTF32Char ch = CFStringGetCharacterFromInlineBuffer(str, location); 532 UTF32Char otherChar; 533 if (CFUniCharIsSurrogateLowCharacter(ch) && CFUniCharIsSurrogateHighCharacter((otherChar = CFStringGetCharacterFromInlineBuffer(str, location - 1)))) { 534 ch = CFUniCharGetLongCharacterForSurrogatePair(ch, otherChar); 535 uint8_t planeNo = (ch >> 16); 536 if ((planeNo > 1) || (!CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, planeNo)) && !CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharPunctuationCharacterSet, planeNo)))) break; 537 location -= 2; 538 } else { 539 if ((!CFUniCharIsMemberOfBitmap(ch, nonBaseBMP) && !CFUniCharIsMemberOfBitmap(ch, punctBMP)) || ((ch >= 0x2E80) && (ch < 0xAC00))) break; 540 --location; 541 } 542 } 543 544 return location; 545} 546 547static inline CFIndex __extendLocationForward(CFIndex location, CFStringInlineBuffer *str, const uint8_t *alnumBMP, const uint8_t *punctBMP, const uint8_t *controlBMP, CFIndex strMax) { 548 do { 549 UTF32Char ch = CFStringGetCharacterFromInlineBuffer(str, location); 550 UTF32Char otherChar; 551 if (CFUniCharIsSurrogateHighCharacter(ch) && CFUniCharIsSurrogateLowCharacter((otherChar = CFStringGetCharacterFromInlineBuffer(str, location + 1)))) { 552 ch = CFUniCharGetLongCharacterForSurrogatePair(ch, otherChar); 553 location += 2; 554 uint8_t planeNo = (ch >> 16); 555 if (!CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharAlphaNumericCharacterSet, planeNo)) && !CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharPunctuationCharacterSet, planeNo)) && !CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharControlAndFormatterCharacterSet, planeNo))) break; 556 } else { 557 ++location; 558 if ((!CFUniCharIsMemberOfBitmap(ch, alnumBMP) && !CFUniCharIsMemberOfBitmap(ch, punctBMP) && !CFUniCharIsMemberOfBitmap(ch, controlBMP)) || ((ch >= 0x2E80) && (ch < 0xAC00))) break; 559 } 560 } while (location < strMax); 561 return location; 562} 563 564CF_PRIVATE CFComparisonResult _CFCompareStringsWithLocale(CFStringInlineBuffer *str1, CFRange str1Range, CFStringInlineBuffer *str2, CFRange str2Range, CFOptionFlags options, const void *compareLocale) { 565 const UniChar *characters1; 566 const UniChar *characters2; 567 CFComparisonResult compResult = kCFCompareEqualTo; 568 CFRange range1 = str1Range; 569 CFRange range2 = str2Range; 570 SInt32 order; 571#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 572 Boolean isEqual; 573 bool forcedOrdering = ((options & kCFCompareForcedOrdering) ? true : false); 574 575 UCollator *collator = NULL; 576 bool defaultCollator = true; 577#endif 578 static const uint8_t *alnumBMP = NULL; 579 static const uint8_t *nonBaseBMP = NULL; 580 static const uint8_t *punctBMP = NULL; 581 static const uint8_t *controlBMP = NULL; 582 583 if (NULL == alnumBMP) { 584 alnumBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharAlphaNumericCharacterSet, 0); 585 nonBaseBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0); 586 punctBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharPunctuationCharacterSet, 0); 587 controlBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharControlAndFormatterCharacterSet, 0); 588 } 589 590 // Determine the range of characters surrodiing the current index significant for localized comparison. The range is extended backward and forward as long as they are contextual. Contextual characters include all letters and punctuations. Since most control/format characters are ignorable in localized comparison, we also include them extending forward. 591 592 range1.location = str1Range.location; 593 range2.location = str2Range.location; 594 595 // go backward 596 // The characters upto the current index are already determined to be equal by the CFString's standard character folding algorithm. Extend as long as truly contextual (all letters and punctuations). 597 if (range1.location > 0) { 598 range1.location = __extendLocationBackward(range1.location - 1, str1, nonBaseBMP, punctBMP); 599 } 600 601 if (range2.location > 0) { 602 range2.location = __extendLocationBackward(range2.location - 1, str2, nonBaseBMP, punctBMP); 603 } 604 605#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 606#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED 607 // First we try to use the last one used on this thread, if the locale is the same, 608 // otherwise we try to check out a default one, or then we create one. 609 UCollator *threadCollator = _CFGetTSD(__CFTSDKeyCollatorUCollator); 610 CFLocaleRef threadLocale = _CFGetTSD(__CFTSDKeyCollatorLocale); 611 if (compareLocale == threadLocale) { 612 collator = threadCollator; 613 } else { 614#endif 615 collator = __CFStringCopyDefaultCollator((CFLocaleRef)compareLocale); 616 defaultCollator = true; 617 if (NULL == collator) { 618 collator = __CFStringCreateCollator((CFLocaleRef)compareLocale); 619 defaultCollator = false; 620 } 621#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED 622 } 623#endif 624#endif 625 626 characters1 = CFStringGetCharactersPtrFromInlineBuffer(str1, range1); 627 characters2 = CFStringGetCharactersPtrFromInlineBuffer(str2, range2); 628 629 if ((NULL != characters1) && (NULL != characters2)) { // do fast 630 range1.length = (str1Range.location + str1Range.length) - range1.location; 631 range2.length = (str2Range.location + str2Range.length) - range2.location; 632 633#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 634 if ((NULL != collator) && (__CompareTextDefault(collator, options, characters1, range1.length, characters2, range2.length, &isEqual, &order) == 0 /* noErr */)) { 635 compResult = ((isEqual && !forcedOrdering) ? kCFCompareEqualTo : ((order < 0) ? kCFCompareLessThan : kCFCompareGreaterThan)); 636 } else 637#endif 638 { 639 compResult = ((memcmp(characters1, characters2, sizeof(UniChar) * range1.length) < 0) ? kCFCompareLessThan : kCFCompareGreaterThan); 640 } 641 } else { 642 UniChar *buffer1 = NULL; 643 UniChar *buffer2 = NULL; 644 UTF16Char sBuffer1[kCFStringCompareAllocationIncrement]; 645 UTF16Char sBuffer2[kCFStringCompareAllocationIncrement]; 646 CFIndex buffer1Len = 0, buffer2Len = 0; 647 CFIndex str1Max = str1Range.location + str1Range.length; 648 CFIndex str2Max = str2Range.location + str2Range.length; 649 CFIndex bufferSize; 650 651 // Extend forward and compare until the result is deterministic. The result is indeterministic if the differences are weak and can be resolved by character folding. For example, comparision between "abc" and "ABC" is considered to be indeterministic. 652 do { 653 if (str1Range.location < str1Max) { 654 str1Range.location = __extendLocationForward(str1Range.location, str1, alnumBMP, punctBMP, controlBMP, str1Max); 655 range1.length = (str1Range.location - range1.location); 656 characters1 = CFStringGetCharactersPtrFromInlineBuffer(str1, range1); 657 658 if (NULL == characters1) { 659 if ((0 > buffer1Len) || (range1.length > kCFStringCompareAllocationIncrement)) { 660 if (buffer1Len < range1.length) { 661 bufferSize = range1.length + (kCFStringCompareAllocationIncrement - (range1.length % kCFStringCompareAllocationIncrement)); 662 if (0 == buffer1Len) { 663 buffer1 = (UniChar *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(UTF16Char) * bufferSize, 0); 664 } else if (buffer1Len < range1.length) { 665 buffer1 = (UniChar *)CFAllocatorReallocate(kCFAllocatorSystemDefault, buffer1, sizeof(UTF16Char) * bufferSize, 0); 666 } 667 buffer1Len = bufferSize; 668 } 669 } else { 670 buffer1 = sBuffer1; 671 } 672 673 CFStringGetCharactersFromInlineBuffer(str1, range1, buffer1); 674 characters1 = buffer1; 675 } 676 } 677 678 if (str2Range.location < str2Max) { 679 str2Range.location = __extendLocationForward(str2Range.location, str2, alnumBMP, punctBMP, controlBMP, str2Max); 680 range2.length = (str2Range.location - range2.location); 681 characters2 = CFStringGetCharactersPtrFromInlineBuffer(str2, range2); 682 683 if (NULL == characters2) { 684 if ((0 > buffer2Len) || (range2.length > kCFStringCompareAllocationIncrement)) { 685 if (buffer2Len < range2.length) { 686 bufferSize = range2.length + (kCFStringCompareAllocationIncrement - (range2.length % kCFStringCompareAllocationIncrement)); 687 if (0 == buffer2Len) { 688 buffer2 = (UniChar *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(UTF16Char) * bufferSize, 0); 689 } else if (buffer2Len < range2.length) { 690 buffer2 = (UniChar *)CFAllocatorReallocate(kCFAllocatorSystemDefault, buffer2, sizeof(UTF16Char) * bufferSize, 0); 691 } 692 buffer2Len = bufferSize; 693 } 694 } else { 695 buffer2 = sBuffer2; 696 } 697 698 CFStringGetCharactersFromInlineBuffer(str2, range2, buffer2); 699 characters2 = buffer2; 700 } 701 } 702 703#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 704 if ((NULL != collator) && (__CompareTextDefault(collator, options, characters1, range1.length, characters2, range2.length, &isEqual, &order) == 0 /* noErr */)) { 705 if (isEqual) { 706 if (forcedOrdering && (kCFCompareEqualTo == compResult) && (0 != order)) compResult = ((order < 0) ? kCFCompareLessThan : kCFCompareGreaterThan); 707 order = 0; 708 } 709 } else 710#endif 711 { 712 order = memcmp(characters1, characters2, sizeof(UTF16Char) * ((range1.length < range2.length) ? range1.length : range2.length)); 713 if (0 == order) { 714 if (range1.length < range2.length) { 715 order = -2; 716 } else if (range2.length < range1.length) { 717 order = 2; 718 } 719 } else if (order < 0) { 720 --order; 721 } else if (order > 0) { 722 ++order; 723 } 724 } 725 726 if ((order < -1) || (order > 1)) break; // the result is deterministic 727 728 if (0 == order) { 729 range1.location = str1Range.location; 730 range2.location = str2Range.location; 731 } 732 } while ((str1Range.location < str1Max) || (str2Range.location < str2Max)); 733 734 if (0 != order) compResult = ((order < 0) ? kCFCompareLessThan : kCFCompareGreaterThan); 735 736 if (buffer1Len > 0) CFAllocatorDeallocate(kCFAllocatorSystemDefault, buffer1); 737 if (buffer2Len > 0) CFAllocatorDeallocate(kCFAllocatorSystemDefault, buffer2); 738 } 739 740#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED 741 if (collator == threadCollator) { 742 // do nothing, already cached 743 } else { 744 if (threadLocale) __collatorFinalize((UCollator *)_CFGetTSD(__CFTSDKeyCollatorUCollator)); // need to dealloc collators 745 746 _CFSetTSD(__CFTSDKeyCollatorUCollator, collator, (void *)__collatorFinalize); 747 _CFSetTSD(__CFTSDKeyCollatorLocale, (void *)CFRetain(compareLocale), NULL); 748 } 749#endif 750 751 return compResult; 752} 753 754