1/*
2 * Copyright (c) 2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24/*	CFStringUtilities.c
25	Copyright (c) 1999-2013, Apple Inc. All rights reserved.
26	Responsibility: Aki Inoue
27*/
28
29#include "CFInternal.h"
30#include <CoreFoundation/CFStringEncodingConverterExt.h>
31#include <CoreFoundation/CFUniChar.h>
32#include <CoreFoundation/CFStringEncodingExt.h>
33#include "CFStringEncodingDatabase.h"
34#include "CFICUConverters.h"
35#include <limits.h>
36#include <stdlib.h>
37#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
38#include <unicode/ucol.h>
39#include <unicode/ucoleitr.h>
40#endif
41#include <string.h>
42
43#if  DEPLOYMENT_TARGET_WINDOWS
44#include <tchar.h>
45#endif
46
47
48Boolean CFStringIsEncodingAvailable(CFStringEncoding theEncoding) {
49    switch (theEncoding) {
50        case kCFStringEncodingASCII: // Built-in encodings
51        case kCFStringEncodingMacRoman:
52        case kCFStringEncodingUTF8:
53        case kCFStringEncodingNonLossyASCII:
54        case kCFStringEncodingWindowsLatin1:
55        case kCFStringEncodingNextStepLatin:
56        case kCFStringEncodingUTF16:
57        case kCFStringEncodingUTF16BE:
58        case kCFStringEncodingUTF16LE:
59        case kCFStringEncodingUTF32:
60        case kCFStringEncodingUTF32BE:
61        case kCFStringEncodingUTF32LE:
62            return true;
63
64        default:
65            return CFStringEncodingIsValidEncoding(theEncoding);
66    }
67}
68
69const CFStringEncoding* CFStringGetListOfAvailableEncodings() {
70    return (const CFStringEncoding *)CFStringEncodingListOfAvailableEncodings();
71}
72
73CFStringRef CFStringGetNameOfEncoding(CFStringEncoding theEncoding) {
74    static CFMutableDictionaryRef mappingTable = NULL;
75    CFStringRef theName = mappingTable ? (CFStringRef)CFDictionaryGetValue(mappingTable, (const void*)(uintptr_t)theEncoding) : NULL;
76
77    if (!theName) {
78        const char *encodingName = __CFStringEncodingGetName(theEncoding);
79
80        if (encodingName) {
81            theName = CFStringCreateWithCString(kCFAllocatorSystemDefault, encodingName, kCFStringEncodingASCII);
82        }
83
84        if (theName) {
85            if (!mappingTable) mappingTable = CFDictionaryCreateMutable(kCFAllocatorSystemDefault, 0, (const CFDictionaryKeyCallBacks *)NULL, &kCFTypeDictionaryValueCallBacks);
86
87            CFDictionaryAddValue(mappingTable, (const void*)(uintptr_t)theEncoding, (const void*)theName);
88            CFRelease(theName);
89        }
90    }
91
92    return theName;
93}
94
95CFStringEncoding CFStringConvertIANACharSetNameToEncoding(CFStringRef charsetName) {
96    CFStringEncoding encoding = kCFStringEncodingInvalidId;
97#define BUFFER_SIZE (100)
98    char buffer[BUFFER_SIZE];
99    const char *name = CFStringGetCStringPtr(charsetName, __CFStringGetEightBitStringEncoding());
100
101    if (NULL == name) {
102        if (false == CFStringGetCString(charsetName, buffer, BUFFER_SIZE, __CFStringGetEightBitStringEncoding())) return kCFStringEncodingInvalidId;
103
104        name = buffer;
105    }
106
107    encoding = __CFStringEncodingGetFromCanonicalName(name);
108
109#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
110    if (kCFStringEncodingInvalidId == encoding) encoding = __CFStringEncodingGetFromICUName(name);
111#endif
112
113
114    // handling Java name variant for MS codepages
115    if ((kCFStringEncodingInvalidId == encoding) && !strncasecmp(name, "ms950", strlen("ms950"))) { // <rdar://problem/12903398> “MS950” is not recognized
116        encoding = __CFStringEncodingGetFromCanonicalName("cp950");
117    }
118
119    return encoding;
120}
121
122CFStringRef CFStringConvertEncodingToIANACharSetName(CFStringEncoding encoding) {
123    CFStringRef name = NULL;
124    CFIndex value = encoding;
125    static CFMutableDictionaryRef mappingTable = NULL;
126    static CFSpinLock_t lock = CFSpinLockInit;
127
128    __CFSpinLock(&lock);
129    name = ((NULL == mappingTable) ? NULL : (CFStringRef)CFDictionaryGetValue(mappingTable, (const void*)value));
130
131    if (NULL == name) {
132#define STACK_BUFFER_SIZE (100)
133        char buffer[STACK_BUFFER_SIZE];
134
135        if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) name = CFStringCreateWithCString(NULL, buffer, kCFStringEncodingASCII);
136
137
138        if (NULL != name) {
139            CFIndex value = encoding;
140
141            if (NULL == mappingTable) mappingTable = CFDictionaryCreateMutable(NULL, 0, NULL, &kCFTypeDictionaryValueCallBacks);
142
143            CFDictionaryAddValue(mappingTable, (const void*)value, (const void*)name);
144            CFRelease(name);
145        }
146    }
147    __CFSpinUnlock(&lock);
148
149    return name;
150}
151
152enum {
153    NSASCIIStringEncoding = 1,		/* 0..127 only */
154    NSNEXTSTEPStringEncoding = 2,
155    NSJapaneseEUCStringEncoding = 3,
156    NSUTF8StringEncoding = 4,
157    NSISOLatin1StringEncoding = 5,
158    NSSymbolStringEncoding = 6,
159    NSNonLossyASCIIStringEncoding = 7,
160    NSShiftJISStringEncoding = 8,
161    NSISOLatin2StringEncoding = 9,
162    NSUnicodeStringEncoding = 10,
163    NSWindowsCP1251StringEncoding = 11,    /* Cyrillic; same as AdobeStandardCyrillic */
164    NSWindowsCP1252StringEncoding = 12,    /* WinLatin1 */
165    NSWindowsCP1253StringEncoding = 13,    /* Greek */
166    NSWindowsCP1254StringEncoding = 14,    /* Turkish */
167    NSWindowsCP1250StringEncoding = 15,    /* WinLatin2 */
168    NSISO2022JPStringEncoding = 21,         /* ISO 2022 Japanese encoding for e-mail */
169    NSMacOSRomanStringEncoding = 30,
170
171    NSProprietaryStringEncoding = 65536    /* Installation-specific encoding */
172};
173
174#define NSENCODING_MASK (1 << 31)
175
176unsigned long CFStringConvertEncodingToNSStringEncoding(CFStringEncoding theEncoding) {
177    switch (theEncoding & 0xFFF) {
178        case kCFStringEncodingUnicode:
179            if (theEncoding == kCFStringEncodingUTF16) return NSUnicodeStringEncoding;
180            else if (theEncoding == kCFStringEncodingUTF8) return NSUTF8StringEncoding;
181            break;
182
183        case kCFStringEncodingWindowsLatin1: return NSWindowsCP1252StringEncoding;
184        case kCFStringEncodingMacRoman: return NSMacOSRomanStringEncoding;
185
186        case kCFStringEncodingASCII: return NSASCIIStringEncoding;
187
188        case kCFStringEncodingDOSJapanese: return NSShiftJISStringEncoding;
189        case kCFStringEncodingWindowsCyrillic: return NSWindowsCP1251StringEncoding;
190        case kCFStringEncodingWindowsGreek: return NSWindowsCP1253StringEncoding;
191        case kCFStringEncodingWindowsLatin5: return NSWindowsCP1254StringEncoding;
192        case kCFStringEncodingWindowsLatin2: return NSWindowsCP1250StringEncoding;
193        case kCFStringEncodingISOLatin1: return NSISOLatin1StringEncoding;
194
195        case kCFStringEncodingNonLossyASCII: return NSNonLossyASCIIStringEncoding;
196        case kCFStringEncodingEUC_JP: return NSJapaneseEUCStringEncoding;
197        case kCFStringEncodingMacSymbol: return NSSymbolStringEncoding;
198        case kCFStringEncodingISOLatin2: return NSISOLatin2StringEncoding;
199        case kCFStringEncodingISO_2022_JP: return NSISO2022JPStringEncoding;
200        case kCFStringEncodingNextStepLatin: return NSNEXTSTEPStringEncoding;
201    }
202
203    return NSENCODING_MASK | theEncoding;
204}
205
206CFStringEncoding CFStringConvertNSStringEncodingToEncoding(unsigned long theEncoding) {
207    const uint16_t encodings[] = {
208        kCFStringEncodingASCII,
209        kCFStringEncodingNextStepLatin,
210        kCFStringEncodingEUC_JP,
211        0,
212        kCFStringEncodingISOLatin1,
213        kCFStringEncodingMacSymbol,
214        kCFStringEncodingNonLossyASCII,
215        kCFStringEncodingDOSJapanese,
216        kCFStringEncodingISOLatin2,
217        kCFStringEncodingUTF16,
218        kCFStringEncodingWindowsCyrillic,
219        kCFStringEncodingWindowsLatin1,
220        kCFStringEncodingWindowsGreek,
221        kCFStringEncodingWindowsLatin5,
222        kCFStringEncodingWindowsLatin2
223    };
224
225    if (NSUTF8StringEncoding == theEncoding) return kCFStringEncodingUTF8;
226
227    if ((theEncoding > 0) && (theEncoding <= NSWindowsCP1250StringEncoding)) return encodings[theEncoding - 1];
228
229    switch (theEncoding) {
230        case NSMacOSRomanStringEncoding: return kCFStringEncodingMacRoman;
231        case NSISO2022JPStringEncoding: return kCFStringEncodingISO_2022_JP;
232
233        default:
234            return ((theEncoding & NSENCODING_MASK) ? theEncoding & ~NSENCODING_MASK : kCFStringEncodingInvalidId);
235    }
236}
237
238UInt32 CFStringConvertEncodingToWindowsCodepage(CFStringEncoding theEncoding) {
239    uint16_t codepage = __CFStringEncodingGetWindowsCodePage(theEncoding);
240
241    return ((0 == codepage) ? kCFStringEncodingInvalidId : codepage);
242}
243
244CFStringEncoding CFStringConvertWindowsCodepageToEncoding(UInt32 theEncoding) {
245    return __CFStringEncodingGetFromWindowsCodePage(theEncoding);
246}
247
248CFStringEncoding CFStringGetMostCompatibleMacStringEncoding(CFStringEncoding encoding) {
249    CFStringEncoding macEncoding = __CFStringEncodingGetMostCompatibleMacScript(encoding);
250
251
252    return macEncoding;
253}
254
255#define kCFStringCompareAllocationIncrement (128)
256
257#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
258
259// -------------------------------------------------------------------------------------------------
260//	CompareSpecials - ignore case & diacritic differences
261//
262//	Decomposed have 2nd-4th chars of type Mn or Mc, or in range 1160-11FF (jamo)
263//	Fullwidth & halfwidth are in range FF00-FFEF
264//	Parenthesized & circled are in range 3200-32FF
265// -------------------------------------------------------------------------------------------------
266
267enum {
268	kUpperCaseWeightMin	= 0x80 | 0x0F,
269	kUpperCaseWeightMax	= 0x80 | 0x17,
270	kUpperToLowerDelta	= 0x80 | 0x0A,	// 0x0A = 0x0F - 0x05
271	kMaskPrimarySecondary	= 0xFFFFFF00,
272	kMaskPrimaryOnly	= 0xFFFF0000,
273	kMaskSecondaryOnly	= 0x0000FF00,
274	kMaskCaseTertiary	= 0x000000FF	// 2 hi bits case, 6 lo bits tertiary
275};
276
277static SInt32 __CompareSpecials(const UCollator *collator, CFOptionFlags options, const UniChar *text1Ptr, UniCharCount text1Length, const UniChar *text2Ptr, UniCharCount text2Length) {
278	UErrorCode icuStatus = U_ZERO_ERROR;
279	SInt32	orderWidth = 0;
280	SInt32	orderCompos = 0;
281
282	UCollationElements * collElems1 = ucol_openElements(collator, (const UChar *)text1Ptr, text1Length, &icuStatus);
283	UCollationElements * collElems2 = ucol_openElements(collator, (const UChar *)text2Ptr, text2Length, &icuStatus);
284	if (U_SUCCESS(icuStatus)) {
285		int32_t	startOffset1 = 0;
286		int32_t	startOffset2 = 0;
287
288		while (true) {
289			int32_t	elemOrder1, elemOrder2;
290			int32_t	offset1, offset2;
291
292			elemOrder1 = ucol_next(collElems1, &icuStatus);
293			elemOrder2 = ucol_next(collElems2, &icuStatus);
294			if ( U_FAILURE(icuStatus) || elemOrder1 == (int32_t)UCOL_NULLORDER || elemOrder2 == (int32_t)UCOL_NULLORDER ) {
295				break;
296			}
297
298			offset1 = ucol_getOffset(collElems1);
299			offset2 = ucol_getOffset(collElems2);
300			if ( (elemOrder1 & kMaskPrimarySecondary) == (elemOrder2 & kMaskPrimarySecondary) ) {
301				if ( (elemOrder1 & kMaskPrimaryOnly) != 0 ) {
302					// keys may differ in case, width, circling, etc.
303
304					int32_t	tertiary1 = (elemOrder1 & kMaskCaseTertiary);
305					int32_t tertiary2 = (elemOrder2 & kMaskCaseTertiary);
306					// fold upper to lower case
307					if (tertiary1 >= kUpperCaseWeightMin && tertiary1 <= kUpperCaseWeightMax) {
308						tertiary1 -= kUpperToLowerDelta;
309					}
310					if (tertiary2 >= kUpperCaseWeightMin && tertiary2 <= kUpperCaseWeightMax) {
311						tertiary2 -= kUpperToLowerDelta;
312					}
313					// now compare
314					if (tertiary1 != tertiary2) {
315						orderWidth = (tertiary1 < tertiary2)? -1: 1;
316						break;
317					}
318
319				} else if ( (elemOrder1 & kMaskSecondaryOnly) != 0 ) {
320					// primary weights are both zero, but secondaries are not.
321					if ( orderCompos == 0 && (options & kCFCompareNonliteral) == 0 ) {
322						// We have a code element which is a diacritic.
323						// It may have come from a composed char or a combining char.
324						// If it came from a combining char (longer element length) it sorts first.
325						// This is only an approximation to what the Mac OS 9 code did, but this is an
326						// unusual case anyway.
327						int32_t	elem1Length = offset1 - startOffset1;
328						int32_t	elem2Length = offset2 - startOffset2;
329						if (elem1Length != elem2Length) {
330							orderCompos = (elem1Length > elem2Length)? -1: 1;
331						}
332					}
333				}
334			}
335
336			startOffset1 = offset1;
337			startOffset2 = offset2;
338		}
339		ucol_closeElements(collElems1);
340		ucol_closeElements(collElems2);
341	}
342
343	return (orderWidth != 0)? orderWidth: orderCompos;
344}
345
346static SInt32 __CompareCodePoints(const UniChar *text1Ptr, UniCharCount text1Length, const UniChar *text2Ptr, UniCharCount text2Length ) {
347	const UniChar *	text1P = text1Ptr;
348	const UniChar *	text2P = text2Ptr;
349	UInt32		textLimit = (text1Length <= text2Length)? text1Length: text2Length;
350	UInt32		textCounter;
351	SInt32		orderResult = 0;
352
353	// Loop through either string...the first difference differentiates this.
354	for (textCounter = 0; textCounter < textLimit && *text1P == *text2P; textCounter++) {
355		text1P++;
356		text2P++;
357	}
358	if (textCounter < textLimit) {
359		// code point difference
360		orderResult = (*text1P < *text2P) ? -1 : 1;
361	} else if (text1Length != text2Length) {
362		// one string has extra stuff at end
363		orderResult = (text1Length < text2Length) ? -1 : 1;
364	}
365	return orderResult;
366}
367
368
369extern const CFStringRef __kCFLocaleCollatorID;
370
371static UCollator *__CFStringCreateCollator(CFLocaleRef compareLocale) {
372    CFStringRef canonLocaleCFStr = (CFStringRef)CFLocaleGetValue(compareLocale, __kCFLocaleCollatorID);
373    char icuLocaleStr[128] = {0};
374    CFStringGetCString(canonLocaleCFStr, icuLocaleStr, sizeof(icuLocaleStr), kCFStringEncodingASCII);
375    UErrorCode icuStatus = U_ZERO_ERROR;
376    UCollator * collator = ucol_open(icuLocaleStr, &icuStatus);
377    ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus);
378    ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, UCOL_NON_IGNORABLE, &icuStatus);
379    ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus);
380    ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus);
381    ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus);
382    return collator;
383}
384
385#define kCFMaxCachedDefaultCollators (8)
386static UCollator *__CFDefaultCollators[kCFMaxCachedDefaultCollators];
387static CFIndex __CFDefaultCollatorsCount = 0;
388static const void *__CFDefaultCollatorLocale = NULL;
389static CFSpinLock_t __CFDefaultCollatorLock = CFSpinLockInit;
390
391static UCollator *__CFStringCopyDefaultCollator(CFLocaleRef compareLocale) {
392    CFLocaleRef currentLocale = NULL;
393    UCollator * collator = NULL;
394
395    if (compareLocale != __CFDefaultCollatorLocale) {
396        currentLocale = CFLocaleCopyCurrent();
397        if (compareLocale != currentLocale) {
398	    CFRelease(currentLocale);
399	    return NULL;
400	}
401    }
402
403    __CFSpinLock(&__CFDefaultCollatorLock);
404    if ((NULL != currentLocale) && (__CFDefaultCollatorLocale != currentLocale)) {
405        while (__CFDefaultCollatorsCount > 0) ucol_close(__CFDefaultCollators[--__CFDefaultCollatorsCount]);
406        __CFDefaultCollatorLocale = CFRetain(currentLocale);
407    }
408
409    if (__CFDefaultCollatorsCount > 0) collator = __CFDefaultCollators[--__CFDefaultCollatorsCount];
410    __CFSpinUnlock(&__CFDefaultCollatorLock);
411
412    if (NULL == collator) {
413	collator = __CFStringCreateCollator(compareLocale);
414    }
415
416    if (NULL != currentLocale) CFRelease(currentLocale);
417
418    return collator;
419}
420
421#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
422static void __collatorFinalize(UCollator *collator) {
423    CFLocaleRef locale = _CFGetTSD(__CFTSDKeyCollatorLocale);
424    _CFSetTSD(__CFTSDKeyCollatorUCollator, NULL, NULL);
425    _CFSetTSD(__CFTSDKeyCollatorLocale, NULL, NULL);
426    __CFSpinLock(&__CFDefaultCollatorLock);
427    if ((__CFDefaultCollatorLocale == locale) && (__CFDefaultCollatorsCount < kCFMaxCachedDefaultCollators)) {
428        __CFDefaultCollators[__CFDefaultCollatorsCount++] = collator;
429        collator = NULL;
430    }
431    __CFSpinUnlock(&__CFDefaultCollatorLock);
432    if (NULL != collator) ucol_close(collator);
433    if (locale) CFRelease(locale);
434}
435#endif
436
437// -------------------------------------------------------------------------------------------------
438// __CompareTextDefault
439//
440// A primary difference is denoted by values 2/-2 in orderP. Other differences are indicated with a -1/1.
441// A negative value indicates that text1 sorts before text2.
442// -------------------------------------------------------------------------------------------------
443static OSStatus __CompareTextDefault(UCollator *collator, CFOptionFlags options, const UniChar *text1Ptr, UniCharCount text1Length, const UniChar *text2Ptr, UniCharCount text2Length, Boolean *equivalentP, SInt32 *orderP) {
444
445	// collator must have default settings restored on exit from this function
446
447	*equivalentP = true;
448	*orderP = 0;
449
450	if (options & kCFCompareNumerically) {
451	    UErrorCode icuStatus = U_ZERO_ERROR;
452	    ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_ON, &icuStatus);
453	}
454
455	// Most string differences are Primary. Do a primary check first, then if there
456	// are no differences do a comparison with the options in the collator.
457	UCollationResult icuResult = ucol_strcoll(collator, (const UChar *)text1Ptr, text1Length, (const UChar *)text2Ptr, text2Length);
458	if (icuResult != UCOL_EQUAL) {
459		*orderP = (icuResult == UCOL_LESS) ? -2 : 2;
460	}
461	if (*orderP == 0) {
462		UErrorCode icuStatus = U_ZERO_ERROR;
463                ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_ON, &icuStatus);
464                ucol_setAttribute(collator, UCOL_STRENGTH, (options & kCFCompareDiacriticInsensitive) ? UCOL_PRIMARY : UCOL_SECONDARY, &icuStatus);
465                ucol_setAttribute(collator, UCOL_CASE_LEVEL, (options & kCFCompareCaseInsensitive) ? UCOL_OFF : UCOL_ON, &icuStatus);
466		if (!U_SUCCESS(icuStatus)) {
467		    icuStatus = U_ZERO_ERROR;
468		    ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus);
469		    ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus);
470		    ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus);
471		    ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus);
472		    return 666;
473		}
474
475		// We don't have a primary difference. Recompare with standard collator.
476		icuResult = ucol_strcoll(collator, (const UChar *)text1Ptr, text1Length, (const UChar *)text2Ptr, text2Length);
477		if (icuResult != UCOL_EQUAL) {
478			*orderP = (icuResult == UCOL_LESS) ? -1 : 1;
479		}
480		icuStatus = U_ZERO_ERROR;
481                ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus);
482		ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus);
483		ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus);
484	}
485	if (*orderP == 0 && (options & kCFCompareNonliteral) == 0) {
486		*orderP = __CompareSpecials(collator, options, text1Ptr, text1Length, text2Ptr, text2Length);
487	}
488
489	*equivalentP = (*orderP == 0);
490
491	// If strings are equivalent but we care about order and have not yet checked
492	// to the level of code point order, then do some more checks for order
493	if (*orderP == 0) {
494		UErrorCode icuStatus = U_ZERO_ERROR;
495		// First try to see if ICU can find any differences above code point level
496                ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_ON, &icuStatus);
497		ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_TERTIARY, &icuStatus);
498		ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_ON, &icuStatus);
499		if (!U_SUCCESS(icuStatus)) {
500		    icuStatus = U_ZERO_ERROR;
501		    ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus);
502		    ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus);
503		    ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus);
504		    ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus);
505		    return 666;
506		}
507		icuResult = ucol_strcoll(collator, (const UChar *)text1Ptr, text1Length, (const UChar *)text2Ptr, text2Length);
508		if (icuResult != UCOL_EQUAL) {
509			*orderP = (icuResult == UCOL_LESS) ? -1 : 1;
510		} else {
511			// no ICU differences above code point level, compare code points
512			*orderP = __CompareCodePoints( text1Ptr, text1Length, text2Ptr, text2Length );
513		}
514		icuStatus = U_ZERO_ERROR;
515                ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus);
516		ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus);
517		ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus);
518	}
519
520	if (options & kCFCompareNumerically) {
521	    UErrorCode icuStatus = U_ZERO_ERROR;
522	    ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus);
523	}
524	return 0; // noErr
525}
526
527#endif // DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
528
529static inline CFIndex __extendLocationBackward(CFIndex location, CFStringInlineBuffer *str, const uint8_t *nonBaseBMP, const uint8_t *punctBMP) {
530    while (location > 0) {
531        UTF32Char ch = CFStringGetCharacterFromInlineBuffer(str, location);
532        UTF32Char otherChar;
533        if (CFUniCharIsSurrogateLowCharacter(ch) && CFUniCharIsSurrogateHighCharacter((otherChar = CFStringGetCharacterFromInlineBuffer(str, location - 1)))) {
534            ch = CFUniCharGetLongCharacterForSurrogatePair(ch, otherChar);
535            uint8_t planeNo = (ch >> 16);
536            if ((planeNo > 1) || (!CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, planeNo)) && !CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharPunctuationCharacterSet, planeNo)))) break;
537            location -= 2;
538        } else {
539            if ((!CFUniCharIsMemberOfBitmap(ch, nonBaseBMP) && !CFUniCharIsMemberOfBitmap(ch, punctBMP)) || ((ch >= 0x2E80) && (ch < 0xAC00))) break;
540            --location;
541        }
542    }
543
544    return location;
545}
546
547static inline CFIndex __extendLocationForward(CFIndex location, CFStringInlineBuffer *str, const uint8_t *alnumBMP, const uint8_t *punctBMP, const uint8_t *controlBMP, CFIndex strMax) {
548    do {
549        UTF32Char ch = CFStringGetCharacterFromInlineBuffer(str, location);
550        UTF32Char otherChar;
551        if (CFUniCharIsSurrogateHighCharacter(ch) && CFUniCharIsSurrogateLowCharacter((otherChar = CFStringGetCharacterFromInlineBuffer(str, location + 1)))) {
552            ch = CFUniCharGetLongCharacterForSurrogatePair(ch, otherChar);
553            location += 2;
554            uint8_t planeNo = (ch >> 16);
555            if (!CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharAlphaNumericCharacterSet, planeNo)) && !CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharPunctuationCharacterSet, planeNo)) && !CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharControlAndFormatterCharacterSet, planeNo))) break;
556        } else {
557            ++location;
558            if ((!CFUniCharIsMemberOfBitmap(ch, alnumBMP) && !CFUniCharIsMemberOfBitmap(ch, punctBMP) && !CFUniCharIsMemberOfBitmap(ch, controlBMP)) || ((ch >= 0x2E80) && (ch < 0xAC00))) break;
559        }
560    } while (location < strMax);
561    return location;
562}
563
564CF_PRIVATE CFComparisonResult _CFCompareStringsWithLocale(CFStringInlineBuffer *str1, CFRange str1Range, CFStringInlineBuffer *str2, CFRange str2Range, CFOptionFlags options, const void *compareLocale) {
565    const UniChar *characters1;
566    const UniChar *characters2;
567    CFComparisonResult compResult = kCFCompareEqualTo;
568    CFRange range1 = str1Range;
569    CFRange range2 = str2Range;
570    SInt32 order;
571#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
572    Boolean isEqual;
573    bool forcedOrdering = ((options & kCFCompareForcedOrdering) ? true : false);
574
575    UCollator *collator = NULL;
576    bool defaultCollator = true;
577#endif
578    static const uint8_t *alnumBMP = NULL;
579    static const uint8_t *nonBaseBMP = NULL;
580    static const uint8_t *punctBMP = NULL;
581    static const uint8_t *controlBMP = NULL;
582
583    if (NULL == alnumBMP) {
584	alnumBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharAlphaNumericCharacterSet, 0);
585	nonBaseBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0);
586	punctBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharPunctuationCharacterSet, 0);
587	controlBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharControlAndFormatterCharacterSet, 0);
588    }
589
590    // Determine the range of characters surrodiing the current index significant for localized comparison. The range is extended backward and forward as long as they are contextual. Contextual characters include all letters and punctuations. Since most control/format characters are ignorable in localized comparison, we also include them extending forward.
591
592    range1.location = str1Range.location;
593    range2.location = str2Range.location;
594
595    // go backward
596    // The characters upto the current index are already determined to be equal by the CFString's standard character folding algorithm. Extend as long as truly contextual (all letters and punctuations).
597    if (range1.location > 0) {
598	range1.location = __extendLocationBackward(range1.location - 1, str1, nonBaseBMP, punctBMP);
599    }
600
601    if (range2.location > 0) {
602	range2.location = __extendLocationBackward(range2.location - 1, str2, nonBaseBMP, punctBMP);
603    }
604
605#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
606#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
607    // First we try to use the last one used on this thread, if the locale is the same,
608    // otherwise we try to check out a default one, or then we create one.
609    UCollator *threadCollator = _CFGetTSD(__CFTSDKeyCollatorUCollator);
610    CFLocaleRef threadLocale = _CFGetTSD(__CFTSDKeyCollatorLocale);
611    if (compareLocale == threadLocale) {
612	collator = threadCollator;
613    } else {
614#endif
615	collator = __CFStringCopyDefaultCollator((CFLocaleRef)compareLocale);
616	defaultCollator = true;
617	if (NULL == collator) {
618	    collator = __CFStringCreateCollator((CFLocaleRef)compareLocale);
619	    defaultCollator = false;
620	}
621#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
622    }
623#endif
624#endif
625
626    characters1 = CFStringGetCharactersPtrFromInlineBuffer(str1, range1);
627    characters2 = CFStringGetCharactersPtrFromInlineBuffer(str2, range2);
628
629    if ((NULL != characters1) && (NULL != characters2)) { // do fast
630	range1.length = (str1Range.location + str1Range.length) - range1.location;
631	range2.length = (str2Range.location + str2Range.length) - range2.location;
632
633#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
634        if ((NULL != collator) && (__CompareTextDefault(collator, options, characters1, range1.length, characters2, range2.length, &isEqual, &order) == 0 /* noErr */)) {
635            compResult = ((isEqual && !forcedOrdering) ? kCFCompareEqualTo : ((order < 0) ? kCFCompareLessThan : kCFCompareGreaterThan));
636        } else
637#endif
638        {
639            compResult = ((memcmp(characters1, characters2, sizeof(UniChar) * range1.length) < 0) ? kCFCompareLessThan : kCFCompareGreaterThan);
640        }
641    } else {
642        UniChar *buffer1 = NULL;
643        UniChar *buffer2 = NULL;
644        UTF16Char sBuffer1[kCFStringCompareAllocationIncrement];
645        UTF16Char sBuffer2[kCFStringCompareAllocationIncrement];
646        CFIndex buffer1Len = 0, buffer2Len = 0;
647        CFIndex str1Max = str1Range.location + str1Range.length;
648        CFIndex str2Max = str2Range.location + str2Range.length;
649        CFIndex bufferSize;
650
651        // Extend forward and compare until the result is deterministic. The result is indeterministic if the differences are weak and can be resolved by character folding. For example, comparision between "abc" and "ABC" is considered to be indeterministic.
652        do {
653            if (str1Range.location < str1Max) {
654		str1Range.location = __extendLocationForward(str1Range.location, str1, alnumBMP, punctBMP, controlBMP, str1Max);
655                range1.length = (str1Range.location - range1.location);
656                characters1 = CFStringGetCharactersPtrFromInlineBuffer(str1, range1);
657
658                if (NULL == characters1) {
659                    if ((0 > buffer1Len) || (range1.length > kCFStringCompareAllocationIncrement)) {
660                        if (buffer1Len < range1.length) {
661                            bufferSize = range1.length + (kCFStringCompareAllocationIncrement - (range1.length % kCFStringCompareAllocationIncrement));
662                            if (0 == buffer1Len) {
663                                buffer1 = (UniChar *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(UTF16Char) * bufferSize, 0);
664                            } else if (buffer1Len < range1.length) {
665                                buffer1 = (UniChar *)CFAllocatorReallocate(kCFAllocatorSystemDefault, buffer1, sizeof(UTF16Char) * bufferSize, 0);
666                            }
667                            buffer1Len = bufferSize;
668                        }
669                    } else {
670                        buffer1 = sBuffer1;
671                    }
672
673                    CFStringGetCharactersFromInlineBuffer(str1, range1, buffer1);
674                    characters1 = buffer1;
675                }
676            }
677
678            if (str2Range.location < str2Max) {
679		str2Range.location = __extendLocationForward(str2Range.location, str2, alnumBMP, punctBMP, controlBMP, str2Max);
680                range2.length = (str2Range.location - range2.location);
681                characters2 = CFStringGetCharactersPtrFromInlineBuffer(str2, range2);
682
683                if (NULL == characters2) {
684                    if ((0 > buffer2Len) || (range2.length > kCFStringCompareAllocationIncrement)) {
685                        if (buffer2Len < range2.length) {
686                            bufferSize = range2.length + (kCFStringCompareAllocationIncrement - (range2.length % kCFStringCompareAllocationIncrement));
687                            if (0 == buffer2Len) {
688                                buffer2 = (UniChar *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(UTF16Char) * bufferSize, 0);
689                            } else if (buffer2Len < range2.length) {
690                                buffer2 = (UniChar *)CFAllocatorReallocate(kCFAllocatorSystemDefault, buffer2, sizeof(UTF16Char) * bufferSize, 0);
691                            }
692                            buffer2Len = bufferSize;
693                        }
694                    } else {
695                        buffer2 = sBuffer2;
696                    }
697
698                    CFStringGetCharactersFromInlineBuffer(str2, range2, buffer2);
699                    characters2 = buffer2;
700                }
701            }
702
703#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
704            if ((NULL != collator) && (__CompareTextDefault(collator, options, characters1, range1.length, characters2, range2.length, &isEqual, &order) ==  0 /* noErr */)) {
705                if (isEqual) {
706                    if (forcedOrdering && (kCFCompareEqualTo == compResult) && (0 != order)) compResult = ((order < 0) ? kCFCompareLessThan : kCFCompareGreaterThan);
707                    order = 0;
708                }
709            } else
710#endif
711            {
712                order = memcmp(characters1, characters2, sizeof(UTF16Char) * ((range1.length < range2.length) ? range1.length : range2.length));
713                if (0 == order) {
714                    if (range1.length < range2.length) {
715                        order = -2;
716                    } else if (range2.length < range1.length) {
717                        order = 2;
718                    }
719                } else if (order < 0) {
720                    --order;
721                } else if (order > 0) {
722                    ++order;
723                }
724            }
725
726            if ((order < -1) || (order > 1)) break; // the result is deterministic
727
728            if (0 == order) {
729                range1.location = str1Range.location;
730                range2.location = str2Range.location;
731            }
732        } while ((str1Range.location < str1Max) || (str2Range.location < str2Max));
733
734        if (0 != order) compResult = ((order < 0) ? kCFCompareLessThan : kCFCompareGreaterThan);
735
736        if (buffer1Len > 0) CFAllocatorDeallocate(kCFAllocatorSystemDefault, buffer1);
737        if (buffer2Len > 0) CFAllocatorDeallocate(kCFAllocatorSystemDefault, buffer2);
738    }
739
740#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
741    if (collator == threadCollator) {
742	// do nothing, already cached
743    } else {
744	if (threadLocale) __collatorFinalize((UCollator *)_CFGetTSD(__CFTSDKeyCollatorUCollator)); // need to dealloc collators
745
746	_CFSetTSD(__CFTSDKeyCollatorUCollator, collator, (void *)__collatorFinalize);
747	_CFSetTSD(__CFTSDKeyCollatorLocale, (void *)CFRetain(compareLocale), NULL);
748    }
749#endif
750
751    return compResult;
752}
753
754