1/* 2 * Copyright (c) 2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. Please obtain a copy of the License at 10 * http://www.opensource.apple.com/apsl/ and read it before using this 11 * file. 12 * 13 * The Original Code and all software distributed under the License are 14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 18 * Please see the License for the specific language governing rights and 19 * limitations under the License. 20 * 21 * @APPLE_LICENSE_HEADER_END@ 22 */ 23 24/* CFUniChar.c 25 Copyright (c) 2001-2013, Apple Inc. All rights reserved. 26 Responsibility: Aki Inoue 27*/ 28 29#include <CoreFoundation/CFByteOrder.h> 30#include "CFInternal.h" 31#include "CFUniChar.h" 32#include "CFStringEncodingConverterExt.h" 33#include "CFUnicodeDecomposition.h" 34#include "CFUniCharPriv.h" 35#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI || DEPLOYMENT_TARGET_LINUX || DEPLOYMENT_TARGET_FREEBSD 36#include <fcntl.h> 37#include <sys/types.h> 38#include <sys/stat.h> 39#include <sys/param.h> 40#include <sys/mman.h> 41#include <unistd.h> 42#include <stdlib.h> 43#endif 44#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED 45#include <mach/mach.h> 46#endif 47 48#if DEPLOYMENT_TARGET_WINDOWS 49extern void _CFGetFrameworkPath(wchar_t *path, int maxLength); 50#endif 51 52#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED 53#define __kCFCharacterSetDir "/System/Library/CoreServices" 54#elif DEPLOYMENT_TARGET_LINUX || DEPLOYMENT_TARGET_FREEBSD || DEPLOYMENT_TARGET_EMBEDDED_MINI 55#define __kCFCharacterSetDir "/usr/local/share/CoreFoundation" 56#elif DEPLOYMENT_TARGET_WINDOWS 57#define __kCFCharacterSetDir "\\Windows\\CoreFoundation" 58#endif 59 60#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED 61#define USE_MACHO_SEGMENT 1 62#endif 63 64enum { 65 kCFUniCharLastExternalSet = kCFUniCharNewlineCharacterSet, 66 kCFUniCharFirstInternalSet = kCFUniCharCompatibilityDecomposableCharacterSet, 67 kCFUniCharLastInternalSet = kCFUniCharGraphemeExtendCharacterSet, 68 kCFUniCharFirstBitmapSet = kCFUniCharDecimalDigitCharacterSet 69}; 70 71CF_INLINE uint32_t __CFUniCharMapExternalSetToInternalIndex(uint32_t cset) { return ((kCFUniCharFirstInternalSet <= cset) ? ((cset - kCFUniCharFirstInternalSet) + kCFUniCharLastExternalSet) : cset) - kCFUniCharFirstBitmapSet; } 72CF_INLINE uint32_t __CFUniCharMapCompatibilitySetID(uint32_t cset) { return ((cset == kCFUniCharControlCharacterSet) ? kCFUniCharControlAndFormatterCharacterSet : (((cset > kCFUniCharLastExternalSet) && (cset < kCFUniCharFirstInternalSet)) ? ((cset - kCFUniCharLastExternalSet) + kCFUniCharFirstInternalSet) : cset)); } 73 74#if (DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED) && USE_MACHO_SEGMENT 75#include <mach-o/getsect.h> 76#include <mach-o/dyld.h> 77#include <mach-o/ldsyms.h> 78 79extern const void* unicode_csbitmaps_section_start __asm("section$start$__UNICODE$__csbitmaps"); 80extern const void* unicode_csbitmaps_section_end __asm("section$end$__UNICODE$__csbitmaps"); 81extern const void* unicode_properties_section_start __asm("section$start$__UNICODE$__properties"); 82extern const void* unicode_properties_section_end __asm("section$end$__UNICODE$__properties"); 83extern const void* unicode_data_section_start __asm("section$start$__UNICODE$__data"); 84extern const void* unicode_data_section_end __asm("section$end$__UNICODE$__data"); 85 86static const void *__CFGetSectDataPtr(const char *segname, const char *sectname, uint64_t *sizep) { 87 // special case three common sections to have fast access 88 if ( strcmp(segname, "__UNICODE") == 0 ) { 89 if ( strcmp(sectname, "__csbitmaps") == 0) { 90 if (sizep) *sizep = &unicode_csbitmaps_section_end - &unicode_csbitmaps_section_start; 91 return &unicode_csbitmaps_section_start; 92 } 93 else if ( strcmp(sectname, "__properties") == 0 ) { 94 if (sizep) *sizep = &unicode_properties_section_end - &unicode_properties_section_start; 95 return &unicode_properties_section_start; 96 } 97 else if ( strcmp(sectname, "__data") == 0 ) { 98 if (sizep) *sizep = &unicode_data_section_end - &unicode_data_section_start; 99 return &unicode_data_section_start; 100 } 101 } 102 103 uint32_t idx, cnt = _dyld_image_count(); 104 for (idx = 0; idx < cnt; idx++) { 105 void *mh = (void *)_dyld_get_image_header(idx); 106 if (mh != &_mh_dylib_header) continue; 107#if __LP64__ 108 const struct section_64 *sect = getsectbynamefromheader_64((struct mach_header_64 *)mh, segname, sectname); 109#else 110 const struct section *sect = getsectbynamefromheader((struct mach_header *)mh, segname, sectname); 111#endif 112 if (!sect) break; 113 if (sizep) *sizep = (uint64_t)sect->size; 114 return (char *)sect->addr + _dyld_get_image_vmaddr_slide(idx); 115 } 116 if (sizep) *sizep = 0ULL; 117 return NULL; 118} 119#endif 120 121#if !USE_MACHO_SEGMENT 122 123// Memory map the file 124 125#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI || DEPLOYMENT_TARGET_LINUX 126CF_INLINE void __CFUniCharCharacterSetPath(char *cpath) { 127#elif DEPLOYMENT_TARGET_WINDOWS 128CF_INLINE void __CFUniCharCharacterSetPath(wchar_t *wpath) { 129#else 130#error Unknown or unspecified DEPLOYMENT_TARGET 131#endif 132#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED 133 strlcpy(cpath, __kCFCharacterSetDir, MAXPATHLEN); 134#elif DEPLOYMENT_TARGET_LINUX 135 strlcpy(cpath, __kCFCharacterSetDir, MAXPATHLEN); 136#elif DEPLOYMENT_TARGET_WINDOWS 137 wchar_t frameworkPath[MAXPATHLEN]; 138 _CFGetFrameworkPath(frameworkPath, MAXPATHLEN); 139 wcsncpy(wpath, frameworkPath, MAXPATHLEN); 140 wcsncat(wpath, L"\\CoreFoundation.resources\\", MAXPATHLEN - wcslen(wpath)); 141#else 142 strlcpy(cpath, __kCFCharacterSetDir, MAXPATHLEN); 143 strlcat(cpath, "/CharacterSets/", MAXPATHLEN); 144#endif 145} 146 147#if DEPLOYMENT_TARGET_WINDOWS 148#define MAX_BITMAP_STATE 512 149// 150// If a string is placed into this array, then it has been previously 151// determined that the bitmap-file cannot be found. Thus, we make 152// the assumption it won't be there in future calls and we avoid 153// hitting the disk un-necessarily. This assumption isn't 100% 154// correct, as bitmap-files can be added. We would have to re-start 155// the application in order to pick-up the new bitmap info. 156// 157// We should probably re-visit this. 158// 159static wchar_t *mappedBitmapState[MAX_BITMAP_STATE]; 160static int __nNumStateEntries = -1; 161CRITICAL_SECTION __bitmapStateLock = {0}; 162 163bool __GetBitmapStateForName(const wchar_t *bitmapName) { 164 if (NULL == __bitmapStateLock.DebugInfo) 165 InitializeCriticalSection(&__bitmapStateLock); 166 EnterCriticalSection(&__bitmapStateLock); 167 if (__nNumStateEntries >= 0) { 168 for (int i = 0; i < __nNumStateEntries; i++) { 169 if (wcscmp(mappedBitmapState[i], bitmapName) == 0) { 170 LeaveCriticalSection(&__bitmapStateLock); 171 return true; 172 } 173 } 174 } 175 LeaveCriticalSection(&__bitmapStateLock); 176 return false; 177} 178void __AddBitmapStateForName(const wchar_t *bitmapName) { 179 if (NULL == __bitmapStateLock.DebugInfo) 180 InitializeCriticalSection(&__bitmapStateLock); 181 EnterCriticalSection(&__bitmapStateLock); 182 __nNumStateEntries++; 183 mappedBitmapState[__nNumStateEntries] = (wchar_t *)malloc((lstrlenW(bitmapName)+1) * sizeof(wchar_t)); 184 lstrcpyW(mappedBitmapState[__nNumStateEntries], bitmapName); 185 LeaveCriticalSection(&__bitmapStateLock); 186} 187#endif 188 189#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI || DEPLOYMENT_TARGET_LINUX 190static bool __CFUniCharLoadBytesFromFile(const char *fileName, const void **bytes, int64_t *fileSize) { 191#elif DEPLOYMENT_TARGET_WINDOWS 192static bool __CFUniCharLoadBytesFromFile(const wchar_t *fileName, const void **bytes, int64_t *fileSize) { 193#else 194#error Unknown or unspecified DEPLOYMENT_TARGET 195#endif 196#if DEPLOYMENT_TARGET_WINDOWS 197 HANDLE bitmapFileHandle = NULL; 198 HANDLE mappingHandle = NULL; 199 200 if (__GetBitmapStateForName(fileName)) { 201 // The fileName has been tried in the past, so just return false 202 // and move on. 203 *bytes = NULL; 204 return false; 205 } 206 mappingHandle = OpenFileMappingW(FILE_MAP_READ, TRUE, fileName); 207 if (NULL == mappingHandle) { 208 if ((bitmapFileHandle = CreateFileW(fileName, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE) { 209 // We tried to get the bitmap file for mapping, but it's not there. Add to list of non-existant bitmap-files so 210 // we don't have to try this again in the future. 211 __AddBitmapStateForName(fileName); 212 return false; 213 } 214 mappingHandle = CreateFileMapping(bitmapFileHandle, NULL, PAGE_READONLY, 0, 0, NULL); 215 CloseHandle(bitmapFileHandle); 216 if (!mappingHandle) return false; 217 } 218 219 *bytes = MapViewOfFileEx(mappingHandle, FILE_MAP_READ, 0, 0, 0, 0); 220 221 if (NULL != fileSize) { 222 MEMORY_BASIC_INFORMATION memoryInfo; 223 224 if (0 == VirtualQueryEx(mappingHandle, *bytes, &memoryInfo, sizeof(memoryInfo))) { 225 *fileSize = 0; // This indicates no checking. Is it right ? 226 } else { 227 *fileSize = memoryInfo.RegionSize; 228 } 229 } 230 231 CloseHandle(mappingHandle); 232 233 return (*bytes ? true : false); 234#else 235 struct stat statBuf; 236 int fd = -1; 237 238 if ((fd = open(fileName, O_RDONLY, 0)) < 0) { 239 return false; 240 } 241 if (fstat(fd, &statBuf) < 0 || (*bytes = mmap(0, statBuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0)) == (void *)-1) { 242 close(fd); 243 return false; 244 } 245 close(fd); 246 247 if (NULL != fileSize) *fileSize = statBuf.st_size; 248 249 return true; 250#endif 251} 252 253#endif // USE_MACHO_SEGMENT 254 255#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI || DEPLOYMENT_TARGET_LINUX 256static bool __CFUniCharLoadFile(const char *bitmapName, const void **bytes, int64_t *fileSize) { 257#elif DEPLOYMENT_TARGET_WINDOWS 258static bool __CFUniCharLoadFile(const wchar_t *bitmapName, const void **bytes, int64_t *fileSize) { 259#else 260#error Unknown or unspecified DEPLOYMENT_TARGET 261#endif 262#if USE_MACHO_SEGMENT 263 *bytes = __CFGetSectDataPtr("__UNICODE", bitmapName, NULL); 264 265 if (NULL != fileSize) *fileSize = 0; 266 267 return *bytes ? true : false; 268#else 269#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI || DEPLOYMENT_TARGET_LINUX 270 char cpath[MAXPATHLEN]; 271 __CFUniCharCharacterSetPath(cpath); 272 strlcat(cpath, bitmapName, MAXPATHLEN); 273 Boolean needToFree = false; 274 const char *possiblyFrameworkRootedCPath = CFPathRelativeToAppleFrameworksRoot(cpath, &needToFree); 275 bool result = __CFUniCharLoadBytesFromFile(possiblyFrameworkRootedCPath, bytes, fileSize); 276 if (needToFree) free((void *)possiblyFrameworkRootedCPath); 277 return result; 278#elif DEPLOYMENT_TARGET_WINDOWS 279 wchar_t wpath[MAXPATHLEN]; 280 __CFUniCharCharacterSetPath(wpath); 281 wcsncat(wpath, bitmapName, MAXPATHLEN); 282 return __CFUniCharLoadBytesFromFile(wpath, bytes, fileSize); 283#else 284#error Unknown or unspecified DEPLOYMENT_TARGET 285#endif 286#endif 287} 288 289// Bitmap functions 290CF_INLINE bool isControl(UTF32Char theChar, uint16_t charset, const void *data) { // ISO Control 291 return (((theChar <= 0x001F) || (theChar >= 0x007F && theChar <= 0x009F)) ? true : false); 292} 293 294CF_INLINE bool isWhitespace(UTF32Char theChar, uint16_t charset, const void *data) { // Space 295 return (((theChar == 0x0020) || (theChar == 0x0009) || (theChar == 0x00A0) || (theChar == 0x1680) || (theChar >= 0x2000 && theChar <= 0x200B) || (theChar == 0x202F) || (theChar == 0x205F) || (theChar == 0x3000)) ? true : false); 296} 297 298CF_INLINE bool isNewline(UTF32Char theChar, uint16_t charset, const void *data) { // White space 299 return (((theChar >= 0x000A && theChar <= 0x000D) || (theChar == 0x0085) || (theChar == 0x2028) || (theChar == 0x2029)) ? true : false); 300} 301 302CF_INLINE bool isWhitespaceAndNewline(UTF32Char theChar, uint16_t charset, const void *data) { // White space 303 return ((isWhitespace(theChar, charset, data) || isNewline(theChar, charset, data)) ? true : false); 304} 305 306#if USE_MACHO_SEGMENT 307CF_INLINE bool __CFSimpleFileSizeVerification(const void *bytes, int64_t fileSize) { return true; } 308#elif 1 309// <rdar://problem/8961744> __CFSimpleFileSizeVerification is broken 310static bool __CFSimpleFileSizeVerification(const void *bytes, int64_t fileSize) { return true; } 311#else 312static bool __CFSimpleFileSizeVerification(const void *bytes, int64_t fileSize) { 313 bool result = true; 314 315 if (fileSize > 0) { 316 if ((sizeof(uint32_t) * 2) > fileSize) { 317 result = false; 318 } else { 319 uint32_t headerSize = CFSwapInt32BigToHost(*((uint32_t *)((char *)bytes + 4))); 320 321 if ((headerSize < (sizeof(uint32_t) * 4)) || (headerSize > fileSize)) { 322 result = false; 323 } else { 324 const uint32_t *lastElement = (uint32_t *)(((uint8_t *)bytes) + headerSize) - 2; 325 326 if ((headerSize + CFSwapInt32BigToHost(lastElement[0]) + CFSwapInt32BigToHost(lastElement[1])) > headerSize) result = false; 327 } 328 } 329 } 330 331 if (!result) CFLog(kCFLogLevelCritical, CFSTR("File size verification for Unicode database file failed.")); 332 333 return result; 334} 335#endif // USE_MACHO_SEGMENT 336 337typedef struct { 338 uint32_t _numPlanes; 339 const uint8_t **_planes; 340} __CFUniCharBitmapData; 341 342static char __CFUniCharUnicodeVersionString[8] = {0, 0, 0, 0, 0, 0, 0, 0}; 343 344static uint32_t __CFUniCharNumberOfBitmaps = 0; 345static __CFUniCharBitmapData *__CFUniCharBitmapDataArray = NULL; 346 347static CFSpinLock_t __CFUniCharBitmapLock = CFSpinLockInit; 348 349#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI || DEPLOYMENT_TARGET_LINUX 350#if !defined(CF_UNICHAR_BITMAP_FILE) 351#if USE_MACHO_SEGMENT 352#define CF_UNICHAR_BITMAP_FILE "__csbitmaps" 353#else 354#define CF_UNICHAR_BITMAP_FILE "/CFCharacterSetBitmaps.bitmap" 355#endif 356#endif 357#elif DEPLOYMENT_TARGET_WINDOWS 358#if !defined(CF_UNICHAR_BITMAP_FILE) 359#define CF_UNICHAR_BITMAP_FILE L"CFCharacterSetBitmaps.bitmap" 360#endif 361#else 362#error Unknown or unspecified DEPLOYMENT_TARGET 363#endif 364 365static bool __CFUniCharLoadBitmapData(void) { 366 __CFUniCharBitmapData *array; 367 uint32_t headerSize; 368 uint32_t bitmapSize; 369 int numPlanes; 370 uint8_t currentPlane; 371 const void *bytes; 372 const void *bitmapBase; 373 const void *bitmap; 374 int idx, bitmapIndex; 375 int64_t fileSize; 376 377 __CFSpinLock(&__CFUniCharBitmapLock); 378 379 if (__CFUniCharBitmapDataArray || !__CFUniCharLoadFile(CF_UNICHAR_BITMAP_FILE, &bytes, &fileSize) || !__CFSimpleFileSizeVerification(bytes, fileSize)) { 380 __CFSpinUnlock(&__CFUniCharBitmapLock); 381 return false; 382 } 383 384 for (idx = 0;idx < 4 && ((const uint8_t *)bytes)[idx];idx++) { 385 __CFUniCharUnicodeVersionString[idx * 2] = ((const uint8_t *)bytes)[idx]; 386 __CFUniCharUnicodeVersionString[idx * 2 + 1] = '.'; 387 } 388 __CFUniCharUnicodeVersionString[(idx < 4 ? idx * 2 - 1 : 7)] = '\0'; 389 390 headerSize = CFSwapInt32BigToHost(*((uint32_t *)((char *)bytes + 4))); 391 392 bitmapBase = (uint8_t *)bytes + headerSize; 393 bytes = (uint8_t *)bytes + (sizeof(uint32_t) * 2); 394 headerSize -= (sizeof(uint32_t) * 2); 395 396 __CFUniCharNumberOfBitmaps = headerSize / (sizeof(uint32_t) * 2); 397 398 array = (__CFUniCharBitmapData *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(__CFUniCharBitmapData) * __CFUniCharNumberOfBitmaps, 0); 399 400 for (idx = 0;idx < (int)__CFUniCharNumberOfBitmaps;idx++) { 401 bitmap = (uint8_t *)bitmapBase + CFSwapInt32BigToHost(*((uint32_t *)bytes)); bytes = (uint8_t *)bytes + sizeof(uint32_t); 402 bitmapSize = CFSwapInt32BigToHost(*((uint32_t *)bytes)); bytes = (uint8_t *)bytes + sizeof(uint32_t); 403 404 numPlanes = bitmapSize / (8 * 1024); 405 numPlanes = *(const uint8_t *)((char *)bitmap + (((numPlanes - 1) * ((8 * 1024) + 1)) - 1)) + 1; 406 array[idx]._planes = (const uint8_t **)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(const void *) * numPlanes, 0); 407 array[idx]._numPlanes = numPlanes; 408 409 currentPlane = 0; 410 for (bitmapIndex = 0;bitmapIndex < numPlanes;bitmapIndex++) { 411 if (bitmapIndex == currentPlane) { 412 array[idx]._planes[bitmapIndex] = (const uint8_t *)bitmap; 413 bitmap = (uint8_t *)bitmap + (8 * 1024); 414#if defined (__cplusplus) 415 currentPlane = *(((const uint8_t*&)bitmap)++); 416#else 417 currentPlane = *((const uint8_t *)bitmap++); 418#endif 419 420 } else { 421 array[idx]._planes[bitmapIndex] = NULL; 422 } 423 } 424 } 425 426 __CFUniCharBitmapDataArray = array; 427 428 __CFSpinUnlock(&__CFUniCharBitmapLock); 429 430 return true; 431} 432 433CF_PRIVATE const char *__CFUniCharGetUnicodeVersionString(void) { 434 if (NULL == __CFUniCharBitmapDataArray) __CFUniCharLoadBitmapData(); 435 return __CFUniCharUnicodeVersionString; 436} 437 438bool CFUniCharIsMemberOf(UTF32Char theChar, uint32_t charset) { 439 charset = __CFUniCharMapCompatibilitySetID(charset); 440 441 switch (charset) { 442 case kCFUniCharWhitespaceCharacterSet: 443 return isWhitespace(theChar, charset, NULL); 444 445 case kCFUniCharWhitespaceAndNewlineCharacterSet: 446 return isWhitespaceAndNewline(theChar, charset, NULL); 447 448 case kCFUniCharNewlineCharacterSet: 449 return isNewline(theChar, charset, NULL); 450 451 default: { 452 uint32_t tableIndex = __CFUniCharMapExternalSetToInternalIndex(charset); 453 454 if (NULL == __CFUniCharBitmapDataArray) __CFUniCharLoadBitmapData(); 455 456 if (tableIndex < __CFUniCharNumberOfBitmaps) { 457 __CFUniCharBitmapData *data = __CFUniCharBitmapDataArray + tableIndex; 458 uint8_t planeNo = (theChar >> 16) & 0xFF; 459 460 // The bitmap data for kCFUniCharIllegalCharacterSet is actually LEGAL set less Plane 14 ~ 16 461 if (charset == kCFUniCharIllegalCharacterSet) { 462 if (planeNo == 0x0E) { // Plane 14 463 theChar &= 0xFF; 464 return (((theChar == 0x01) || ((theChar > 0x1F) && (theChar < 0x80))) ? false : true); 465 } else if (planeNo == 0x0F || planeNo == 0x10) { // Plane 15 & 16 466 return ((theChar & 0xFF) > 0xFFFD ? true : false); 467 } else { 468 return (planeNo < data->_numPlanes && data->_planes[planeNo] ? !CFUniCharIsMemberOfBitmap(theChar, data->_planes[planeNo]) : true); 469 } 470 } else if (charset == kCFUniCharControlAndFormatterCharacterSet) { 471 if (planeNo == 0x0E) { // Plane 14 472 theChar &= 0xFF; 473 return (((theChar == 0x01) || ((theChar > 0x1F) && (theChar < 0x80))) ? true : false); 474 } else { 475 return (planeNo < data->_numPlanes && data->_planes[planeNo] ? CFUniCharIsMemberOfBitmap(theChar, data->_planes[planeNo]) : false); 476 } 477 } else { 478 return (planeNo < data->_numPlanes && data->_planes[planeNo] ? CFUniCharIsMemberOfBitmap(theChar, data->_planes[planeNo]) : false); 479 } 480 } 481 return false; 482 } 483 } 484} 485 486const uint8_t *CFUniCharGetBitmapPtrForPlane(uint32_t charset, uint32_t plane) { 487 if (NULL == __CFUniCharBitmapDataArray) __CFUniCharLoadBitmapData(); 488 489 charset = __CFUniCharMapCompatibilitySetID(charset); 490 491 if ((charset > kCFUniCharWhitespaceAndNewlineCharacterSet) && (charset != kCFUniCharIllegalCharacterSet) && (charset != kCFUniCharNewlineCharacterSet)) { 492 uint32_t tableIndex = __CFUniCharMapExternalSetToInternalIndex(charset); 493 494 if (tableIndex < __CFUniCharNumberOfBitmaps) { 495 __CFUniCharBitmapData *data = __CFUniCharBitmapDataArray + tableIndex; 496 497 return (plane < data->_numPlanes ? data->_planes[plane] : NULL); 498 } 499 } 500 return NULL; 501} 502 503CF_PRIVATE uint8_t CFUniCharGetBitmapForPlane(uint32_t charset, uint32_t plane, void *bitmap, bool isInverted) { 504 const uint8_t *src = CFUniCharGetBitmapPtrForPlane(charset, plane); 505 int numBytes = (8 * 1024); 506 507 if (src) { 508 if (isInverted) { 509#if defined (__cplusplus) 510 while (numBytes-- > 0) *(((uint8_t *&)bitmap)++) = ~(*(src++)); 511#else 512 while (numBytes-- > 0) *((uint8_t *)bitmap++) = ~(*(src++)); 513#endif 514 } else { 515#if defined (__cplusplus) 516 while (numBytes-- > 0) *(((uint8_t *&)bitmap)++) = *(src++); 517#else 518 while (numBytes-- > 0) *((uint8_t *)bitmap++) = *(src++); 519#endif 520 } 521 return kCFUniCharBitmapFilled; 522 } else if (charset == kCFUniCharIllegalCharacterSet) { 523 __CFUniCharBitmapData *data = __CFUniCharBitmapDataArray + __CFUniCharMapExternalSetToInternalIndex(__CFUniCharMapCompatibilitySetID(charset)); 524 525 if (plane < data->_numPlanes && (src = data->_planes[plane])) { 526 if (isInverted) { 527#if defined (__cplusplus) 528 while (numBytes-- > 0) *(((uint8_t *&)bitmap)++) = *(src++); 529#else 530 while (numBytes-- > 0) *((uint8_t *)bitmap++) = *(src++); 531#endif 532 } else { 533#if defined (__cplusplus) 534 while (numBytes-- > 0) *(((uint8_t *&)bitmap)++) = ~(*(src++)); 535#else 536 while (numBytes-- > 0) *((uint8_t *)bitmap++) = ~(*(src++)); 537#endif 538 } 539 return kCFUniCharBitmapFilled; 540 } else if (plane == 0x0E) { // Plane 14 541 int idx; 542 uint8_t asciiRange = (isInverted ? (uint8_t)0xFF : (uint8_t)0); 543 uint8_t otherRange = (isInverted ? (uint8_t)0 : (uint8_t)0xFF); 544 545#if defined (__cplusplus) 546 *(((uint8_t *&)bitmap)++) = 0x02; // UE0001 LANGUAGE TAG 547#else 548 *((uint8_t *)bitmap++) = 0x02; // UE0001 LANGUAGE TAG 549#endif 550 for (idx = 1;idx < numBytes;idx++) { 551#if defined (__cplusplus) 552 *(((uint8_t *&)bitmap)++) = ((idx >= (0x20 / 8) && (idx < (0x80 / 8))) ? asciiRange : otherRange); 553#else 554 *((uint8_t *)bitmap++) = ((idx >= (0x20 / 8) && (idx < (0x80 / 8))) ? asciiRange : otherRange); 555#endif 556 } 557 return kCFUniCharBitmapFilled; 558 } else if (plane == 0x0F || plane == 0x10) { // Plane 15 & 16 559 uint32_t value = (isInverted ? ~0 : 0); 560 numBytes /= 4; // for 32bit 561 562 while (numBytes-- > 0) { 563 *((uint32_t *)bitmap) = value; 564#if defined (__cplusplus) 565 bitmap = (uint8_t *)bitmap + sizeof(uint32_t); 566#else 567 bitmap += sizeof(uint32_t); 568#endif 569 } 570 *(((uint8_t *)bitmap) - 5) = (isInverted ? 0x3F : 0xC0); // 0xFFFE & 0xFFFF 571 return kCFUniCharBitmapFilled; 572 } 573 return (isInverted ? kCFUniCharBitmapEmpty : kCFUniCharBitmapAll); 574 } else if ((charset < kCFUniCharDecimalDigitCharacterSet) || (charset == kCFUniCharNewlineCharacterSet)) { 575 if (plane) return (isInverted ? kCFUniCharBitmapAll : kCFUniCharBitmapEmpty); 576 577 uint8_t *bitmapBase = (uint8_t *)bitmap; 578 CFIndex idx; 579 uint8_t nonFillValue = (isInverted ? (uint8_t)0xFF : (uint8_t)0); 580 581#if defined (__cplusplus) 582 while (numBytes-- > 0) *(((uint8_t *&)bitmap)++) = nonFillValue; 583#else 584 while (numBytes-- > 0) *((uint8_t *)bitmap++) = nonFillValue; 585#endif 586 587 if ((charset == kCFUniCharWhitespaceAndNewlineCharacterSet) || (charset == kCFUniCharNewlineCharacterSet)) { 588 const UniChar newlines[] = {0x000A, 0x000B, 0x000C, 0x000D, 0x0085, 0x2028, 0x2029}; 589 590 for (idx = 0;idx < (int)(sizeof(newlines) / sizeof(*newlines)); idx++) { 591 if (isInverted) { 592 CFUniCharRemoveCharacterFromBitmap(newlines[idx], bitmapBase); 593 } else { 594 CFUniCharAddCharacterToBitmap(newlines[idx], bitmapBase); 595 } 596 } 597 598 if (charset == kCFUniCharNewlineCharacterSet) return kCFUniCharBitmapFilled; 599 } 600 601 if (isInverted) { 602 CFUniCharRemoveCharacterFromBitmap(0x0009, bitmapBase); 603 CFUniCharRemoveCharacterFromBitmap(0x0020, bitmapBase); 604 CFUniCharRemoveCharacterFromBitmap(0x00A0, bitmapBase); 605 CFUniCharRemoveCharacterFromBitmap(0x1680, bitmapBase); 606 CFUniCharRemoveCharacterFromBitmap(0x202F, bitmapBase); 607 CFUniCharRemoveCharacterFromBitmap(0x205F, bitmapBase); 608 CFUniCharRemoveCharacterFromBitmap(0x3000, bitmapBase); 609 } else { 610 CFUniCharAddCharacterToBitmap(0x0009, bitmapBase); 611 CFUniCharAddCharacterToBitmap(0x0020, bitmapBase); 612 CFUniCharAddCharacterToBitmap(0x00A0, bitmapBase); 613 CFUniCharAddCharacterToBitmap(0x1680, bitmapBase); 614 CFUniCharAddCharacterToBitmap(0x202F, bitmapBase); 615 CFUniCharAddCharacterToBitmap(0x205F, bitmapBase); 616 CFUniCharAddCharacterToBitmap(0x3000, bitmapBase); 617 } 618 619 for (idx = 0x2000;idx <= 0x200B;idx++) { 620 if (isInverted) { 621 CFUniCharRemoveCharacterFromBitmap(idx, bitmapBase); 622 } else { 623 CFUniCharAddCharacterToBitmap(idx, bitmapBase); 624 } 625 } 626 return kCFUniCharBitmapFilled; 627 } 628 return (isInverted ? kCFUniCharBitmapAll : kCFUniCharBitmapEmpty); 629} 630 631CF_PRIVATE uint32_t CFUniCharGetNumberOfPlanes(uint32_t charset) { 632 if ((charset == kCFUniCharControlCharacterSet) || (charset == kCFUniCharControlAndFormatterCharacterSet)) { 633 return 15; // 0 to 14 634 } else if (charset < kCFUniCharDecimalDigitCharacterSet) { 635 return 1; 636 } else if (charset == kCFUniCharIllegalCharacterSet) { 637 return 17; 638 } else { 639 uint32_t numPlanes; 640 641 if (NULL == __CFUniCharBitmapDataArray) __CFUniCharLoadBitmapData(); 642 643 numPlanes = __CFUniCharBitmapDataArray[__CFUniCharMapExternalSetToInternalIndex(__CFUniCharMapCompatibilitySetID(charset))]._numPlanes; 644 645 return numPlanes; 646 } 647} 648 649// Mapping data loading 650static const void **__CFUniCharMappingTables = NULL; 651 652static CFSpinLock_t __CFUniCharMappingTableLock = CFSpinLockInit; 653 654#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI || DEPLOYMENT_TARGET_LINUX 655#if __CF_BIG_ENDIAN__ 656#if USE_MACHO_SEGMENT 657#define MAPPING_TABLE_FILE "__data" 658#else 659#define MAPPING_TABLE_FILE "/CFUnicodeData-B.mapping" 660#endif 661#else 662#if USE_MACHO_SEGMENT 663#define MAPPING_TABLE_FILE "__data" 664#else 665#define MAPPING_TABLE_FILE "/CFUnicodeData-L.mapping" 666#endif 667#endif 668#elif DEPLOYMENT_TARGET_WINDOWS 669#if __CF_BIG_ENDIAN__ 670#if USE_MACHO_SEGMENT 671#define MAPPING_TABLE_FILE "__data" 672#else 673#define MAPPING_TABLE_FILE L"CFUnicodeData-B.mapping" 674#endif 675#else 676#if USE_MACHO_SEGMENT 677#define MAPPING_TABLE_FILE "__data" 678#else 679#define MAPPING_TABLE_FILE L"CFUnicodeData-L.mapping" 680#endif 681#endif 682#else 683#error Unknown or unspecified DEPLOYMENT_TARGET 684#endif 685 686CF_PRIVATE const void *CFUniCharGetMappingData(uint32_t type) { 687 688 __CFSpinLock(&__CFUniCharMappingTableLock); 689 690 if (NULL == __CFUniCharMappingTables) { 691 const void *bytes; 692 const void *bodyBase; 693 int headerSize; 694 int idx, count; 695 int64_t fileSize; 696 697 if (!__CFUniCharLoadFile(MAPPING_TABLE_FILE, &bytes, &fileSize) || !__CFSimpleFileSizeVerification(bytes, fileSize)) { 698 __CFSpinUnlock(&__CFUniCharMappingTableLock); 699 return NULL; 700 } 701 702#if defined (__cplusplus) 703 bytes = (uint8_t *)bytes + 4; // Skip Unicode version 704 headerSize = *((uint8_t *)bytes); bytes = (uint8_t *)bytes + sizeof(uint32_t); 705#else 706 bytes += 4; // Skip Unicode version 707 headerSize = *((uint32_t *)bytes); bytes += sizeof(uint32_t); 708#endif 709 headerSize -= (sizeof(uint32_t) * 2); 710 bodyBase = (char *)bytes + headerSize; 711 712 count = headerSize / sizeof(uint32_t); 713 714 __CFUniCharMappingTables = (const void **)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(const void *) * count, 0); 715 716 for (idx = 0;idx < count;idx++) { 717#if defined (__cplusplus) 718 __CFUniCharMappingTables[idx] = (char *)bodyBase + *((uint32_t *)bytes); bytes = (uint8_t *)bytes + sizeof(uint32_t); 719#else 720 __CFUniCharMappingTables[idx] = (char *)bodyBase + *((uint32_t *)bytes); bytes += sizeof(uint32_t); 721#endif 722 } 723 } 724 725 __CFSpinUnlock(&__CFUniCharMappingTableLock); 726 727 return __CFUniCharMappingTables[type]; 728} 729 730// Case mapping functions 731#define DO_SPECIAL_CASE_MAPPING 1 732 733static uint32_t *__CFUniCharCaseMappingTableCounts = NULL; 734static uint32_t **__CFUniCharCaseMappingTable = NULL; 735static const uint32_t **__CFUniCharCaseMappingExtraTable = NULL; 736 737typedef struct { 738 uint32_t _key; 739 uint32_t _value; 740} __CFUniCharCaseMappings; 741 742/* Binary searches CFStringEncodingUnicodeTo8BitCharMap */ 743static uint32_t __CFUniCharGetMappedCase(const __CFUniCharCaseMappings *theTable, uint32_t numElem, UTF32Char character) { 744 const __CFUniCharCaseMappings *p, *q, *divider; 745 746 if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key)) { 747 return 0; 748 } 749 p = theTable; 750 q = p + (numElem-1); 751 while (p <= q) { 752 divider = p + ((q - p) >> 1); /* divide by 2 */ 753 if (character < divider->_key) { q = divider - 1; } 754 else if (character > divider->_key) { p = divider + 1; } 755 else { return divider->_value; } 756 } 757 return 0; 758} 759 760#define NUM_CASE_MAP_DATA (kCFUniCharCaseFold + 1) 761 762static bool __CFUniCharLoadCaseMappingTable(void) { 763 uint32_t *countArray; 764 int idx; 765 766 if (NULL == __CFUniCharMappingTables) (void)CFUniCharGetMappingData(kCFUniCharToLowercase); 767 if (NULL == __CFUniCharMappingTables) return false; 768 769 __CFSpinLock(&__CFUniCharMappingTableLock); 770 771 if (__CFUniCharCaseMappingTableCounts) { 772 __CFSpinUnlock(&__CFUniCharMappingTableLock); 773 return true; 774 } 775 776 countArray = (uint32_t *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(uint32_t) * NUM_CASE_MAP_DATA + sizeof(uint32_t *) * NUM_CASE_MAP_DATA * 2, 0); 777 __CFUniCharCaseMappingTable = (uint32_t **)((char *)countArray + sizeof(uint32_t) * NUM_CASE_MAP_DATA); 778 __CFUniCharCaseMappingExtraTable = (const uint32_t **)__CFUniCharCaseMappingTable + NUM_CASE_MAP_DATA; 779 780 for (idx = 0;idx < NUM_CASE_MAP_DATA;idx++) { 781 countArray[idx] = *((uint32_t *)__CFUniCharMappingTables[idx]) / (sizeof(uint32_t) * 2); 782 __CFUniCharCaseMappingTable[idx] = ((uint32_t *)__CFUniCharMappingTables[idx]) + 1; 783 __CFUniCharCaseMappingExtraTable[idx] = (const uint32_t *)((char *)__CFUniCharCaseMappingTable[idx] + *((uint32_t *)__CFUniCharMappingTables[idx])); 784 } 785 786 __CFUniCharCaseMappingTableCounts = countArray; 787 788 __CFSpinUnlock(&__CFUniCharMappingTableLock); 789 return true; 790} 791 792#if __CF_BIG_ENDIAN__ 793#define TURKISH_LANG_CODE (0x7472) // tr 794#define LITHUANIAN_LANG_CODE (0x6C74) // lt 795#define AZERI_LANG_CODE (0x617A) // az 796#define DUTCH_LANG_CODE (0x6E6C) // nl 797#define GREEK_LANG_CODE (0x656C) // el 798#else 799#define TURKISH_LANG_CODE (0x7274) // tr 800#define LITHUANIAN_LANG_CODE (0x746C) // lt 801#define AZERI_LANG_CODE (0x7A61) // az 802#define DUTCH_LANG_CODE (0x6C6E) // nl 803#define GREEK_LANG_CODE (0x6C65) // el 804#endif 805 806CFIndex CFUniCharMapCaseTo(UTF32Char theChar, UTF16Char *convertedChar, CFIndex maxLength, uint32_t ctype, uint32_t flags, const uint8_t *langCode) { 807 __CFUniCharBitmapData *data; 808 uint8_t planeNo = (theChar >> 16) & 0xFF; 809 810caseFoldRetry: 811 812#if DO_SPECIAL_CASE_MAPPING 813 if (flags & kCFUniCharCaseMapFinalSigma) { 814 if (theChar == 0x03A3) { // Final sigma 815 *convertedChar = (ctype == kCFUniCharToLowercase ? 0x03C2 : 0x03A3); 816 return 1; 817 } 818 } 819 820 if (langCode) { 821 if (flags & kCFUniCharCaseMapGreekTonos) { // localized Greek uppercasing 822 if (theChar == 0x0301) { // GREEK TONOS 823 return 0; 824 } else if (theChar == 0x0344) {// COMBINING GREEK DIALYTIKA TONOS 825 *convertedChar = 0x0308; // COMBINING GREEK DIALYTIKA 826 return 1; 827 } else if (CFUniCharIsMemberOf(theChar, kCFUniCharDecomposableCharacterSet)) { 828 UTF32Char buffer[MAX_DECOMPOSED_LENGTH]; 829 CFIndex length = CFUniCharDecomposeCharacter(theChar, buffer, MAX_DECOMPOSED_LENGTH); 830 831 if (length > 1) { 832 UTF32Char *characters = buffer + 1; 833 UTF32Char *tail = buffer + length; 834 835 while (characters < tail) { 836 if (*characters == 0x0301) break; 837 ++characters; 838 } 839 840 if (characters < tail) { // found a tonos 841 CFIndex convertedLength = CFUniCharMapCaseTo(*buffer, convertedChar, maxLength, ctype, 0, langCode); 842 843 if (convertedLength == 0) { 844 *convertedChar = (UTF16Char)*buffer; 845 convertedLength = 1; 846 } 847 848 characters = buffer + 1; 849 850 while (characters < tail) { 851 if (*characters != 0x0301) { // not tonos 852 if (*characters < 0x10000) { // BMP 853 convertedChar[convertedLength] = (UTF16Char)*characters; 854 ++convertedLength; 855 } else { 856 UTF32Char character = *characters - 0x10000; 857 convertedChar[convertedLength++] = (UTF16Char)((character >> 10) + 0xD800UL); 858 convertedChar[convertedLength++] = (UTF16Char)((character & 0x3FF) + 0xDC00UL); 859 } 860 } 861 ++characters; 862 } 863 864 return convertedLength; 865 } 866 } 867 } 868 } 869 switch (*(uint16_t *)langCode) { 870 case LITHUANIAN_LANG_CODE: 871 if (theChar == 0x0307 && (flags & kCFUniCharCaseMapAfter_i)) { 872 return 0; 873 } else if (ctype == kCFUniCharToLowercase) { 874 if (flags & kCFUniCharCaseMapMoreAbove) { 875 switch (theChar) { 876 case 0x0049: // LATIN CAPITAL LETTER I 877 *(convertedChar++) = 0x0069; 878 *(convertedChar++) = 0x0307; 879 return 2; 880 881 case 0x004A: // LATIN CAPITAL LETTER J 882 *(convertedChar++) = 0x006A; 883 *(convertedChar++) = 0x0307; 884 return 2; 885 886 case 0x012E: // LATIN CAPITAL LETTER I WITH OGONEK 887 *(convertedChar++) = 0x012F; 888 *(convertedChar++) = 0x0307; 889 return 2; 890 891 default: break; 892 } 893 } 894 switch (theChar) { 895 case 0x00CC: // LATIN CAPITAL LETTER I WITH GRAVE 896 *(convertedChar++) = 0x0069; 897 *(convertedChar++) = 0x0307; 898 *(convertedChar++) = 0x0300; 899 return 3; 900 901 case 0x00CD: // LATIN CAPITAL LETTER I WITH ACUTE 902 *(convertedChar++) = 0x0069; 903 *(convertedChar++) = 0x0307; 904 *(convertedChar++) = 0x0301; 905 return 3; 906 907 case 0x0128: // LATIN CAPITAL LETTER I WITH TILDE 908 *(convertedChar++) = 0x0069; 909 *(convertedChar++) = 0x0307; 910 *(convertedChar++) = 0x0303; 911 return 3; 912 913 default: break; 914 } 915 } 916 break; 917 918 case TURKISH_LANG_CODE: 919 case AZERI_LANG_CODE: 920 if ((theChar == 0x0049) || (theChar == 0x0131)) { // LATIN CAPITAL LETTER I & LATIN SMALL LETTER DOTLESS I 921 *convertedChar = (((ctype == kCFUniCharToLowercase) || (ctype == kCFUniCharCaseFold)) ? ((kCFUniCharCaseMapMoreAbove & flags) ? 0x0069 : 0x0131) : 0x0049); 922 return 1; 923 } else if ((theChar == 0x0069) || (theChar == 0x0130)) { // LATIN SMALL LETTER I & LATIN CAPITAL LETTER I WITH DOT ABOVE 924 *convertedChar = (((ctype == kCFUniCharToLowercase) || (ctype == kCFUniCharCaseFold)) ? 0x0069 : 0x0130); 925 return 1; 926 } else if (theChar == 0x0307 && (kCFUniCharCaseMapAfter_i & flags)) { // COMBINING DOT ABOVE AFTER_i 927 if (ctype == kCFUniCharToLowercase) { 928 return 0; 929 } else { 930 *convertedChar = 0x0307; 931 return 1; 932 } 933 } 934 break; 935 936 case DUTCH_LANG_CODE: 937 if ((theChar == 0x004A) || (theChar == 0x006A)) { 938 *convertedChar = (((ctype == kCFUniCharToUppercase) || (ctype == kCFUniCharToTitlecase) || (kCFUniCharCaseMapDutchDigraph & flags)) ? 0x004A : 0x006A); 939 return 1; 940 } 941 break; 942 943 default: break; 944 } 945 } 946#endif // DO_SPECIAL_CASE_MAPPING 947 948 if (NULL == __CFUniCharBitmapDataArray) __CFUniCharLoadBitmapData(); 949 950 data = __CFUniCharBitmapDataArray + __CFUniCharMapExternalSetToInternalIndex(__CFUniCharMapCompatibilitySetID(ctype + kCFUniCharHasNonSelfLowercaseCharacterSet)); 951 952 if (planeNo < data->_numPlanes && data->_planes[planeNo] && CFUniCharIsMemberOfBitmap(theChar, data->_planes[planeNo]) && (__CFUniCharCaseMappingTableCounts || __CFUniCharLoadCaseMappingTable())) { 953 uint32_t value = __CFUniCharGetMappedCase((const __CFUniCharCaseMappings *)__CFUniCharCaseMappingTable[ctype], __CFUniCharCaseMappingTableCounts[ctype], theChar); 954 955 if (!value && ctype == kCFUniCharToTitlecase) { 956 value = __CFUniCharGetMappedCase((const __CFUniCharCaseMappings *)__CFUniCharCaseMappingTable[kCFUniCharToUppercase], __CFUniCharCaseMappingTableCounts[kCFUniCharToUppercase], theChar); 957 if (value) ctype = kCFUniCharToUppercase; 958 } 959 960 if (value) { 961 CFIndex count = CFUniCharConvertFlagToCount(value); 962 963 if (count == 1) { 964 if (value & kCFUniCharNonBmpFlag) { 965 if (maxLength > 1) { 966 value = (value & 0xFFFFFF) - 0x10000; 967 *(convertedChar++) = (UTF16Char)(value >> 10) + 0xD800UL; 968 *(convertedChar++) = (UTF16Char)(value & 0x3FF) + 0xDC00UL; 969 return 2; 970 } 971 } else { 972 *convertedChar = (UTF16Char)value; 973 return 1; 974 } 975 } else if (count < maxLength) { 976 const uint32_t *extraMapping = __CFUniCharCaseMappingExtraTable[ctype] + (value & 0xFFFFFF); 977 978 if (value & kCFUniCharNonBmpFlag) { 979 CFIndex copiedLen = 0; 980 981 while (count-- > 0) { 982 value = *(extraMapping++); 983 if (value > 0xFFFF) { 984 if (copiedLen + 2 >= maxLength) break; 985 value = (value & 0xFFFFFF) - 0x10000; 986 convertedChar[copiedLen++] = (UTF16Char)(value >> 10) + 0xD800UL; 987 convertedChar[copiedLen++] = (UTF16Char)(value & 0x3FF) + 0xDC00UL; 988 } else { 989 if (copiedLen + 1 >= maxLength) break; 990 convertedChar[copiedLen++] = value; 991 } 992 } 993 if (!count) return copiedLen; 994 } else { 995 CFIndex idx; 996 997 for (idx = 0;idx < count;idx++) *(convertedChar++) = (UTF16Char)*(extraMapping++); 998 return count; 999 } 1000 } 1001 } 1002 } else if (ctype == kCFUniCharCaseFold) { 1003 ctype = kCFUniCharToLowercase; 1004 goto caseFoldRetry; 1005 } 1006 1007 if (theChar > 0xFFFF) { // non-BMP 1008 theChar = (theChar & 0xFFFFFF) - 0x10000; 1009 *(convertedChar++) = (UTF16Char)(theChar >> 10) + 0xD800UL; 1010 *(convertedChar++) = (UTF16Char)(theChar & 0x3FF) + 0xDC00UL; 1011 return 2; 1012 } else { 1013 *convertedChar = theChar; 1014 return 1; 1015 } 1016} 1017 1018CFIndex CFUniCharMapTo(UniChar theChar, UniChar *convertedChar, CFIndex maxLength, uint16_t ctype, uint32_t flags) { 1019 if (ctype == kCFUniCharCaseFold + 1) { // kCFUniCharDecompose 1020 if (CFUniCharIsDecomposableCharacter(theChar, false)) { 1021 UTF32Char buffer[MAX_DECOMPOSED_LENGTH]; 1022 CFIndex usedLength = CFUniCharDecomposeCharacter(theChar, buffer, MAX_DECOMPOSED_LENGTH); 1023 CFIndex idx; 1024 1025 for (idx = 0;idx < usedLength;idx++) *(convertedChar++) = buffer[idx]; 1026 return usedLength; 1027 } else { 1028 *convertedChar = theChar; 1029 return 1; 1030 } 1031 } else { 1032 return CFUniCharMapCaseTo(theChar, convertedChar, maxLength, ctype, flags, NULL); 1033 } 1034} 1035 1036CF_INLINE bool __CFUniCharIsMoreAbove(UTF16Char *buffer, CFIndex length) { 1037 UTF32Char currentChar; 1038 uint32_t property; 1039 1040 while (length-- > 0) { 1041 currentChar = *(buffer)++; 1042 if (CFUniCharIsSurrogateHighCharacter(currentChar) && (length > 0) && CFUniCharIsSurrogateLowCharacter(*(buffer + 1))) { 1043 currentChar = CFUniCharGetLongCharacterForSurrogatePair(currentChar, *(buffer++)); 1044 --length; 1045 } 1046 if (!CFUniCharIsMemberOf(currentChar, kCFUniCharNonBaseCharacterSet)) break; 1047 1048 property = CFUniCharGetCombiningPropertyForCharacter(currentChar, (const uint8_t *)CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty, (currentChar >> 16) & 0xFF)); 1049 1050 if (property == 230) return true; // Above priority 1051 } 1052 return false; 1053} 1054 1055CF_INLINE bool __CFUniCharIsAfter_i(UTF16Char *buffer, CFIndex length) { 1056 UTF32Char currentChar = 0; 1057 uint32_t property; 1058 UTF32Char decomposed[MAX_DECOMPOSED_LENGTH]; 1059 CFIndex decompLength; 1060 CFIndex idx; 1061 1062 if (length < 1) return 0; 1063 1064 buffer += length; 1065 while (length-- > 1) { 1066 currentChar = *(--buffer); 1067 if (CFUniCharIsSurrogateLowCharacter(currentChar)) { 1068 if ((length > 1) && CFUniCharIsSurrogateHighCharacter(*(buffer - 1))) { 1069 currentChar = CFUniCharGetLongCharacterForSurrogatePair(*(--buffer), currentChar); 1070 --length; 1071 } else { 1072 break; 1073 } 1074 } 1075 if (!CFUniCharIsMemberOf(currentChar, kCFUniCharNonBaseCharacterSet)) break; 1076 1077 property = CFUniCharGetCombiningPropertyForCharacter(currentChar, (const uint8_t *)CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty, (currentChar >> 16) & 0xFF)); 1078 1079 if (property == 230) return false; // Above priority 1080 } 1081 if (length == 0) { 1082 currentChar = *(--buffer); 1083 } else if (CFUniCharIsSurrogateLowCharacter(currentChar) && CFUniCharIsSurrogateHighCharacter(*(--buffer))) { 1084 currentChar = CFUniCharGetLongCharacterForSurrogatePair(*buffer, currentChar); 1085 } 1086 1087 decompLength = CFUniCharDecomposeCharacter(currentChar, decomposed, MAX_DECOMPOSED_LENGTH); 1088 currentChar = *decomposed; 1089 1090 1091 for (idx = 1;idx < decompLength;idx++) { 1092 currentChar = decomposed[idx]; 1093 property = CFUniCharGetCombiningPropertyForCharacter(currentChar, (const uint8_t *)CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty, (currentChar >> 16) & 0xFF)); 1094 1095 if (property == 230) return false; // Above priority 1096 } 1097 return true; 1098} 1099 1100CF_PRIVATE uint32_t CFUniCharGetConditionalCaseMappingFlags(UTF32Char theChar, UTF16Char *buffer, CFIndex currentIndex, CFIndex length, uint32_t type, const uint8_t *langCode, uint32_t lastFlags) { 1101 if (theChar == 0x03A3) { // GREEK CAPITAL LETTER SIGMA 1102 if ((type == kCFUniCharToLowercase) && (currentIndex > 0)) { 1103 UTF16Char *start = buffer; 1104 UTF16Char *end = buffer + length; 1105 UTF32Char otherChar; 1106 1107 // First check if we're after a cased character 1108 buffer += (currentIndex - 1); 1109 while (start <= buffer) { 1110 otherChar = *(buffer--); 1111 if (CFUniCharIsSurrogateLowCharacter(otherChar) && (start <= buffer) && CFUniCharIsSurrogateHighCharacter(*buffer)) { 1112 otherChar = CFUniCharGetLongCharacterForSurrogatePair(*(buffer--), otherChar); 1113 } 1114 if (!CFUniCharIsMemberOf(otherChar, kCFUniCharCaseIgnorableCharacterSet)) { 1115 if (!CFUniCharIsMemberOf(otherChar, kCFUniCharUppercaseLetterCharacterSet) && !CFUniCharIsMemberOf(otherChar, kCFUniCharLowercaseLetterCharacterSet)) return 0; // Uppercase set contains titlecase 1116 break; 1117 } 1118 } 1119 1120 // Next check if we're before a cased character 1121 buffer = start + currentIndex + 1; 1122 while (buffer < end) { 1123 otherChar = *(buffer++); 1124 if (CFUniCharIsSurrogateHighCharacter(otherChar) && (buffer < end) && CFUniCharIsSurrogateLowCharacter(*buffer)) { 1125 otherChar = CFUniCharGetLongCharacterForSurrogatePair(otherChar, *(buffer++)); 1126 } 1127 if (!CFUniCharIsMemberOf(otherChar, kCFUniCharCaseIgnorableCharacterSet)) { 1128 if (CFUniCharIsMemberOf(otherChar, kCFUniCharUppercaseLetterCharacterSet) || CFUniCharIsMemberOf(otherChar, kCFUniCharLowercaseLetterCharacterSet)) return 0; // Uppercase set contains titlecase 1129 break; 1130 } 1131 } 1132 return kCFUniCharCaseMapFinalSigma; 1133 } 1134 } else if (langCode) { 1135 if (*((const uint16_t *)langCode) == LITHUANIAN_LANG_CODE) { 1136 if ((theChar == 0x0307) && ((kCFUniCharCaseMapAfter_i|kCFUniCharCaseMapMoreAbove) & lastFlags) == (kCFUniCharCaseMapAfter_i|kCFUniCharCaseMapMoreAbove)) { 1137 return (__CFUniCharIsAfter_i(buffer, currentIndex) ? kCFUniCharCaseMapAfter_i : 0); 1138 } else if (type == kCFUniCharToLowercase) { 1139 if ((theChar == 0x0049) || (theChar == 0x004A) || (theChar == 0x012E)) { 1140 ++currentIndex; 1141 return (__CFUniCharIsMoreAbove(buffer + currentIndex, length - currentIndex) ? kCFUniCharCaseMapMoreAbove : 0); 1142 } 1143 } else if ((theChar == 'i') || (theChar == 'j')) { 1144 ++currentIndex; 1145 return (__CFUniCharIsMoreAbove(buffer + currentIndex, length - currentIndex) ? (kCFUniCharCaseMapAfter_i|kCFUniCharCaseMapMoreAbove) : 0); 1146 } 1147 } else if ((*((const uint16_t *)langCode) == TURKISH_LANG_CODE) || (*((const uint16_t *)langCode) == AZERI_LANG_CODE)) { 1148 if (type == kCFUniCharToLowercase) { 1149 if (theChar == 0x0307) { 1150 return (kCFUniCharCaseMapMoreAbove & lastFlags ? kCFUniCharCaseMapAfter_i : 0); 1151 } else if (theChar == 0x0049) { 1152 return (((++currentIndex < length) && (buffer[currentIndex] == 0x0307)) ? kCFUniCharCaseMapMoreAbove : 0); 1153 } 1154 } 1155 } else if (*((const uint16_t *)langCode) == DUTCH_LANG_CODE) { 1156 if (kCFUniCharCaseMapDutchDigraph & lastFlags) { 1157 return (((theChar == 0x006A) || (theChar == 0x004A)) ? kCFUniCharCaseMapDutchDigraph : 0); 1158 } else { 1159 if ((type == kCFUniCharToTitlecase) && ((theChar == 0x0069) || (theChar == 0x0049))) { 1160 return (((++currentIndex < length) && ((buffer[currentIndex] == 0x006A) || (buffer[currentIndex] == 0x004A))) ? kCFUniCharCaseMapDutchDigraph : 0); 1161 } 1162 } 1163 } 1164 1165 if (kCFUniCharCaseMapGreekTonos & lastFlags) { // still searching for tonos 1166 if (CFUniCharIsMemberOf(theChar, kCFUniCharNonBaseCharacterSet)) { 1167 return kCFUniCharCaseMapGreekTonos; 1168 } 1169 } 1170 if (((theChar >= 0x0370) && (theChar < 0x0400)) || ((theChar >= 0x1F00) && (theChar < 0x2000))) { // Greek/Coptic & Greek extended ranges 1171 if ((type == kCFUniCharToUppercase) && (CFUniCharIsMemberOf(theChar, kCFUniCharLetterCharacterSet))) return kCFUniCharCaseMapGreekTonos; 1172 } 1173 } 1174 return 0; 1175} 1176 1177// Unicode property database 1178static __CFUniCharBitmapData *__CFUniCharUnicodePropertyTable = NULL; 1179static int __CFUniCharUnicodePropertyTableCount = 0; 1180 1181static CFSpinLock_t __CFUniCharPropTableLock = CFSpinLockInit; 1182 1183#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI || DEPLOYMENT_TARGET_LINUX 1184#if USE_MACHO_SEGMENT 1185#define PROP_DB_FILE "__properties" 1186#else 1187#define PROP_DB_FILE "/CFUniCharPropertyDatabase.data" 1188#endif 1189#elif DEPLOYMENT_TARGET_WINDOWS 1190#if USE_MACHO_SEGMENT 1191#define PROP_DB_FILE "__properties" 1192#else 1193#define PROP_DB_FILE L"CFUniCharPropertyDatabase.data" 1194#endif 1195#else 1196#error Unknown or unspecified DEPLOYMENT_TARGET 1197#endif 1198 1199const void *CFUniCharGetUnicodePropertyDataForPlane(uint32_t propertyType, uint32_t plane) { 1200 1201 __CFSpinLock(&__CFUniCharPropTableLock); 1202 1203 if (NULL == __CFUniCharUnicodePropertyTable) { 1204 __CFUniCharBitmapData *table; 1205 const void *bytes; 1206 const void *bodyBase; 1207 const void *planeBase; 1208 int headerSize; 1209 int idx, count; 1210 int planeIndex, planeCount; 1211 int planeSize; 1212 int64_t fileSize; 1213 1214 if (!__CFUniCharLoadFile(PROP_DB_FILE, &bytes, &fileSize) || !__CFSimpleFileSizeVerification(bytes, fileSize)) { 1215 __CFSpinUnlock(&__CFUniCharPropTableLock); 1216 return NULL; 1217 } 1218 1219#if defined (__cplusplus) 1220 bytes = (uint8_t*)bytes + 4; // Skip Unicode version 1221 headerSize = CFSwapInt32BigToHost(*((uint32_t *)bytes)); bytes = (uint8_t *)bytes + sizeof(uint32_t); 1222#else 1223 bytes += 4; // Skip Unicode version 1224 headerSize = CFSwapInt32BigToHost(*((uint32_t *)bytes)); bytes += sizeof(uint32_t); 1225#endif 1226 1227 headerSize -= (sizeof(uint32_t) * 2); 1228 bodyBase = (char *)bytes + headerSize; 1229 1230 count = headerSize / sizeof(uint32_t); 1231 __CFUniCharUnicodePropertyTableCount = count; 1232 1233 table = (__CFUniCharBitmapData *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(__CFUniCharBitmapData) * count, 0); 1234 1235 for (idx = 0;idx < count;idx++) { 1236 planeCount = *((const uint8_t *)bodyBase); 1237 planeBase = (char *)bodyBase + planeCount + (planeCount % 4 ? 4 - (planeCount % 4) : 0); 1238 table[idx]._planes = (const uint8_t **)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(const void *) * planeCount, 0); 1239 1240 for (planeIndex = 0;planeIndex < planeCount;planeIndex++) { 1241 if ((planeSize = ((const uint8_t *)bodyBase)[planeIndex + 1])) { 1242 table[idx]._planes[planeIndex] = (const uint8_t *)planeBase; 1243#if defined (__cplusplus) 1244 planeBase = (char*)planeBase + (planeSize * 256); 1245#else 1246 planeBase += (planeSize * 256); 1247#endif 1248 } else { 1249 table[idx]._planes[planeIndex] = NULL; 1250 } 1251 } 1252 1253 table[idx]._numPlanes = planeCount; 1254#if defined (__cplusplus) 1255 bodyBase = (const uint8_t *)bodyBase + (CFSwapInt32BigToHost(*(uint32_t *)bytes)); 1256 ((uint32_t *&)bytes) ++; 1257#else 1258 bodyBase += (CFSwapInt32BigToHost(*((uint32_t *)bytes++))); 1259#endif 1260 } 1261 1262 __CFUniCharUnicodePropertyTable = table; 1263 } 1264 1265 __CFSpinUnlock(&__CFUniCharPropTableLock); 1266 1267 return (plane < __CFUniCharUnicodePropertyTable[propertyType]._numPlanes ? __CFUniCharUnicodePropertyTable[propertyType]._planes[plane] : NULL); 1268} 1269 1270CF_PRIVATE uint32_t CFUniCharGetNumberOfPlanesForUnicodePropertyData(uint32_t propertyType) { 1271 (void)CFUniCharGetUnicodePropertyDataForPlane(propertyType, 0); 1272 return __CFUniCharUnicodePropertyTable[propertyType]._numPlanes; 1273} 1274 1275CF_PRIVATE uint32_t CFUniCharGetUnicodeProperty(UTF32Char character, uint32_t propertyType) { 1276 if (propertyType == kCFUniCharCombiningProperty) { 1277 return CFUniCharGetCombiningPropertyForCharacter(character, (const uint8_t *)CFUniCharGetUnicodePropertyDataForPlane(propertyType, (character >> 16) & 0xFF)); 1278 } else if (propertyType == kCFUniCharBidiProperty) { 1279 return CFUniCharGetBidiPropertyForCharacter(character, (const uint8_t *)CFUniCharGetUnicodePropertyDataForPlane(propertyType, (character >> 16) & 0xFF)); 1280 } else { 1281 return 0; 1282 } 1283} 1284 1285 1286 1287/* 1288 The UTF8 conversion in the following function is derived from ConvertUTF.c 1289*/ 1290/* 1291 * Copyright 2001 Unicode, Inc. 1292 * 1293 * Disclaimer 1294 * 1295 * This source code is provided as is by Unicode, Inc. No claims are 1296 * made as to fitness for any particular purpose. No warranties of any 1297 * kind are expressed or implied. The recipient agrees to determine 1298 * applicability of information provided. If this file has been 1299 * purchased on magnetic or optical media from Unicode, Inc., the 1300 * sole remedy for any claim will be exchange of defective media 1301 * within 90 days of receipt. 1302 * 1303 * Limitations on Rights to Redistribute This Code 1304 * 1305 * Unicode, Inc. hereby grants the right to freely use the information 1306 * supplied in this file in the creation of products supporting the 1307 * Unicode Standard, and to make copies of this file in any form 1308 * for internal or external distribution as long as this notice 1309 * remains attached. 1310 */ 1311#define UNI_REPLACEMENT_CHAR (0x0000FFFDUL) 1312 1313bool CFUniCharFillDestinationBuffer(const UTF32Char *src, CFIndex srcLength, void **dst, CFIndex dstLength, CFIndex *filledLength, uint32_t dstFormat) { 1314 UTF32Char currentChar; 1315 CFIndex usedLength = *filledLength; 1316 1317 if (dstFormat == kCFUniCharUTF16Format) { 1318 UTF16Char *dstBuffer = (UTF16Char *)*dst; 1319 1320 while (srcLength-- > 0) { 1321 currentChar = *(src++); 1322 1323 if (currentChar > 0xFFFF) { // Non-BMP 1324 usedLength += 2; 1325 if (dstLength) { 1326 if (usedLength > dstLength) return false; 1327 currentChar -= 0x10000; 1328 *(dstBuffer++) = (UTF16Char)((currentChar >> 10) + 0xD800UL); 1329 *(dstBuffer++) = (UTF16Char)((currentChar & 0x3FF) + 0xDC00UL); 1330 } 1331 } else { 1332 ++usedLength; 1333 if (dstLength) { 1334 if (usedLength > dstLength) return false; 1335 *(dstBuffer++) = (UTF16Char)currentChar; 1336 } 1337 } 1338 } 1339 1340 *dst = dstBuffer; 1341 } else if (dstFormat == kCFUniCharUTF8Format) { 1342 uint8_t *dstBuffer = (uint8_t *)*dst; 1343 uint16_t bytesToWrite = 0; 1344 const UTF32Char byteMask = 0xBF; 1345 const UTF32Char byteMark = 0x80; 1346 static const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 1347 1348 while (srcLength-- > 0) { 1349 currentChar = *(src++); 1350 1351 /* Figure out how many bytes the result will require */ 1352 if (currentChar < (UTF32Char)0x80) { 1353 bytesToWrite = 1; 1354 } else if (currentChar < (UTF32Char)0x800) { 1355 bytesToWrite = 2; 1356 } else if (currentChar < (UTF32Char)0x10000) { 1357 bytesToWrite = 3; 1358 } else if (currentChar < (UTF32Char)0x200000) { 1359 bytesToWrite = 4; 1360 } else { 1361 bytesToWrite = 2; 1362 currentChar = UNI_REPLACEMENT_CHAR; 1363 } 1364 1365 usedLength += bytesToWrite; 1366 1367 if (dstLength) { 1368 if (usedLength > dstLength) return false; 1369 1370 dstBuffer += bytesToWrite; 1371 switch (bytesToWrite) { /* note: everything falls through. */ 1372 case 4: *--dstBuffer = (currentChar | byteMark) & byteMask; currentChar >>= 6; 1373 case 3: *--dstBuffer = (currentChar | byteMark) & byteMask; currentChar >>= 6; 1374 case 2: *--dstBuffer = (currentChar | byteMark) & byteMask; currentChar >>= 6; 1375 case 1: *--dstBuffer = currentChar | firstByteMark[bytesToWrite]; 1376 } 1377 dstBuffer += bytesToWrite; 1378 } 1379 } 1380 1381 *dst = dstBuffer; 1382 } else { 1383 UTF32Char *dstBuffer = (UTF32Char *)*dst; 1384 1385 while (srcLength-- > 0) { 1386 currentChar = *(src++); 1387 1388 ++usedLength; 1389 if (dstLength) { 1390 if (usedLength > dstLength) return false; 1391 *(dstBuffer++) = currentChar; 1392 } 1393 } 1394 1395 *dst = dstBuffer; 1396 } 1397 1398 *filledLength = usedLength; 1399 1400 return true; 1401} 1402 1403#if DEPLOYMENT_TARGET_WINDOWS 1404void __CFUniCharCleanup(void) 1405{ 1406 int idx; 1407 1408 // cleanup memory allocated by __CFUniCharLoadBitmapData() 1409 __CFSpinLock(&__CFUniCharBitmapLock); 1410 1411 if (__CFUniCharBitmapDataArray != NULL) { 1412 for (idx = 0; idx < (int)__CFUniCharNumberOfBitmaps; idx++) { 1413 CFAllocatorDeallocate(kCFAllocatorSystemDefault, __CFUniCharBitmapDataArray[idx]._planes); 1414 __CFUniCharBitmapDataArray[idx]._planes = NULL; 1415 } 1416 1417 CFAllocatorDeallocate(kCFAllocatorSystemDefault, __CFUniCharBitmapDataArray); 1418 __CFUniCharBitmapDataArray = NULL; 1419 __CFUniCharNumberOfBitmaps = 0; 1420 } 1421 1422 __CFSpinUnlock(&__CFUniCharBitmapLock); 1423 1424 // cleanup memory allocated by CFUniCharGetMappingData() 1425 __CFSpinLock(&__CFUniCharMappingTableLock); 1426 1427 if (__CFUniCharMappingTables != NULL) { 1428 CFAllocatorDeallocate(kCFAllocatorSystemDefault, __CFUniCharMappingTables); 1429 __CFUniCharMappingTables = NULL; 1430 } 1431 1432 // cleanup memory allocated by __CFUniCharLoadCaseMappingTable() 1433 if (__CFUniCharCaseMappingTableCounts != NULL) { 1434 CFAllocatorDeallocate(kCFAllocatorSystemDefault, __CFUniCharCaseMappingTableCounts); 1435 __CFUniCharCaseMappingTableCounts = NULL; 1436 1437 __CFUniCharCaseMappingTable = NULL; 1438 __CFUniCharCaseMappingExtraTable = NULL; 1439 } 1440 1441 __CFSpinUnlock(&__CFUniCharMappingTableLock); 1442 1443 // cleanup memory allocated by CFUniCharGetUnicodePropertyDataForPlane() 1444 __CFSpinLock(&__CFUniCharPropTableLock); 1445 1446 if (__CFUniCharUnicodePropertyTable != NULL) { 1447 for (idx = 0; idx < __CFUniCharUnicodePropertyTableCount; idx++) { 1448 CFAllocatorDeallocate(kCFAllocatorSystemDefault, __CFUniCharUnicodePropertyTable[idx]._planes); 1449 __CFUniCharUnicodePropertyTable[idx]._planes = NULL; 1450 } 1451 1452 CFAllocatorDeallocate(kCFAllocatorSystemDefault, __CFUniCharUnicodePropertyTable); 1453 __CFUniCharUnicodePropertyTable = NULL; 1454 __CFUniCharUnicodePropertyTableCount = 0; 1455 } 1456 1457 __CFSpinUnlock(&__CFUniCharPropTableLock); 1458} 1459#endif 1460 1461#undef USE_MACHO_SEGMENT 1462 1463