1/* 2 * Copyright (c) 2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. Please obtain a copy of the License at 10 * http://www.opensource.apple.com/apsl/ and read it before using this 11 * file. 12 * 13 * The Original Code and all software distributed under the License are 14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 18 * Please see the License for the specific language governing rights and 19 * limitations under the License. 20 * 21 * @APPLE_LICENSE_HEADER_END@ 22 */ 23 24/* CFUnicodeDecomposition.c 25 Copyright (c) 1999-2013, Apple Inc. All rights reserved. 26 Responsibility: Aki Inoue 27*/ 28 29#include <string.h> 30#include <CoreFoundation/CFBase.h> 31#include <CoreFoundation/CFCharacterSet.h> 32#include <CoreFoundation/CFUniChar.h> 33#include <CoreFoundation/CFUnicodeDecomposition.h> 34#include "CFInternal.h" 35#include "CFUniCharPriv.h" 36 37// Canonical Decomposition 38static UTF32Char *__CFUniCharDecompositionTable = NULL; 39static uint32_t __CFUniCharDecompositionTableLength = 0; 40static UTF32Char *__CFUniCharMultipleDecompositionTable = NULL; 41 42static const uint8_t *__CFUniCharDecomposableBitmapForBMP = NULL; 43static const uint8_t *__CFUniCharHFSPlusDecomposableBitmapForBMP = NULL; 44 45static CFSpinLock_t __CFUniCharDecompositionTableLock = CFSpinLockInit; 46 47static const uint8_t **__CFUniCharCombiningPriorityTable = NULL; 48static uint8_t __CFUniCharCombiningPriorityTableNumPlane = 0; 49 50static void __CFUniCharLoadDecompositionTable(void) { 51 52 __CFSpinLock(&__CFUniCharDecompositionTableLock); 53 54 if (NULL == __CFUniCharDecompositionTable) { 55 const uint32_t *bytes = (uint32_t *)CFUniCharGetMappingData(kCFUniCharCanonicalDecompMapping); 56 57 if (NULL == bytes) { 58 __CFSpinUnlock(&__CFUniCharDecompositionTableLock); 59 return; 60 } 61 62 __CFUniCharDecompositionTableLength = *(bytes++); 63 __CFUniCharDecompositionTable = (UTF32Char *)bytes; 64 __CFUniCharMultipleDecompositionTable = (UTF32Char *)((intptr_t)bytes + __CFUniCharDecompositionTableLength); 65 66 __CFUniCharDecompositionTableLength /= (sizeof(uint32_t) * 2); 67 __CFUniCharDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, 0); 68 __CFUniCharHFSPlusDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharHFSPlusDecomposableCharacterSet, 0); 69 70 CFIndex idx; 71 72 __CFUniCharCombiningPriorityTableNumPlane = CFUniCharGetNumberOfPlanesForUnicodePropertyData(kCFUniCharCombiningProperty); 73 __CFUniCharCombiningPriorityTable = (const uint8_t **)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(uint8_t *) * __CFUniCharCombiningPriorityTableNumPlane, 0); 74 for (idx = 0;idx < __CFUniCharCombiningPriorityTableNumPlane;idx++) __CFUniCharCombiningPriorityTable[idx] = (const uint8_t *)CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty, idx); 75 } 76 77 __CFSpinUnlock(&__CFUniCharDecompositionTableLock); 78} 79 80static CFSpinLock_t __CFUniCharCompatibilityDecompositionTableLock = CFSpinLockInit; 81static UTF32Char *__CFUniCharCompatibilityDecompositionTable = NULL; 82static uint32_t __CFUniCharCompatibilityDecompositionTableLength = 0; 83static UTF32Char *__CFUniCharCompatibilityMultipleDecompositionTable = NULL; 84 85static void __CFUniCharLoadCompatibilityDecompositionTable(void) { 86 87 __CFSpinLock(&__CFUniCharCompatibilityDecompositionTableLock); 88 89 if (NULL == __CFUniCharCompatibilityDecompositionTable) { 90 const uint32_t *bytes = (uint32_t *)CFUniCharGetMappingData(kCFUniCharCompatibilityDecompMapping); 91 92 if (NULL == bytes) { 93 __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock); 94 return; 95 } 96 97 __CFUniCharCompatibilityDecompositionTableLength = *(bytes++); 98 __CFUniCharCompatibilityDecompositionTable = (UTF32Char *)bytes; 99 __CFUniCharCompatibilityMultipleDecompositionTable = (UTF32Char *)((intptr_t)bytes + __CFUniCharCompatibilityDecompositionTableLength); 100 101 __CFUniCharCompatibilityDecompositionTableLength /= (sizeof(uint32_t) * 2); 102 } 103 104 __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock); 105} 106 107CF_INLINE bool __CFUniCharIsDecomposableCharacterWithFlag(UTF32Char character, bool isHFSPlus) { 108 return CFUniCharIsMemberOfBitmap(character, (character < 0x10000 ? (isHFSPlus ? __CFUniCharHFSPlusDecomposableBitmapForBMP : __CFUniCharDecomposableBitmapForBMP) : CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, ((character >> 16) & 0xFF)))); 109} 110 111CF_INLINE uint8_t __CFUniCharGetCombiningPropertyForCharacter(UTF32Char character) { return CFUniCharGetCombiningPropertyForCharacter(character, (((character) >> 16) < __CFUniCharCombiningPriorityTableNumPlane ? __CFUniCharCombiningPriorityTable[(character) >> 16] : NULL)); } 112 113CF_INLINE bool __CFUniCharIsNonBaseCharacter(UTF32Char character) { return ((0 == __CFUniCharGetCombiningPropertyForCharacter(character)) ? false : true); } // the notion of non-base in normalization is characters with non-0 combining class 114 115typedef struct { 116 uint32_t _key; 117 uint32_t _value; 118} __CFUniCharDecomposeMappings; 119 120static uint32_t __CFUniCharGetMappedValue(const __CFUniCharDecomposeMappings *theTable, uint32_t numElem, UTF32Char character) { 121 const __CFUniCharDecomposeMappings *p, *q, *divider; 122 123 if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key)) { 124 return 0; 125 } 126 p = theTable; 127 q = p + (numElem-1); 128 while (p <= q) { 129 divider = p + ((q - p) >> 1); /* divide by 2 */ 130 if (character < divider->_key) { q = divider - 1; } 131 else if (character > divider->_key) { p = divider + 1; } 132 else { return divider->_value; } 133 } 134 return 0; 135} 136 137static void __CFUniCharPrioritySort(UTF32Char *characters, CFIndex length) { 138 UTF32Char *end = characters + length; 139 140 while ((characters < end) && (0 == __CFUniCharGetCombiningPropertyForCharacter(*characters))) ++characters; 141 142 if ((end - characters) > 1) { 143 uint32_t p1, p2; 144 UTF32Char *ch1, *ch2; 145 bool changes = true; 146 147 do { 148 changes = false; 149 ch1 = characters; ch2 = characters + 1; 150 p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch1); 151 while (ch2 < end) { 152 p1 = p2; p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch2); 153 if (p1 > p2) { 154 UTF32Char tmp = *ch1; *ch1 = *ch2; *ch2 = tmp; 155 changes = true; 156 } 157 ++ch1; ++ch2; 158 } 159 } while (changes); 160 } 161} 162 163static CFIndex __CFUniCharRecursivelyDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, CFIndex maxBufferLength) { 164 uint32_t value = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharDecompositionTable, __CFUniCharDecompositionTableLength, character); 165 CFIndex length = CFUniCharConvertFlagToCount(value); 166 UTF32Char firstChar = value & 0xFFFFFF; 167 UTF32Char *mappings = (length > 1 ? __CFUniCharMultipleDecompositionTable + firstChar : &firstChar); 168 CFIndex usedLength = 0; 169 170 if (maxBufferLength < length) return 0; 171 172 if (value & kCFUniCharRecursiveDecompositionFlag) { 173 usedLength = __CFUniCharRecursivelyDecomposeCharacter(*mappings, convertedChars, maxBufferLength - length); 174 175 --length; // Decrement for the first char 176 if (!usedLength || usedLength + length > maxBufferLength) return 0; 177 ++mappings; 178 convertedChars += usedLength; 179 } 180 181 usedLength += length; 182 183 while (length--) *(convertedChars++) = *(mappings++); 184 185 return usedLength; 186} 187 188#define HANGUL_SBASE 0xAC00 189#define HANGUL_LBASE 0x1100 190#define HANGUL_VBASE 0x1161 191#define HANGUL_TBASE 0x11A7 192#define HANGUL_SCOUNT 11172 193#define HANGUL_LCOUNT 19 194#define HANGUL_VCOUNT 21 195#define HANGUL_TCOUNT 28 196#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT) 197 198CFIndex CFUniCharDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, CFIndex maxBufferLength) { 199 if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable(); 200 if (character >= HANGUL_SBASE && character <= (HANGUL_SBASE + HANGUL_SCOUNT)) { 201 CFIndex length; 202 203 character -= HANGUL_SBASE; 204 205 length = (character % HANGUL_TCOUNT ? 3 : 2); 206 207 if (maxBufferLength < length) return 0; 208 209 *(convertedChars++) = character / HANGUL_NCOUNT + HANGUL_LBASE; 210 *(convertedChars++) = (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE; 211 if (length > 2) *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE; 212 return length; 213 } else { 214 return __CFUniCharRecursivelyDecomposeCharacter(character, convertedChars, maxBufferLength); 215 } 216} 217 218CF_INLINE bool __CFProcessReorderBuffer(UTF32Char *buffer, CFIndex length, void **dst, CFIndex dstLength, CFIndex *filledLength, uint32_t dstFormat) { 219 if (length > 1) __CFUniCharPrioritySort(buffer, length); 220 return CFUniCharFillDestinationBuffer(buffer, length, dst, dstLength, filledLength, dstFormat); 221} 222 223#define MAX_BUFFER_LENGTH (32) 224bool CFUniCharDecompose(const UTF16Char *src, CFIndex length, CFIndex *consumedLength, void *dst, CFIndex maxLength, CFIndex *filledLength, bool needToReorder, uint32_t dstFormat, bool isHFSPlus) { 225 CFIndex usedLength = 0; 226 CFIndex originalLength = length; 227 UTF32Char buffer[MAX_BUFFER_LENGTH]; 228 UTF32Char *decompBuffer = buffer; 229 CFIndex decompBufferSize = MAX_BUFFER_LENGTH; 230 CFIndex decompBufferLen = 0; 231 CFIndex segmentLength = 0; 232 UTF32Char currentChar; 233 234 if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable(); 235 236 while ((length - segmentLength) > 0) { 237 currentChar = *(src++); 238 239 if (currentChar < 0x80) { 240 if (decompBufferLen > 0) { 241 if (!__CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break; 242 length -= segmentLength; 243 segmentLength = 0; 244 decompBufferLen = 0; 245 } 246 247 if (maxLength > 0) { 248 if (usedLength >= maxLength) break; 249 switch (dstFormat) { 250 case kCFUniCharUTF8Format: *(uint8_t *)dst = currentChar; dst = (uint8_t *)dst + sizeof(uint8_t); break; 251 case kCFUniCharUTF16Format: *(UTF16Char *)dst = currentChar; dst = (uint8_t *)dst + sizeof(UTF16Char); break; 252 case kCFUniCharUTF32Format: *(UTF32Char *)dst = currentChar; dst = (uint8_t *)dst + sizeof(UTF32Char); break; 253 } 254 } 255 256 --length; 257 ++usedLength; 258 } else { 259 if (CFUniCharIsSurrogateLowCharacter(currentChar)) { // Stray surrogagte 260 if (dstFormat != kCFUniCharUTF16Format) break; 261 } else if (CFUniCharIsSurrogateHighCharacter(currentChar)) { 262 if (((length - segmentLength) > 1) && CFUniCharIsSurrogateLowCharacter(*src)) { 263 currentChar = CFUniCharGetLongCharacterForSurrogatePair(currentChar, *(src++)); 264 } else { 265 if (dstFormat != kCFUniCharUTF16Format) break; 266 } 267 } 268 269 if (needToReorder && __CFUniCharIsNonBaseCharacter(currentChar)) { 270 if ((decompBufferLen + 1) >= decompBufferSize) { 271 UTF32Char *newBuffer; 272 273 decompBufferSize += MAX_BUFFER_LENGTH; 274 newBuffer = (UTF32Char *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(UTF32Char) * decompBufferSize, 0); 275 memmove(newBuffer, decompBuffer, (decompBufferSize - MAX_BUFFER_LENGTH) * sizeof(UTF32Char)); 276 if (decompBuffer != buffer) CFAllocatorDeallocate(kCFAllocatorSystemDefault, decompBuffer); 277 decompBuffer = newBuffer; 278 } 279 280 if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus)) { // Vietnamese accent, etc. 281 decompBufferLen += CFUniCharDecomposeCharacter(currentChar, decompBuffer + decompBufferLen, decompBufferSize - decompBufferLen); 282 } else { 283 decompBuffer[decompBufferLen++] = currentChar; 284 } 285 } else { 286 if (decompBufferLen > 0) { 287 if (!__CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break; 288 length -= segmentLength; 289 segmentLength = 0; 290 } 291 292 if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus)) { 293 decompBufferLen = CFUniCharDecomposeCharacter(currentChar, decompBuffer, MAX_BUFFER_LENGTH); 294 } else { 295 decompBufferLen = 1; 296 *decompBuffer = currentChar; 297 } 298 299 if (!needToReorder || (decompBufferLen == 1)) { 300 if (!CFUniCharFillDestinationBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break; 301 length -= ((currentChar > 0xFFFF) ? 2 : 1); 302 decompBufferLen = 0; 303 continue; 304 } 305 } 306 307 segmentLength += ((currentChar > 0xFFFF) ? 2 : 1); 308 } 309 } 310 311 if ((decompBufferLen > 0) && __CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) length -= segmentLength; 312 313 if (decompBuffer != buffer) CFAllocatorDeallocate(kCFAllocatorSystemDefault, decompBuffer); 314 315 if (consumedLength) *consumedLength = originalLength - length; 316 if (filledLength) *filledLength = usedLength; 317 318 return ((length > 0) ? false : true); 319} 320 321#define MAX_COMP_DECOMP_LEN (32) 322 323static CFIndex __CFUniCharRecursivelyCompatibilityDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars) { 324 uint32_t value = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharCompatibilityDecompositionTable, __CFUniCharCompatibilityDecompositionTableLength, character); 325 CFIndex length = CFUniCharConvertFlagToCount(value); 326 UTF32Char firstChar = value & 0xFFFFFF; 327 const UTF32Char *mappings = (length > 1 ? __CFUniCharCompatibilityMultipleDecompositionTable + firstChar : &firstChar); 328 CFIndex usedLength = length; 329 UTF32Char currentChar; 330 CFIndex currentLength; 331 332 while (length-- > 0) { 333 currentChar = *(mappings++); 334 if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, false)) { 335 currentLength = __CFUniCharRecursivelyDecomposeCharacter(currentChar, convertedChars, MAX_COMP_DECOMP_LEN - length); 336 convertedChars += currentLength; 337 usedLength += (currentLength - 1); 338 } else if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) { 339 currentLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, convertedChars); 340 convertedChars += currentLength; 341 usedLength += (currentLength - 1); 342 } else { 343 *(convertedChars++) = currentChar; 344 } 345 } 346 347 return usedLength; 348} 349 350CF_INLINE void __CFUniCharMoveBufferFromEnd1(UTF32Char *convertedChars, CFIndex length, CFIndex delta) { 351 const UTF32Char *limit = convertedChars; 352 UTF32Char *dstP; 353 354 convertedChars += length; 355 dstP = convertedChars + delta; 356 357 while (convertedChars > limit) *(--dstP) = *(--convertedChars); 358} 359 360CF_PRIVATE CFIndex CFUniCharCompatibilityDecompose(UTF32Char *convertedChars, CFIndex length, CFIndex maxBufferLength) { 361 UTF32Char currentChar; 362 UTF32Char buffer[MAX_COMP_DECOMP_LEN]; 363 const UTF32Char *bufferP; 364 const UTF32Char *limit = convertedChars + length; 365 CFIndex filledLength; 366 367 if (NULL == __CFUniCharCompatibilityDecompositionTable) __CFUniCharLoadCompatibilityDecompositionTable(); 368 369 while (convertedChars < limit) { 370 currentChar = *convertedChars; 371 372 if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) { 373 filledLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, buffer); 374 375 if (filledLength + length - 1 > maxBufferLength) return 0; 376 377 if (filledLength > 1) __CFUniCharMoveBufferFromEnd1(convertedChars + 1, limit - convertedChars - 1, filledLength - 1); 378 379 bufferP = buffer; 380 length += (filledLength - 1); 381 while (filledLength-- > 0) *(convertedChars++) = *(bufferP++); 382 } else { 383 ++convertedChars; 384 } 385 } 386 387 return length; 388} 389 390CF_EXPORT void CFUniCharPrioritySort(UTF32Char *characters, CFIndex length) { 391 __CFUniCharPrioritySort(characters, length); 392} 393 394#undef MAX_BUFFER_LENGTH 395#undef MAX_COMP_DECOMP_LEN 396#undef HANGUL_SBASE 397#undef HANGUL_LBASE 398#undef HANGUL_VBASE 399#undef HANGUL_TBASE 400#undef HANGUL_SCOUNT 401#undef HANGUL_LCOUNT 402#undef HANGUL_VCOUNT 403#undef HANGUL_TCOUNT 404#undef HANGUL_NCOUNT 405 406