1/*
2 * Copyright (c) 2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24/*    CFUnicodeDecomposition.c
25    Copyright (c) 1999-2013, Apple Inc. All rights reserved.
26    Responsibility: Aki Inoue
27*/
28
29#include <string.h>
30#include <CoreFoundation/CFBase.h>
31#include <CoreFoundation/CFCharacterSet.h>
32#include <CoreFoundation/CFUniChar.h>
33#include <CoreFoundation/CFUnicodeDecomposition.h>
34#include "CFInternal.h"
35#include "CFUniCharPriv.h"
36
37// Canonical Decomposition
38static UTF32Char *__CFUniCharDecompositionTable = NULL;
39static uint32_t __CFUniCharDecompositionTableLength = 0;
40static UTF32Char *__CFUniCharMultipleDecompositionTable = NULL;
41
42static const uint8_t *__CFUniCharDecomposableBitmapForBMP = NULL;
43static const uint8_t *__CFUniCharHFSPlusDecomposableBitmapForBMP = NULL;
44
45static CFSpinLock_t __CFUniCharDecompositionTableLock = CFSpinLockInit;
46
47static const uint8_t **__CFUniCharCombiningPriorityTable = NULL;
48static uint8_t __CFUniCharCombiningPriorityTableNumPlane = 0;
49
50static void __CFUniCharLoadDecompositionTable(void) {
51
52    __CFSpinLock(&__CFUniCharDecompositionTableLock);
53
54    if (NULL == __CFUniCharDecompositionTable) {
55        const uint32_t *bytes = (uint32_t *)CFUniCharGetMappingData(kCFUniCharCanonicalDecompMapping);
56
57        if (NULL == bytes) {
58            __CFSpinUnlock(&__CFUniCharDecompositionTableLock);
59            return;
60        }
61
62        __CFUniCharDecompositionTableLength = *(bytes++);
63        __CFUniCharDecompositionTable = (UTF32Char *)bytes;
64        __CFUniCharMultipleDecompositionTable = (UTF32Char *)((intptr_t)bytes + __CFUniCharDecompositionTableLength);
65
66        __CFUniCharDecompositionTableLength /= (sizeof(uint32_t) * 2);
67        __CFUniCharDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, 0);
68        __CFUniCharHFSPlusDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharHFSPlusDecomposableCharacterSet, 0);
69
70        CFIndex idx;
71
72        __CFUniCharCombiningPriorityTableNumPlane = CFUniCharGetNumberOfPlanesForUnicodePropertyData(kCFUniCharCombiningProperty);
73        __CFUniCharCombiningPriorityTable = (const uint8_t **)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(uint8_t *) * __CFUniCharCombiningPriorityTableNumPlane, 0);
74        for (idx = 0;idx < __CFUniCharCombiningPriorityTableNumPlane;idx++) __CFUniCharCombiningPriorityTable[idx] = (const uint8_t *)CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty, idx);
75    }
76
77    __CFSpinUnlock(&__CFUniCharDecompositionTableLock);
78}
79
80static CFSpinLock_t __CFUniCharCompatibilityDecompositionTableLock = CFSpinLockInit;
81static UTF32Char *__CFUniCharCompatibilityDecompositionTable = NULL;
82static uint32_t __CFUniCharCompatibilityDecompositionTableLength = 0;
83static UTF32Char *__CFUniCharCompatibilityMultipleDecompositionTable = NULL;
84
85static void __CFUniCharLoadCompatibilityDecompositionTable(void) {
86
87    __CFSpinLock(&__CFUniCharCompatibilityDecompositionTableLock);
88
89    if (NULL == __CFUniCharCompatibilityDecompositionTable) {
90        const uint32_t *bytes = (uint32_t *)CFUniCharGetMappingData(kCFUniCharCompatibilityDecompMapping);
91
92        if (NULL == bytes) {
93            __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock);
94            return;
95        }
96
97        __CFUniCharCompatibilityDecompositionTableLength = *(bytes++);
98        __CFUniCharCompatibilityDecompositionTable = (UTF32Char *)bytes;
99        __CFUniCharCompatibilityMultipleDecompositionTable = (UTF32Char *)((intptr_t)bytes + __CFUniCharCompatibilityDecompositionTableLength);
100
101        __CFUniCharCompatibilityDecompositionTableLength /= (sizeof(uint32_t) * 2);
102    }
103
104    __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock);
105}
106
107CF_INLINE bool __CFUniCharIsDecomposableCharacterWithFlag(UTF32Char character, bool isHFSPlus) {
108    return CFUniCharIsMemberOfBitmap(character, (character < 0x10000 ? (isHFSPlus ? __CFUniCharHFSPlusDecomposableBitmapForBMP : __CFUniCharDecomposableBitmapForBMP) : CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, ((character >> 16) & 0xFF))));
109}
110
111CF_INLINE uint8_t __CFUniCharGetCombiningPropertyForCharacter(UTF32Char character) { return CFUniCharGetCombiningPropertyForCharacter(character, (((character) >> 16) < __CFUniCharCombiningPriorityTableNumPlane ? __CFUniCharCombiningPriorityTable[(character) >> 16] : NULL)); }
112
113CF_INLINE bool __CFUniCharIsNonBaseCharacter(UTF32Char character) { return ((0 == __CFUniCharGetCombiningPropertyForCharacter(character)) ? false : true); } // the notion of non-base in normalization is characters with non-0 combining class
114
115typedef struct {
116    uint32_t _key;
117    uint32_t _value;
118} __CFUniCharDecomposeMappings;
119
120static uint32_t __CFUniCharGetMappedValue(const __CFUniCharDecomposeMappings *theTable, uint32_t numElem, UTF32Char character) {
121    const __CFUniCharDecomposeMappings *p, *q, *divider;
122
123    if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key)) {
124        return 0;
125    }
126    p = theTable;
127    q = p + (numElem-1);
128    while (p <= q) {
129        divider = p + ((q - p) >> 1);    /* divide by 2 */
130        if (character < divider->_key) { q = divider - 1; }
131        else if (character > divider->_key) { p = divider + 1; }
132        else { return divider->_value; }
133    }
134    return 0;
135}
136
137static void __CFUniCharPrioritySort(UTF32Char *characters, CFIndex length) {
138    UTF32Char *end = characters + length;
139
140    while ((characters < end) && (0 == __CFUniCharGetCombiningPropertyForCharacter(*characters))) ++characters;
141
142    if ((end - characters) > 1) {
143        uint32_t p1, p2;
144        UTF32Char *ch1, *ch2;
145        bool changes = true;
146
147        do {
148            changes = false;
149            ch1 = characters; ch2 = characters + 1;
150            p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch1);
151            while (ch2 < end) {
152                p1 = p2; p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch2);
153                if (p1 > p2) {
154                    UTF32Char tmp = *ch1; *ch1 = *ch2; *ch2 = tmp;
155                    changes = true;
156                }
157                ++ch1; ++ch2;
158            }
159        } while (changes);
160    }
161}
162
163static CFIndex __CFUniCharRecursivelyDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, CFIndex maxBufferLength) {
164    uint32_t value = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharDecompositionTable, __CFUniCharDecompositionTableLength, character);
165    CFIndex length = CFUniCharConvertFlagToCount(value);
166    UTF32Char firstChar = value & 0xFFFFFF;
167    UTF32Char *mappings = (length > 1 ? __CFUniCharMultipleDecompositionTable + firstChar : &firstChar);
168    CFIndex usedLength = 0;
169
170    if (maxBufferLength < length) return 0;
171
172    if (value & kCFUniCharRecursiveDecompositionFlag) {
173        usedLength = __CFUniCharRecursivelyDecomposeCharacter(*mappings, convertedChars, maxBufferLength - length);
174
175        --length; // Decrement for the first char
176        if (!usedLength || usedLength + length > maxBufferLength) return 0;
177        ++mappings;
178        convertedChars += usedLength;
179    }
180
181    usedLength += length;
182
183    while (length--) *(convertedChars++) = *(mappings++);
184
185    return usedLength;
186}
187
188#define HANGUL_SBASE 0xAC00
189#define HANGUL_LBASE 0x1100
190#define HANGUL_VBASE 0x1161
191#define HANGUL_TBASE 0x11A7
192#define HANGUL_SCOUNT 11172
193#define HANGUL_LCOUNT 19
194#define HANGUL_VCOUNT 21
195#define HANGUL_TCOUNT 28
196#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
197
198CFIndex CFUniCharDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, CFIndex maxBufferLength) {
199    if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable();
200    if (character >= HANGUL_SBASE && character <= (HANGUL_SBASE + HANGUL_SCOUNT)) {
201        CFIndex length;
202
203        character -= HANGUL_SBASE;
204
205        length = (character % HANGUL_TCOUNT ? 3 : 2);
206
207        if (maxBufferLength < length) return 0;
208
209        *(convertedChars++) = character / HANGUL_NCOUNT + HANGUL_LBASE;
210        *(convertedChars++) = (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
211        if (length > 2) *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
212        return length;
213    } else {
214        return __CFUniCharRecursivelyDecomposeCharacter(character, convertedChars, maxBufferLength);
215    }
216}
217
218CF_INLINE bool __CFProcessReorderBuffer(UTF32Char *buffer, CFIndex length, void **dst, CFIndex dstLength, CFIndex *filledLength, uint32_t dstFormat) {
219    if (length > 1) __CFUniCharPrioritySort(buffer, length);
220    return CFUniCharFillDestinationBuffer(buffer, length, dst, dstLength, filledLength, dstFormat);
221}
222
223#define MAX_BUFFER_LENGTH (32)
224bool CFUniCharDecompose(const UTF16Char *src, CFIndex length, CFIndex *consumedLength, void *dst, CFIndex maxLength, CFIndex *filledLength, bool needToReorder, uint32_t dstFormat, bool isHFSPlus) {
225    CFIndex usedLength = 0;
226    CFIndex originalLength = length;
227    UTF32Char buffer[MAX_BUFFER_LENGTH];
228    UTF32Char *decompBuffer = buffer;
229    CFIndex decompBufferSize = MAX_BUFFER_LENGTH;
230    CFIndex decompBufferLen = 0;
231    CFIndex segmentLength = 0;
232    UTF32Char currentChar;
233
234    if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable();
235
236    while ((length - segmentLength) > 0) {
237        currentChar = *(src++);
238
239        if (currentChar < 0x80) {
240            if (decompBufferLen > 0) {
241                if (!__CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break;
242                length -= segmentLength;
243                segmentLength = 0;
244                decompBufferLen = 0;
245            }
246
247            if (maxLength > 0) {
248                if (usedLength >= maxLength) break;
249                switch (dstFormat) {
250                case kCFUniCharUTF8Format: *(uint8_t *)dst = currentChar; dst = (uint8_t *)dst + sizeof(uint8_t); break;
251                case kCFUniCharUTF16Format: *(UTF16Char *)dst = currentChar; dst = (uint8_t *)dst + sizeof(UTF16Char); break;
252                case kCFUniCharUTF32Format: *(UTF32Char *)dst = currentChar; dst = (uint8_t *)dst + sizeof(UTF32Char); break;
253                }
254            }
255
256            --length;
257            ++usedLength;
258        } else {
259            if (CFUniCharIsSurrogateLowCharacter(currentChar)) { // Stray surrogagte
260                if (dstFormat != kCFUniCharUTF16Format) break;
261            } else if (CFUniCharIsSurrogateHighCharacter(currentChar)) {
262                if (((length - segmentLength) > 1) && CFUniCharIsSurrogateLowCharacter(*src)) {
263                    currentChar = CFUniCharGetLongCharacterForSurrogatePair(currentChar, *(src++));
264                } else {
265                    if (dstFormat != kCFUniCharUTF16Format) break;
266                }
267            }
268
269            if (needToReorder && __CFUniCharIsNonBaseCharacter(currentChar)) {
270                if ((decompBufferLen + 1) >= decompBufferSize) {
271                    UTF32Char *newBuffer;
272
273                    decompBufferSize += MAX_BUFFER_LENGTH;
274                    newBuffer = (UTF32Char *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(UTF32Char) * decompBufferSize, 0);
275                    memmove(newBuffer, decompBuffer, (decompBufferSize - MAX_BUFFER_LENGTH) * sizeof(UTF32Char));
276                    if (decompBuffer != buffer) CFAllocatorDeallocate(kCFAllocatorSystemDefault, decompBuffer);
277                    decompBuffer = newBuffer;
278                }
279
280                if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus)) { // Vietnamese accent, etc.
281                    decompBufferLen += CFUniCharDecomposeCharacter(currentChar, decompBuffer + decompBufferLen, decompBufferSize - decompBufferLen);
282                } else {
283                    decompBuffer[decompBufferLen++] = currentChar;
284                }
285            } else {
286                if (decompBufferLen > 0) {
287                    if (!__CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break;
288                    length -= segmentLength;
289                    segmentLength = 0;
290                }
291
292                if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus)) {
293                    decompBufferLen = CFUniCharDecomposeCharacter(currentChar, decompBuffer, MAX_BUFFER_LENGTH);
294                } else {
295                    decompBufferLen = 1;
296                    *decompBuffer = currentChar;
297                }
298
299                if (!needToReorder || (decompBufferLen == 1)) {
300                    if (!CFUniCharFillDestinationBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break;
301                    length -= ((currentChar > 0xFFFF) ? 2 : 1);
302                    decompBufferLen = 0;
303                    continue;
304                }
305            }
306
307            segmentLength += ((currentChar > 0xFFFF) ? 2 : 1);
308        }
309    }
310
311    if ((decompBufferLen > 0) && __CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) length -= segmentLength;
312
313    if (decompBuffer != buffer) CFAllocatorDeallocate(kCFAllocatorSystemDefault, decompBuffer);
314
315    if (consumedLength) *consumedLength = originalLength - length;
316    if (filledLength) *filledLength = usedLength;
317
318    return ((length > 0) ? false : true);
319}
320
321#define MAX_COMP_DECOMP_LEN (32)
322
323static CFIndex __CFUniCharRecursivelyCompatibilityDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars) {
324    uint32_t value = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharCompatibilityDecompositionTable, __CFUniCharCompatibilityDecompositionTableLength, character);
325    CFIndex length = CFUniCharConvertFlagToCount(value);
326    UTF32Char firstChar = value & 0xFFFFFF;
327    const UTF32Char *mappings = (length > 1 ? __CFUniCharCompatibilityMultipleDecompositionTable + firstChar : &firstChar);
328    CFIndex usedLength = length;
329    UTF32Char currentChar;
330    CFIndex currentLength;
331
332    while (length-- > 0) {
333        currentChar = *(mappings++);
334        if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, false)) {
335            currentLength = __CFUniCharRecursivelyDecomposeCharacter(currentChar, convertedChars, MAX_COMP_DECOMP_LEN - length);
336            convertedChars += currentLength;
337            usedLength += (currentLength - 1);
338        } else if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) {
339            currentLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, convertedChars);
340            convertedChars += currentLength;
341            usedLength += (currentLength - 1);
342        } else {
343            *(convertedChars++) = currentChar;
344        }
345    }
346
347    return usedLength;
348}
349
350CF_INLINE void __CFUniCharMoveBufferFromEnd1(UTF32Char *convertedChars, CFIndex length, CFIndex delta) {
351    const UTF32Char *limit = convertedChars;
352    UTF32Char *dstP;
353
354    convertedChars += length;
355    dstP = convertedChars + delta;
356
357    while (convertedChars > limit) *(--dstP) = *(--convertedChars);
358}
359
360CF_PRIVATE CFIndex CFUniCharCompatibilityDecompose(UTF32Char *convertedChars, CFIndex length, CFIndex maxBufferLength) {
361    UTF32Char currentChar;
362    UTF32Char buffer[MAX_COMP_DECOMP_LEN];
363    const UTF32Char *bufferP;
364    const UTF32Char *limit = convertedChars + length;
365    CFIndex filledLength;
366
367    if (NULL == __CFUniCharCompatibilityDecompositionTable) __CFUniCharLoadCompatibilityDecompositionTable();
368
369    while (convertedChars < limit) {
370        currentChar = *convertedChars;
371
372        if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) {
373            filledLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, buffer);
374
375            if (filledLength + length - 1 > maxBufferLength) return 0;
376
377            if (filledLength > 1) __CFUniCharMoveBufferFromEnd1(convertedChars + 1, limit - convertedChars - 1, filledLength - 1);
378
379            bufferP = buffer;
380            length += (filledLength - 1);
381            while (filledLength-- > 0) *(convertedChars++) = *(bufferP++);
382        } else {
383            ++convertedChars;
384        }
385    }
386
387    return length;
388}
389
390CF_EXPORT void CFUniCharPrioritySort(UTF32Char *characters, CFIndex length) {
391    __CFUniCharPrioritySort(characters, length);
392}
393
394#undef MAX_BUFFER_LENGTH
395#undef MAX_COMP_DECOMP_LEN
396#undef HANGUL_SBASE
397#undef HANGUL_LBASE
398#undef HANGUL_VBASE
399#undef HANGUL_TBASE
400#undef HANGUL_SCOUNT
401#undef HANGUL_LCOUNT
402#undef HANGUL_VCOUNT
403#undef HANGUL_TCOUNT
404#undef HANGUL_NCOUNT
405
406