1/*
2 * Copyright (c) 2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24/*	CFUnicodePrecomposition.c
25	Copyright (c) 1999-2013, Apple Inc. All rights reserved.
26	Responsibility: Aki Inoue
27*/
28
29#include <string.h>
30#include <CoreFoundation/CFBase.h>
31#include <CoreFoundation/CFCharacterSet.h>
32#include "CFUniChar.h"
33#include "CFUnicodePrecomposition.h"
34#include "CFInternal.h"
35#include "CFUniCharPriv.h"
36
// Canonical Precomposition
// The pointers below are filled in lazily by __CFUniCharLoadPrecompositionTable() and
// point into the data returned by CFUniCharGetMappingData(kCFUniCharCanonicalPrecompMapping).

// Start of the source table; NULL until the tables are loaded, so it doubles as the "loaded" flag checked by callers
static UTF32Char *__CFUniCharPrecompSourceTable = NULL;
// Number of (key, value) entries in the source table; read from the first 32-bit word of the mapping data
static uint32_t __CFUniCharPrecompositionTableLength = 0;
// 16-bit destination table; located right after the source table
static uint16_t *__CFUniCharBMPPrecompDestinationTable = NULL;
// 32-bit destination table; located after the 16-bit one (its byte length is the second 32-bit word of the mapping data)
static uint32_t *__CFUniCharNonBMPPrecompDestinationTable = NULL;

// Plane 0 bitmap of kCFUniCharNonBaseCharacterSet
static const uint8_t *__CFUniCharNonBaseBitmapForBMP_P = NULL; // Adding _P so the symbol name is different from the one in CFUnicodeDecomposition.c
// Plane 0 data for kCFUniCharCombiningProperty
static const uint8_t *__CFUniCharCombiningClassForBMP = NULL;

// Held by __CFUniCharLoadPrecompositionTable() while it fills in the variables above
static CFSpinLock_t __CFUniCharPrecompositionTableLock = CFSpinLockInit;
47
48static void __CFUniCharLoadPrecompositionTable(void) {
49
50    __CFSpinLock(&__CFUniCharPrecompositionTableLock);
51
52    if (NULL == __CFUniCharPrecompSourceTable) {
53        const uint32_t *bytes = (const uint32_t *)CFUniCharGetMappingData(kCFUniCharCanonicalPrecompMapping);
54        uint32_t bmpMappingLength;
55
56        if (NULL == bytes) {
57            __CFSpinUnlock(&__CFUniCharPrecompositionTableLock);
58            return;
59        }
60
61        __CFUniCharPrecompositionTableLength = *(bytes++);
62        bmpMappingLength = *(bytes++);
63        __CFUniCharPrecompSourceTable = (UTF32Char *)bytes;
64        __CFUniCharBMPPrecompDestinationTable = (uint16_t *)((intptr_t)bytes + (__CFUniCharPrecompositionTableLength * sizeof(UTF32Char) * 2));
65        __CFUniCharNonBMPPrecompDestinationTable = (uint32_t *)(((intptr_t)__CFUniCharBMPPrecompDestinationTable) + bmpMappingLength);
66
67        __CFUniCharNonBaseBitmapForBMP_P = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0);
68        __CFUniCharCombiningClassForBMP = (const uint8_t *)CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty, 0);
69    }
70
71    __CFSpinUnlock(&__CFUniCharPrecompositionTableLock);
72}
73
74 // Adding _P so the symbol name is different from the one in CFUnicodeDecomposition.c
75#define __CFUniCharIsNonBaseCharacter	__CFUniCharIsNonBaseCharacter_P
76CF_INLINE bool __CFUniCharIsNonBaseCharacter(UTF32Char character) {
77    return CFUniCharIsMemberOfBitmap(character, (character < 0x10000 ? __CFUniCharNonBaseBitmapForBMP_P : CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, ((character >> 16) & 0xFF))));
78}
79
80typedef struct {
81    UTF16Char _key;
82    UTF16Char _value;
83} __CFUniCharPrecomposeBMPMappings;
84
85static UTF16Char __CFUniCharGetMappedBMPValue(const __CFUniCharPrecomposeBMPMappings *theTable, uint32_t numElem, UTF16Char character) {
86    const __CFUniCharPrecomposeBMPMappings *p, *q, *divider;
87
88    if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key)) {
89        return 0;
90    }
91    p = theTable;
92    q = p + (numElem-1);
93    while (p <= q) {
94        divider = p + ((q - p) >> 1);	/* divide by 2 */
95        if (character < divider->_key) { q = divider - 1; }
96        else if (character > divider->_key) { p = divider + 1; }
97        else { return divider->_value; }
98    }
99    return 0;
100}
101
102typedef struct {
103    UTF32Char _key;
104    uint32_t _value;
105} __CFUniCharPrecomposeMappings;
106
107static uint32_t __CFUniCharGetMappedValue_P(const __CFUniCharPrecomposeMappings *theTable, uint32_t numElem, UTF32Char character) {
108    const __CFUniCharPrecomposeMappings *p, *q, *divider;
109
110    if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key)) {
111        return 0;
112    }
113    p = theTable;
114    q = p + (numElem-1);
115    while (p <= q) {
116        divider = p + ((q - p) >> 1);	/* divide by 2 */
117        if (character < divider->_key) { q = divider - 1; }
118        else if (character > divider->_key) { p = divider + 1; }
119        else { return divider->_value; }
120    }
121    return 0;
122}
123
124CF_PRIVATE
125UTF32Char CFUniCharPrecomposeCharacter(UTF32Char base, UTF32Char combining) {
126    uint32_t value;
127
128    if (NULL == __CFUniCharPrecompSourceTable) __CFUniCharLoadPrecompositionTable();
129
130    if (!(value = __CFUniCharGetMappedValue_P((const __CFUniCharPrecomposeMappings *)__CFUniCharPrecompSourceTable, __CFUniCharPrecompositionTableLength, combining))) return 0xFFFD;
131
132    // We don't have precomposition in non-BMP
133    if (value & kCFUniCharNonBmpFlag) {
134        value = __CFUniCharGetMappedValue_P((const __CFUniCharPrecomposeMappings *)((uint32_t *)__CFUniCharNonBMPPrecompDestinationTable + (value & 0xFFFF)), (value >> 16) & 0x7FFF, base);
135    } else {
136        value = __CFUniCharGetMappedBMPValue((const __CFUniCharPrecomposeBMPMappings *)((uint32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)), (value >> 16), base);
137    }
138    return (value ? value : 0xFFFD);
139}
140
// Constants used by CFUniCharPrecompose() to combine Hangul Jamo into a single syllable.
// The composed value is computed as
//   (lIndex * HANGUL_VCOUNT + vIndex) * HANGUL_TCOUNT + tIndex + HANGUL_SBASE
// where each index is a code unit minus the matching *BASE value.
// HANGUL_SCOUNT and HANGUL_NCOUNT are not referenced between here and the #undef at the end of the file.
#define HANGUL_SBASE 0xAC00
#define HANGUL_LBASE 0x1100
#define HANGUL_VBASE 0x1161
#define HANGUL_TBASE 0x11A7
#define HANGUL_SCOUNT 11172
#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
150
151CF_INLINE void __CFUniCharMoveBufferFromEnd0(UTF16Char *convertedChars, CFIndex length, CFIndex delta) {
152    const UTF16Char *limit = convertedChars;
153    UTF16Char *dstP;
154
155    convertedChars += length;
156    dstP = convertedChars + delta;
157
158    while (convertedChars > limit) *(--dstP) = *(--convertedChars);
159}
160
161bool CFUniCharPrecompose(const UTF16Char *characters, CFIndex length, CFIndex *consumedLength, UTF16Char *precomposed, CFIndex maxLength, CFIndex *filledLength) {
162    UTF32Char currentChar = 0, lastChar = 0, precomposedChar = 0xFFFD;
163    CFIndex originalLength = length, usedLength = 0;
164    UTF16Char *currentBase = precomposed;
165    uint8_t currentClass, lastClass = 0;
166    bool currentBaseIsBMP = true;
167    bool isPrecomposed;
168
169    if (NULL == __CFUniCharPrecompSourceTable) __CFUniCharLoadPrecompositionTable();
170
171    while (length > 0) {
172        currentChar = *(characters++);
173        --length;
174
175        if (CFUniCharIsSurrogateHighCharacter(currentChar) && (length > 0) && CFUniCharIsSurrogateLowCharacter(*characters)) {
176            currentChar = CFUniCharGetLongCharacterForSurrogatePair(currentChar, *(characters++));
177            --length;
178        }
179
180        if (lastChar && __CFUniCharIsNonBaseCharacter(currentChar)) {
181            isPrecomposed = (precomposedChar == 0xFFFD ? false : true);
182            if (isPrecomposed) lastChar = precomposedChar;
183
184            currentClass = (currentChar > 0xFFFF ? CFUniCharGetUnicodeProperty(currentChar, kCFUniCharCombiningProperty) : CFUniCharGetCombiningPropertyForCharacter(currentChar, __CFUniCharCombiningClassForBMP));
185
186            if ((lastClass == 0) || (currentClass > lastClass)) {
187                if ((precomposedChar = CFUniCharPrecomposeCharacter(lastChar, currentChar)) == 0xFFFD) {
188                    if (isPrecomposed) precomposedChar = lastChar;
189                    lastClass = currentClass;
190                } else {
191                    continue;
192                }
193            }
194            if (currentChar > 0xFFFF) { // Non-BMP
195                usedLength += 2;
196                if (usedLength > maxLength) break;
197                currentChar -= 0x10000;
198                *(precomposed++) = (UTF16Char)((currentChar >> 10) + 0xD800UL);
199                *(precomposed++) = (UTF16Char)((currentChar & 0x3FF) + 0xDC00UL);
200            } else {
201                ++usedLength;
202                if (usedLength > maxLength) break;
203                *(precomposed++) = (UTF16Char)currentChar;
204            }
205        } else {
206            if ((currentChar >= HANGUL_LBASE) && (currentChar < (HANGUL_LBASE + 0xFF))) { // Hangul Jamo
207                int8_t lIndex = currentChar - HANGUL_LBASE;
208
209                if ((length > 0) && (0 <= lIndex) && (lIndex <= HANGUL_LCOUNT)) {
210                    int16_t vIndex = *characters - HANGUL_VBASE;
211
212                    if ((vIndex >= 0) && (vIndex <= HANGUL_VCOUNT)) {
213                        int16_t tIndex = 0;
214
215                        ++characters; --length;
216
217                        if (length > 0) {
218                            tIndex = *characters - HANGUL_TBASE;
219                            if ((tIndex < 0) || (tIndex > HANGUL_TCOUNT)) {
220                                tIndex = 0;
221                            } else {
222                                ++characters; --length;
223                            }
224                        }
225                        currentChar = (lIndex * HANGUL_VCOUNT + vIndex) * HANGUL_TCOUNT + tIndex + HANGUL_SBASE;
226                    }
227                }
228            }
229
230            if (precomposedChar != 0xFFFD) {
231                if (currentBaseIsBMP) { // Non-BMP
232                    if (lastChar > 0xFFFF) { // Last char was Non-BMP
233                        --usedLength;
234                        memmove(currentBase + 1, currentBase + 2, (precomposed - (currentBase + 2)) * sizeof(UTF16Char));
235                    }
236                    *(currentBase) = (UTF16Char)precomposedChar;
237                } else {
238                    if (lastChar < 0x10000) { // Last char was BMP
239                        ++usedLength;
240                        if (usedLength > maxLength) break;
241                        __CFUniCharMoveBufferFromEnd0(currentBase + 1, precomposed - (currentBase + 1), 1);
242                    }
243                    precomposedChar -= 0x10000;
244                    *currentBase = (UTF16Char)((precomposedChar >> 10) + 0xD800UL);
245                    *(currentBase + 1) = (UTF16Char)((precomposedChar & 0x3FF) + 0xDC00UL);
246                }
247                precomposedChar = 0xFFFD;
248            }
249            currentBase = precomposed;
250
251            lastChar = currentChar;
252            lastClass = 0;
253
254            if (currentChar > 0xFFFF) { // Non-BMP
255                usedLength += 2;
256                if (usedLength > maxLength) break;
257                currentChar -= 0x10000;
258                *(precomposed++) = (UTF16Char)((currentChar >> 10) + 0xD800UL);
259                *(precomposed++) = (UTF16Char)((currentChar & 0x3FF) + 0xDC00UL);
260                currentBaseIsBMP = false;
261            } else {
262                ++usedLength;
263                if (usedLength > maxLength) break;
264                *(precomposed++) = (UTF16Char)currentChar;
265                currentBaseIsBMP = true;
266            }
267        }
268    }
269
270    if (precomposedChar != 0xFFFD) {
271        if (currentChar > 0xFFFF) { // Non-BMP
272            if (lastChar < 0x10000) { // Last char was BMP
273                ++usedLength;
274                if (usedLength > maxLength) {
275                    if (consumedLength) *consumedLength = originalLength - length;
276                    if (filledLength) *filledLength = usedLength;
277                    return false;
278                }
279                __CFUniCharMoveBufferFromEnd0(currentBase + 1, precomposed - (currentBase + 1), 1);
280            }
281            precomposedChar -= 0x10000;
282            *currentBase = (UTF16Char)((precomposedChar >> 10) + 0xD800UL);
283            *(currentBase + 1) = (UTF16Char)((precomposedChar & 0x3FF) + 0xDC00UL);
284        } else {
285            if (lastChar > 0xFFFF) { // Last char was Non-BMP
286                --usedLength;
287                memmove(currentBase + 1, currentBase + 2, (precomposed - (currentBase + 2)) * sizeof(UTF16Char));
288            }
289            *(currentBase) = (UTF16Char)precomposedChar;
290        }
291    }
292
293    if (consumedLength) *consumedLength = originalLength - length;
294    if (filledLength) *filledLength = usedLength;
295
296    return true;
297}
298
// Remove the helper macros defined earlier in this file
#undef __CFUniCharIsNonBaseCharacter
#undef HANGUL_SBASE
#undef HANGUL_LBASE
#undef HANGUL_VBASE
#undef HANGUL_TBASE
#undef HANGUL_SCOUNT
#undef HANGUL_LCOUNT
#undef HANGUL_VCOUNT
#undef HANGUL_TCOUNT
#undef HANGUL_NCOUNT
309
310