1/*
2 * Copyright (c) 2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24/*	CFICUConverters.c
25	Copyright (c) 2004-2013, Apple Inc. All rights reserved.
26	Responsibility: Aki Inoue
27*/
28
29#include "CFStringEncodingDatabase.h"
30#include "CFStringEncodingConverterPriv.h"
31#include "CFICUConverters.h"
32#include <CoreFoundation/CFStringEncodingExt.h>
33#include <CoreFoundation/CFUniChar.h>
34#include <unicode/ucnv.h>
35#include <unicode/uversion.h>
36#include "CFInternal.h"
37#include <stdio.h>
38
39// Thread data support
40typedef struct {
41    uint8_t _numSlots;
42    uint8_t _nextSlot;
43    UConverter **_converters;
44} __CFICUThreadData;
45
46static void __CFICUThreadDataDestructor(void *context) {
47    __CFICUThreadData * data = (__CFICUThreadData *)context;
48
49    if (NULL != data->_converters) { // scan to make sure deallocation
50        UConverter **converter = data->_converters;
51        UConverter **limit = converter + data->_numSlots;
52
53        while (converter < limit) {
54            if (NULL != converter) ucnv_close(*converter);
55            ++converter;
56        }
57        CFAllocatorDeallocate(NULL, data->_converters);
58    }
59
60    CFAllocatorDeallocate(NULL, data);
61}
62
63CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
64    __CFICUThreadData * data;
65
66    data = (__CFICUThreadData *)_CFGetTSD(__CFTSDKeyICUConverter);
67
68    if (NULL == data) {
69        data = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
70        memset(data, 0, sizeof(__CFICUThreadData));
71        _CFSetTSD(__CFTSDKeyICUConverter, (void *)data, __CFICUThreadDataDestructor);
72    }
73
74    return data;
75}
76
77CF_PRIVATE const char *__CFStringEncodingGetICUName(CFStringEncoding encoding) {
78#define STACK_BUFFER_SIZE (60)
79    char buffer[STACK_BUFFER_SIZE];
80    const char *result = NULL;
81    UErrorCode errorCode = U_ZERO_ERROR;
82    uint32_t codepage = 0;
83
84    if (kCFStringEncodingUTF7_IMAP == encoding) return "IMAP-mailbox-name";
85
86    if (kCFStringEncodingUnicode != (encoding & 0x0F00)) codepage = __CFStringEncodingGetWindowsCodePage(encoding); // we don't use codepage for UTF to avoid little endian weirdness of Windows
87
88    if ((0 != codepage) && (snprintf(buffer, STACK_BUFFER_SIZE, "windows-%d", codepage) < STACK_BUFFER_SIZE) && (NULL != (result = ucnv_getAlias(buffer, 0, &errorCode)))) return result;
89
90    if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) result = ucnv_getAlias(buffer, 0, &errorCode);
91
92    return result;
93#undef STACK_BUFFER_SIZE
94}
95
96CF_PRIVATE CFStringEncoding __CFStringEncodingGetFromICUName(const char *icuName) {
97    uint32_t codepage;
98    char *endPtr;
99    UErrorCode errorCode = U_ZERO_ERROR;
100
101    if ((0 == strncasecmp_l(icuName, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(icuName + strlen("windows-"), &endPtr, 10))) && (*endPtr == '\0')) return __CFStringEncodingGetFromWindowsCodePage(codepage);
102
103    if (0 != ucnv_countAliases(icuName, &errorCode)) {
104        CFStringEncoding encoding;
105        const char *name;
106
107        // Try WINDOWS platform
108        name = ucnv_getStandardName(icuName, "WINDOWS", &errorCode);
109
110        if (NULL != name) {
111            if ((0 == strncasecmp_l(name, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(name + strlen("windows-"), &endPtr, 10))) && (*endPtr == '\0')) return __CFStringEncodingGetFromWindowsCodePage(codepage);
112
113            if (strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
114        }
115
116        // Try JAVA platform
117        name = ucnv_getStandardName(icuName, "JAVA", &errorCode);
118        if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
119
120        // Try MIME platform
121        name = ucnv_getStandardName(icuName, "MIME", &errorCode);
122        if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
123    }
124
125    return kCFStringEncodingInvalidId;
126}
127
128CF_INLINE UConverter *__CFStringEncodingConverterCreateICUConverter(const char *icuName, uint32_t flags, bool toUnicode) {
129    UConverter *converter;
130    UErrorCode errorCode = U_ZERO_ERROR;
131    uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
132
133    if (0 != streamID) { // this is a part of streaming previously created
134        __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
135
136        --streamID; // map to array index
137
138        if ((streamID < data->_numSlots) && (NULL != data->_converters[streamID])) return data->_converters[streamID];
139    }
140
141    converter = ucnv_open(icuName, &errorCode);
142
143    if (NULL != converter) {
144        char lossyByte = CFStringEncodingMaskToLossyByte(flags);
145
146        if ((0 == lossyByte) && (0 != (flags & kCFStringEncodingAllowLossyConversion))) lossyByte = '?';
147
148        if (0 ==lossyByte) {
149            if (toUnicode) {
150                ucnv_setToUCallBack(converter, &UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
151            } else {
152                ucnv_setFromUCallBack(converter, &UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
153            }
154        } else {
155            ucnv_setSubstChars(converter, &lossyByte, 1, &errorCode);
156        }
157    }
158
159    return converter;
160}
161
162#define ICU_CONVERTER_SLOT_INCREMENT (10)
163#define ICU_CONVERTER_MAX_SLOT (255)
164
165static CFIndex __CFStringEncodingConverterReleaseICUConverter(UConverter *converter, uint32_t flags, CFIndex status) {
166    uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
167
168    if ((kCFStringEncodingInvalidInputStream != status) && ((0 != (flags & kCFStringEncodingPartialInput)) || ((kCFStringEncodingInsufficientOutputBufferLength == status) && (0 != (flags & kCFStringEncodingPartialOutput))))) {
169        if (0 == streamID) {
170            __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
171
172            if (NULL == data->_converters) {
173                data->_converters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT, 0);
174                memset(data->_converters, 0, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT);
175                data->_numSlots = ICU_CONVERTER_SLOT_INCREMENT;
176                data->_nextSlot = 0;
177            } else if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { // Need to find one
178                CFIndex index;
179
180                for (index = 0;index < data->_numSlots;index++) {
181                    if (NULL == data->_converters[index]) {
182                        data->_nextSlot = index;
183                        break;
184                    }
185                }
186
187                if (index >= data->_numSlots) { // we're full
188                    UConverter **newConverters;
189                    CFIndex newSize = data->_numSlots + ICU_CONVERTER_SLOT_INCREMENT;
190
191                    if (newSize > ICU_CONVERTER_MAX_SLOT) { // something is terribly wrong
192                        CFLog(kCFLogLevelError, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring..."));
193                        ucnv_close(converter);
194                        return 0;
195                    }
196
197                    newConverters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * newSize, 0);
198                    memset(newConverters, 0, sizeof(UConverter *) * newSize);
199                    memcpy(newConverters, data->_converters, sizeof(UConverter *) * data->_numSlots);
200                    CFAllocatorDeallocate(NULL, data->_converters);
201                    data->_converters = newConverters;
202                    data->_nextSlot = data->_numSlots;
203                    data->_numSlots = newSize;
204                }
205            }
206
207            data->_converters[data->_nextSlot] = converter;
208            streamID = data->_nextSlot + 1;
209
210            // now find next slot
211            ++data->_nextSlot;
212
213            if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) {
214                data->_nextSlot = 0;
215
216                while ((data->_nextSlot < data->_numSlots) && (NULL != data->_converters[data->_nextSlot])) ++data->_nextSlot;
217            }
218        }
219
220        return CFStringEncodingStreamIDToMask(streamID);
221    }
222
223    if (0 != streamID) {
224        __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
225
226        --streamID; // map to array index
227
228        if ((streamID < data->_numSlots) && (converter == data->_converters[streamID])) {
229            data->_converters[streamID] = NULL;
230            if (data->_nextSlot > streamID) data->_nextSlot = streamID;
231        }
232    }
233
234    ucnv_close(converter);
235
236    return 0;
237}
238
239#define MAX_BUFFER_SIZE (1000)
240
241#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
242#if 0
243// we're no longer doing this check. Revive when the status in the bug changed.
244#if (U_ICU_VERSION_MAJOR_NUM > 49)
245#warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743
246#endif
247#endif
248#endif
249#define HAS_ICU_BUG_6024743 (1)
250#define HAS_ICU_BUG_6025527 (1)
251
252CF_PRIVATE CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
253    UConverter *converter;
254    UErrorCode errorCode = U_ZERO_ERROR;
255    const UTF16Char *source = characters;
256    const UTF16Char *sourceLimit = source + numChars;
257    char *destination = (char *)bytes;
258    const char *destinationLimit = destination + maxByteLen;
259    bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
260    CFIndex status;
261
262    if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable;
263
264    if (0 == maxByteLen) {
265        char buffer[MAX_BUFFER_SIZE];
266        CFIndex totalLength = 0;
267
268        while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
269            destination = buffer;
270            destinationLimit = destination + MAX_BUFFER_SIZE;
271
272            ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
273
274            totalLength += (destination - buffer);
275
276            if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
277        }
278
279        if (NULL != usedByteLen) *usedByteLen = totalLength;
280    } else {
281        ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
282
283#if HAS_ICU_BUG_6024743
284/* Another critical ICU design issue. Similar to conversion error, source pointer returned from U_BUFFER_OVERFLOW_ERROR is already beyond the last valid character position. It renders the returned value from source entirely unusable. We have to manually back up until succeeding <rdar://problem/7183045> Intrestingly, this issue doesn't apply to ucnv_toUnicode. The asynmmetric nature makes this more dangerous */
285        if (U_BUFFER_OVERFLOW_ERROR == errorCode) {
286            const uint8_t *bitmap = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0);
287            const uint8_t *nonBase;
288            UTF32Char character;
289
290            do {
291                // Since the output buffer is filled, we can assume no invalid chars (including stray surrogates)
292                do {
293                    sourceLimit = (source - 1);
294                    character = *sourceLimit;
295                    nonBase = bitmap;
296
297                    if (CFUniCharIsSurrogateLowCharacter(character)) {
298                        --sourceLimit;
299                        character = CFUniCharGetLongCharacterForSurrogatePair(*sourceLimit, character);
300                        nonBase = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, (character >> 16) & 0x000F);
301                        character &= 0xFFFF;
302                    }
303                } while ((sourceLimit > characters) && CFUniCharIsMemberOfBitmap(character, nonBase));
304
305                if (sourceLimit > characters) {
306                    source = characters;
307                    destination = (char *)bytes;
308                    errorCode = U_ZERO_ERROR;
309
310                    ucnv_resetFromUnicode(converter);
311
312                    ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
313                }
314            } while (U_BUFFER_OVERFLOW_ERROR == errorCode);
315
316            errorCode = U_BUFFER_OVERFLOW_ERROR;
317        }
318#endif
319        if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes;
320    }
321
322    status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
323
324    if (NULL != usedCharLen) {
325#if HAS_ICU_BUG_6024743
326/* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
327	if (kCFStringEncodingInvalidInputStream == status) {
328#define MAX_ERROR_BUFFER_LEN (32)
329	    UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN];
330	    int8_t errorLength = MAX_ERROR_BUFFER_LEN;
331#undef MAX_ERROR_BUFFER_LEN
332
333	    errorCode = U_ZERO_ERROR;
334
335	    ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode);
336
337	    if (U_ZERO_ERROR == errorCode) {
338		source -= errorLength;
339	    } else {
340		// Gah, something is terribly wrong. Reset everything
341		source = characters; // 0 length
342		if (NULL != usedByteLen) *usedByteLen = 0;
343	    }
344	}
345#endif
346	*usedCharLen = source - characters;
347    }
348
349    status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
350
351    return status;
352}
353
354CF_PRIVATE CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
355    UConverter *converter;
356    UErrorCode errorCode = U_ZERO_ERROR;
357    const char *source = (const char *)bytes;
358    const char *sourceLimit = source + numBytes;
359    UTF16Char *destination = characters;
360    const UTF16Char *destinationLimit = destination + maxCharLen;
361    bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
362    CFIndex status;
363
364    if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true))) return kCFStringEncodingConverterUnavailable;
365
366    if (0 == maxCharLen) {
367        UTF16Char buffer[MAX_BUFFER_SIZE];
368        CFIndex totalLength = 0;
369
370        while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
371            destination = buffer;
372            destinationLimit = destination + MAX_BUFFER_SIZE;
373
374            ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
375
376            totalLength += (destination - buffer);
377
378            if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
379        }
380
381        if (NULL != usedCharLen) *usedCharLen = totalLength;
382    } else {
383        ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
384
385        if (NULL != usedCharLen) *usedCharLen = destination - characters;
386    }
387
388    status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
389
390    if (NULL != usedByteLen) {
391#if HAS_ICU_BUG_6024743
392	/* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
393	if (kCFStringEncodingInvalidInputStream == status) {
394#define MAX_ERROR_BUFFER_LEN (32)
395	    char errorBuffer[MAX_ERROR_BUFFER_LEN];
396	    int8_t errorLength = MAX_ERROR_BUFFER_LEN;
397#undef MAX_ERROR_BUFFER_LEN
398
399	    errorCode = U_ZERO_ERROR;
400
401	    ucnv_getInvalidChars(converter, errorBuffer, &errorLength, &errorCode);
402
403	    if (U_ZERO_ERROR == errorCode) {
404#if HAS_ICU_BUG_6025527
405                // Another ICU oddness here. ucnv_getInvalidUChars() writes the '\0' terminator, and errorLength includes the extra byte.
406                if ((errorLength > 0) && ('\0' == errorBuffer[errorLength - 1])) --errorLength;
407#endif
408		source -= errorLength;
409	    } else {
410		// Gah, something is terribly wrong. Reset everything
411		source = (const char *)bytes; // 0 length
412		if (NULL != usedCharLen) *usedCharLen = 0;
413	    }
414	}
415#endif
416
417	*usedByteLen = source - (const char *)bytes;
418    }
419
420    status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
421
422    return status;
423}
424
425CF_PRIVATE CFIndex __CFStringEncodingICUCharLength(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
426    CFIndex usedCharLen;
427    return (__CFStringEncodingICUToUnicode(icuName, flags, bytes, numBytes, NULL, NULL, 0, &usedCharLen) == kCFStringEncodingConversionSuccess ? usedCharLen : 0);
428}
429
430CF_PRIVATE CFIndex __CFStringEncodingICUByteLength(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars) {
431    CFIndex usedByteLen;
432    return (__CFStringEncodingICUToBytes(icuName, flags, characters, numChars, NULL, NULL, 0, &usedByteLen) == kCFStringEncodingConversionSuccess ? usedByteLen : 0);
433}
434
435CF_PRIVATE CFStringEncoding *__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator, CFIndex *numberOfIndex) {
436    CFIndex count = ucnv_countAvailable();
437    CFIndex numEncodings = 0;
438    CFStringEncoding *encodings;
439    CFStringEncoding encoding;
440    CFIndex index;
441
442    if (0 == count) return NULL;
443
444    encodings = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * count, 0);
445
446    for (index = 0;index < count;index++) {
447        encoding = __CFStringEncodingGetFromICUName(ucnv_getAvailableName(index));
448
449        if (kCFStringEncodingInvalidId != encoding) encodings[numEncodings++] = encoding;
450    }
451
452    if (0 == numEncodings) {
453        CFAllocatorDeallocate(allocator, encodings);
454        encodings = NULL;
455    }
456
457    *numberOfIndex = numEncodings;
458
459    return encodings;
460}
461