1/* 2 * Copyright (c) 2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. Please obtain a copy of the License at 10 * http://www.opensource.apple.com/apsl/ and read it before using this 11 * file. 12 * 13 * The Original Code and all software distributed under the License are 14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 18 * Please see the License for the specific language governing rights and 19 * limitations under the License. 20 * 21 * @APPLE_LICENSE_HEADER_END@ 22 */ 23 24/* CFICUConverters.c 25 Copyright (c) 2004-2013, Apple Inc. All rights reserved. 26 Responsibility: Aki Inoue 27*/ 28 29#include "CFStringEncodingDatabase.h" 30#include "CFStringEncodingConverterPriv.h" 31#include "CFICUConverters.h" 32#include <CoreFoundation/CFStringEncodingExt.h> 33#include <CoreFoundation/CFUniChar.h> 34#include <unicode/ucnv.h> 35#include <unicode/uversion.h> 36#include "CFInternal.h" 37#include <stdio.h> 38 39// Thread data support 40typedef struct { 41 uint8_t _numSlots; 42 uint8_t _nextSlot; 43 UConverter **_converters; 44} __CFICUThreadData; 45 46static void __CFICUThreadDataDestructor(void *context) { 47 __CFICUThreadData * data = (__CFICUThreadData *)context; 48 49 if (NULL != data->_converters) { // scan to make sure deallocation 50 UConverter **converter = data->_converters; 51 UConverter **limit = converter + data->_numSlots; 52 53 while (converter < limit) { 54 if (NULL != converter) ucnv_close(*converter); 55 ++converter; 56 } 57 CFAllocatorDeallocate(NULL, data->_converters); 58 } 59 60 CFAllocatorDeallocate(NULL, data); 61} 62 63CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() { 64 __CFICUThreadData * data; 65 66 data = (__CFICUThreadData *)_CFGetTSD(__CFTSDKeyICUConverter); 67 68 if (NULL == data) { 69 data = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0); 70 memset(data, 0, sizeof(__CFICUThreadData)); 71 _CFSetTSD(__CFTSDKeyICUConverter, (void *)data, __CFICUThreadDataDestructor); 72 } 73 74 return data; 75} 76 77CF_PRIVATE const char *__CFStringEncodingGetICUName(CFStringEncoding encoding) { 78#define STACK_BUFFER_SIZE (60) 79 char buffer[STACK_BUFFER_SIZE]; 80 const char *result = NULL; 81 UErrorCode errorCode = U_ZERO_ERROR; 82 uint32_t codepage = 0; 83 84 if (kCFStringEncodingUTF7_IMAP == encoding) return "IMAP-mailbox-name"; 85 86 if (kCFStringEncodingUnicode != (encoding & 0x0F00)) codepage = __CFStringEncodingGetWindowsCodePage(encoding); // we don't use codepage for UTF to avoid little endian weirdness of Windows 87 88 if ((0 != codepage) && (snprintf(buffer, STACK_BUFFER_SIZE, "windows-%d", codepage) < STACK_BUFFER_SIZE) && (NULL != (result = ucnv_getAlias(buffer, 0, &errorCode)))) return result; 89 90 if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) result = ucnv_getAlias(buffer, 0, &errorCode); 91 92 return result; 93#undef STACK_BUFFER_SIZE 94} 95 96CF_PRIVATE CFStringEncoding __CFStringEncodingGetFromICUName(const char *icuName) { 97 uint32_t codepage; 98 char *endPtr; 99 UErrorCode errorCode = U_ZERO_ERROR; 100 101 if ((0 == strncasecmp_l(icuName, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(icuName + strlen("windows-"), &endPtr, 10))) && (*endPtr == '\0')) return __CFStringEncodingGetFromWindowsCodePage(codepage); 102 103 if (0 != ucnv_countAliases(icuName, &errorCode)) { 104 CFStringEncoding encoding; 105 const char *name; 106 107 // Try WINDOWS platform 108 name = ucnv_getStandardName(icuName, "WINDOWS", &errorCode); 109 110 if (NULL != name) { 111 if ((0 == strncasecmp_l(name, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(name + strlen("windows-"), &endPtr, 10))) && (*endPtr == '\0')) return __CFStringEncodingGetFromWindowsCodePage(codepage); 112 113 if (strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding; 114 } 115 116 // Try JAVA platform 117 name = ucnv_getStandardName(icuName, "JAVA", &errorCode); 118 if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding; 119 120 // Try MIME platform 121 name = ucnv_getStandardName(icuName, "MIME", &errorCode); 122 if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding; 123 } 124 125 return kCFStringEncodingInvalidId; 126} 127 128CF_INLINE UConverter *__CFStringEncodingConverterCreateICUConverter(const char *icuName, uint32_t flags, bool toUnicode) { 129 UConverter *converter; 130 UErrorCode errorCode = U_ZERO_ERROR; 131 uint8_t streamID = CFStringEncodingStreamIDFromMask(flags); 132 133 if (0 != streamID) { // this is a part of streaming previously created 134 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData(); 135 136 --streamID; // map to array index 137 138 if ((streamID < data->_numSlots) && (NULL != data->_converters[streamID])) return data->_converters[streamID]; 139 } 140 141 converter = ucnv_open(icuName, &errorCode); 142 143 if (NULL != converter) { 144 char lossyByte = CFStringEncodingMaskToLossyByte(flags); 145 146 if ((0 == lossyByte) && (0 != (flags & kCFStringEncodingAllowLossyConversion))) lossyByte = '?'; 147 148 if (0 ==lossyByte) { 149 if (toUnicode) { 150 ucnv_setToUCallBack(converter, &UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode); 151 } else { 152 ucnv_setFromUCallBack(converter, &UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode); 153 } 154 } else { 155 ucnv_setSubstChars(converter, &lossyByte, 1, &errorCode); 156 } 157 } 158 159 return converter; 160} 161 162#define ICU_CONVERTER_SLOT_INCREMENT (10) 163#define ICU_CONVERTER_MAX_SLOT (255) 164 165static CFIndex __CFStringEncodingConverterReleaseICUConverter(UConverter *converter, uint32_t flags, CFIndex status) { 166 uint8_t streamID = CFStringEncodingStreamIDFromMask(flags); 167 168 if ((kCFStringEncodingInvalidInputStream != status) && ((0 != (flags & kCFStringEncodingPartialInput)) || ((kCFStringEncodingInsufficientOutputBufferLength == status) && (0 != (flags & kCFStringEncodingPartialOutput))))) { 169 if (0 == streamID) { 170 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData(); 171 172 if (NULL == data->_converters) { 173 data->_converters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT, 0); 174 memset(data->_converters, 0, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT); 175 data->_numSlots = ICU_CONVERTER_SLOT_INCREMENT; 176 data->_nextSlot = 0; 177 } else if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { // Need to find one 178 CFIndex index; 179 180 for (index = 0;index < data->_numSlots;index++) { 181 if (NULL == data->_converters[index]) { 182 data->_nextSlot = index; 183 break; 184 } 185 } 186 187 if (index >= data->_numSlots) { // we're full 188 UConverter **newConverters; 189 CFIndex newSize = data->_numSlots + ICU_CONVERTER_SLOT_INCREMENT; 190 191 if (newSize > ICU_CONVERTER_MAX_SLOT) { // something is terribly wrong 192 CFLog(kCFLogLevelError, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring...")); 193 ucnv_close(converter); 194 return 0; 195 } 196 197 newConverters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * newSize, 0); 198 memset(newConverters, 0, sizeof(UConverter *) * newSize); 199 memcpy(newConverters, data->_converters, sizeof(UConverter *) * data->_numSlots); 200 CFAllocatorDeallocate(NULL, data->_converters); 201 data->_converters = newConverters; 202 data->_nextSlot = data->_numSlots; 203 data->_numSlots = newSize; 204 } 205 } 206 207 data->_converters[data->_nextSlot] = converter; 208 streamID = data->_nextSlot + 1; 209 210 // now find next slot 211 ++data->_nextSlot; 212 213 if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { 214 data->_nextSlot = 0; 215 216 while ((data->_nextSlot < data->_numSlots) && (NULL != data->_converters[data->_nextSlot])) ++data->_nextSlot; 217 } 218 } 219 220 return CFStringEncodingStreamIDToMask(streamID); 221 } 222 223 if (0 != streamID) { 224 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData(); 225 226 --streamID; // map to array index 227 228 if ((streamID < data->_numSlots) && (converter == data->_converters[streamID])) { 229 data->_converters[streamID] = NULL; 230 if (data->_nextSlot > streamID) data->_nextSlot = streamID; 231 } 232 } 233 234 ucnv_close(converter); 235 236 return 0; 237} 238 239#define MAX_BUFFER_SIZE (1000) 240 241#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED 242#if 0 243// we're no longer doing this check. Revive when the status in the bug changed. 244#if (U_ICU_VERSION_MAJOR_NUM > 49) 245#warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743 246#endif 247#endif 248#endif 249#define HAS_ICU_BUG_6024743 (1) 250#define HAS_ICU_BUG_6025527 (1) 251 252CF_PRIVATE CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { 253 UConverter *converter; 254 UErrorCode errorCode = U_ZERO_ERROR; 255 const UTF16Char *source = characters; 256 const UTF16Char *sourceLimit = source + numChars; 257 char *destination = (char *)bytes; 258 const char *destinationLimit = destination + maxByteLen; 259 bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false); 260 CFIndex status; 261 262 if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable; 263 264 if (0 == maxByteLen) { 265 char buffer[MAX_BUFFER_SIZE]; 266 CFIndex totalLength = 0; 267 268 while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) { 269 destination = buffer; 270 destinationLimit = destination + MAX_BUFFER_SIZE; 271 272 ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode); 273 274 totalLength += (destination - buffer); 275 276 if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR; 277 } 278 279 if (NULL != usedByteLen) *usedByteLen = totalLength; 280 } else { 281 ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode); 282 283#if HAS_ICU_BUG_6024743 284/* Another critical ICU design issue. Similar to conversion error, source pointer returned from U_BUFFER_OVERFLOW_ERROR is already beyond the last valid character position. It renders the returned value from source entirely unusable. We have to manually back up until succeeding <rdar://problem/7183045> Intrestingly, this issue doesn't apply to ucnv_toUnicode. The asynmmetric nature makes this more dangerous */ 285 if (U_BUFFER_OVERFLOW_ERROR == errorCode) { 286 const uint8_t *bitmap = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0); 287 const uint8_t *nonBase; 288 UTF32Char character; 289 290 do { 291 // Since the output buffer is filled, we can assume no invalid chars (including stray surrogates) 292 do { 293 sourceLimit = (source - 1); 294 character = *sourceLimit; 295 nonBase = bitmap; 296 297 if (CFUniCharIsSurrogateLowCharacter(character)) { 298 --sourceLimit; 299 character = CFUniCharGetLongCharacterForSurrogatePair(*sourceLimit, character); 300 nonBase = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, (character >> 16) & 0x000F); 301 character &= 0xFFFF; 302 } 303 } while ((sourceLimit > characters) && CFUniCharIsMemberOfBitmap(character, nonBase)); 304 305 if (sourceLimit > characters) { 306 source = characters; 307 destination = (char *)bytes; 308 errorCode = U_ZERO_ERROR; 309 310 ucnv_resetFromUnicode(converter); 311 312 ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode); 313 } 314 } while (U_BUFFER_OVERFLOW_ERROR == errorCode); 315 316 errorCode = U_BUFFER_OVERFLOW_ERROR; 317 } 318#endif 319 if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes; 320 } 321 322 status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream)); 323 324 if (NULL != usedCharLen) { 325#if HAS_ICU_BUG_6024743 326/* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */ 327 if (kCFStringEncodingInvalidInputStream == status) { 328#define MAX_ERROR_BUFFER_LEN (32) 329 UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN]; 330 int8_t errorLength = MAX_ERROR_BUFFER_LEN; 331#undef MAX_ERROR_BUFFER_LEN 332 333 errorCode = U_ZERO_ERROR; 334 335 ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode); 336 337 if (U_ZERO_ERROR == errorCode) { 338 source -= errorLength; 339 } else { 340 // Gah, something is terribly wrong. Reset everything 341 source = characters; // 0 length 342 if (NULL != usedByteLen) *usedByteLen = 0; 343 } 344 } 345#endif 346 *usedCharLen = source - characters; 347 } 348 349 status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status); 350 351 return status; 352} 353 354CF_PRIVATE CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { 355 UConverter *converter; 356 UErrorCode errorCode = U_ZERO_ERROR; 357 const char *source = (const char *)bytes; 358 const char *sourceLimit = source + numBytes; 359 UTF16Char *destination = characters; 360 const UTF16Char *destinationLimit = destination + maxCharLen; 361 bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false); 362 CFIndex status; 363 364 if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true))) return kCFStringEncodingConverterUnavailable; 365 366 if (0 == maxCharLen) { 367 UTF16Char buffer[MAX_BUFFER_SIZE]; 368 CFIndex totalLength = 0; 369 370 while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) { 371 destination = buffer; 372 destinationLimit = destination + MAX_BUFFER_SIZE; 373 374 ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode); 375 376 totalLength += (destination - buffer); 377 378 if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR; 379 } 380 381 if (NULL != usedCharLen) *usedCharLen = totalLength; 382 } else { 383 ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode); 384 385 if (NULL != usedCharLen) *usedCharLen = destination - characters; 386 } 387 388 status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream)); 389 390 if (NULL != usedByteLen) { 391#if HAS_ICU_BUG_6024743 392 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */ 393 if (kCFStringEncodingInvalidInputStream == status) { 394#define MAX_ERROR_BUFFER_LEN (32) 395 char errorBuffer[MAX_ERROR_BUFFER_LEN]; 396 int8_t errorLength = MAX_ERROR_BUFFER_LEN; 397#undef MAX_ERROR_BUFFER_LEN 398 399 errorCode = U_ZERO_ERROR; 400 401 ucnv_getInvalidChars(converter, errorBuffer, &errorLength, &errorCode); 402 403 if (U_ZERO_ERROR == errorCode) { 404#if HAS_ICU_BUG_6025527 405 // Another ICU oddness here. ucnv_getInvalidUChars() writes the '\0' terminator, and errorLength includes the extra byte. 406 if ((errorLength > 0) && ('\0' == errorBuffer[errorLength - 1])) --errorLength; 407#endif 408 source -= errorLength; 409 } else { 410 // Gah, something is terribly wrong. Reset everything 411 source = (const char *)bytes; // 0 length 412 if (NULL != usedCharLen) *usedCharLen = 0; 413 } 414 } 415#endif 416 417 *usedByteLen = source - (const char *)bytes; 418 } 419 420 status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status); 421 422 return status; 423} 424 425CF_PRIVATE CFIndex __CFStringEncodingICUCharLength(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) { 426 CFIndex usedCharLen; 427 return (__CFStringEncodingICUToUnicode(icuName, flags, bytes, numBytes, NULL, NULL, 0, &usedCharLen) == kCFStringEncodingConversionSuccess ? usedCharLen : 0); 428} 429 430CF_PRIVATE CFIndex __CFStringEncodingICUByteLength(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars) { 431 CFIndex usedByteLen; 432 return (__CFStringEncodingICUToBytes(icuName, flags, characters, numChars, NULL, NULL, 0, &usedByteLen) == kCFStringEncodingConversionSuccess ? usedByteLen : 0); 433} 434 435CF_PRIVATE CFStringEncoding *__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator, CFIndex *numberOfIndex) { 436 CFIndex count = ucnv_countAvailable(); 437 CFIndex numEncodings = 0; 438 CFStringEncoding *encodings; 439 CFStringEncoding encoding; 440 CFIndex index; 441 442 if (0 == count) return NULL; 443 444 encodings = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * count, 0); 445 446 for (index = 0;index < count;index++) { 447 encoding = __CFStringEncodingGetFromICUName(ucnv_getAvailableName(index)); 448 449 if (kCFStringEncodingInvalidId != encoding) encodings[numEncodings++] = encoding; 450 } 451 452 if (0 == numEncodings) { 453 CFAllocatorDeallocate(allocator, encodings); 454 encodings = NULL; 455 } 456 457 *numberOfIndex = numEncodings; 458 459 return encodings; 460} 461