1/* 2 * Copyright (c) 2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. Please obtain a copy of the License at 10 * http://www.opensource.apple.com/apsl/ and read it before using this 11 * file. 12 * 13 * The Original Code and all software distributed under the License are 14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 18 * Please see the License for the specific language governing rights and 19 * limitations under the License. 20 * 21 * @APPLE_LICENSE_HEADER_END@ 22 */ 23 24/* CFStringEncodingConverter.c 25 Copyright (c) 1998-2013, Apple Inc. All rights reserved. 26 Responsibility: Aki Inoue 27*/ 28 29#include "CFInternal.h" 30#include <CoreFoundation/CFArray.h> 31#include <CoreFoundation/CFDictionary.h> 32#include "CFICUConverters.h" 33#include <CoreFoundation/CFUniChar.h> 34#include <CoreFoundation/CFPriv.h> 35#include "CFUnicodeDecomposition.h" 36#include "CFStringEncodingConverterExt.h" 37#include "CFStringEncodingConverterPriv.h" 38#include <stdlib.h> 39 40typedef CFIndex (*_CFToBytesProc)(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen); 41typedef CFIndex (*_CFToUnicodeProc)(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen); 42 43typedef struct { 44 const CFStringEncodingConverter *definition; 45 _CFToBytesProc toBytes; 46 _CFToUnicodeProc toUnicode; 47 _CFToUnicodeProc toCanonicalUnicode; 48 CFStringEncodingToBytesFallbackProc toBytesFallback; 49 CFStringEncodingToUnicodeFallbackProc toUnicodeFallback; 50} _CFEncodingConverter; 51 52/* Macros 53*/ 54#define TO_BYTE(conv,flags,chars,numChars,bytes,max,used) (conv->toBytes ? conv->toBytes(conv,flags,chars,numChars,bytes,max,used) : ((CFStringEncodingToBytesProc)conv->definition->toBytes)(flags,chars,numChars,bytes,max,used)) 55#define TO_UNICODE(conv,flags,bytes,numBytes,chars,max,used) (conv->toUnicode ? (flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical) ? conv->toCanonicalUnicode(conv,flags,bytes,numBytes,chars,max,used) : conv->toUnicode(conv,flags,bytes,numBytes,chars,max,used)) : ((CFStringEncodingToUnicodeProc)conv->definition->toUnicode)(flags,bytes,numBytes,chars,max,used)) 56 57#define ASCIINewLine 0x0a 58#define kSurrogateHighStart 0xD800 59#define kSurrogateHighEnd 0xDBFF 60#define kSurrogateLowStart 0xDC00 61#define kSurrogateLowEnd 0xDFFF 62 63static const uint8_t __CFMaximumConvertedLength = 20; 64 65/* Mapping 128..255 to lossy ASCII 66*/ 67static const struct { 68 unsigned char chars[4]; 69} _toLossyASCIITable[] = { 70 {{' ', 0, 0, 0}}, // NO-BREAK SPACE 71 {{'!', 0, 0, 0}}, // INVERTED EXCLAMATION MARK 72 {{'c', 0, 0, 0}}, // CENT SIGN 73 {{'L', 0, 0, 0}}, // POUND SIGN 74 {{'$', 0, 0, 0}}, // CURRENCY SIGN 75 {{'Y', 0, 0, 0}}, // YEN SIGN 76 {{'|', 0, 0, 0}}, // BROKEN BAR 77 {{0, 0, 0, 0}}, // SECTION SIGN 78 {{0, 0, 0, 0}}, // DIAERESIS 79 {{'(', 'C', ')', 0}}, // COPYRIGHT SIGN 80 {{'a', 0, 0, 0}}, // FEMININE ORDINAL INDICATOR 81 {{'<', '<', 0, 0}}, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 82 {{0, 0, 0, 0}}, // NOT SIGN 83 {{'-', 0, 0, 0}}, // SOFT HYPHEN 84 {{'(', 'R', ')', 0}}, // REGISTERED SIGN 85 {{0, 0, 0, 0}}, // MACRON 86 {{0, 0, 0, 0}}, // DEGREE SIGN 87 {{'+', '-', 0, 0}}, // PLUS-MINUS SIGN 88 {{'2', 0, 0, 0}}, // SUPERSCRIPT TWO 89 {{'3', 0, 0, 0}}, // SUPERSCRIPT THREE 90 {{0, 0, 0, 0}}, // ACUTE ACCENT 91 {{0, 0, 0, 0}}, // MICRO SIGN 92 {{0, 0, 0, 0}}, // PILCROW SIGN 93 {{0, 0, 0, 0}}, // MIDDLE DOT 94 {{0, 0, 0, 0}}, // CEDILLA 95 {{'1', 0, 0, 0}}, // SUPERSCRIPT ONE 96 {{'o', 0, 0, 0}}, // MASCULINE ORDINAL INDICATOR 97 {{'>', '>', 0, 0}}, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 98 {{'1', '/', '4', 0}}, // VULGAR FRACTION ONE QUARTER 99 {{'1', '/', '2', 0}}, // VULGAR FRACTION ONE HALF 100 {{'3', '/', '4', 0}}, // VULGAR FRACTION THREE QUARTERS 101 {{'?', 0, 0, 0}}, // INVERTED QUESTION MARK 102 {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH GRAVE 103 {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH ACUTE 104 {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX 105 {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH TILDE 106 {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH DIAERESIS 107 {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH RING ABOVE 108 {{'A', 'E', 0, 0}}, // LATIN CAPITAL LETTER AE 109 {{'C', 0, 0, 0}}, // LATIN CAPITAL LETTER C WITH CEDILLA 110 {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH GRAVE 111 {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH ACUTE 112 {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX 113 {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH DIAERESIS 114 {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH GRAVE 115 {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH ACUTE 116 {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX 117 {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH DIAERESIS 118 {{'T', 'H', 0, 0}}, // LATIN CAPITAL LETTER ETH (Icelandic) 119 {{'N', 0, 0, 0}}, // LATIN CAPITAL LETTER N WITH TILDE 120 {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH GRAVE 121 {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH ACUTE 122 {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX 123 {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH TILDE 124 {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH DIAERESIS 125 {{'X', 0, 0, 0}}, // MULTIPLICATION SIGN 126 {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH STROKE 127 {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH GRAVE 128 {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH ACUTE 129 {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX 130 {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH DIAERESIS 131 {{'Y', 0, 0, 0}}, // LATIN CAPITAL LETTER Y WITH ACUTE 132 {{'t', 'h', 0, 0}}, // LATIN CAPITAL LETTER THORN (Icelandic) 133 {{'s', 0, 0, 0}}, // LATIN SMALL LETTER SHARP S (German) 134 {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH GRAVE 135 {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH ACUTE 136 {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH CIRCUMFLEX 137 {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH TILDE 138 {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH DIAERESIS 139 {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH RING ABOVE 140 {{'a', 'e', 0, 0}}, // LATIN SMALL LETTER AE 141 {{'c', 0, 0, 0}}, // LATIN SMALL LETTER C WITH CEDILLA 142 {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH GRAVE 143 {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH ACUTE 144 {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH CIRCUMFLEX 145 {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH DIAERESIS 146 {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH GRAVE 147 {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH ACUTE 148 {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH CIRCUMFLEX 149 {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH DIAERESIS 150 {{'T', 'H', 0, 0}}, // LATIN SMALL LETTER ETH (Icelandic) 151 {{'n', 0, 0, 0}}, // LATIN SMALL LETTER N WITH TILDE 152 {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH GRAVE 153 {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH ACUTE 154 {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH CIRCUMFLEX 155 {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH TILDE 156 {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH DIAERESIS 157 {{'/', 0, 0, 0}}, // DIVISION SIGN 158 {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH STROKE 159 {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH GRAVE 160 {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH ACUTE 161 {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH CIRCUMFLEX 162 {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH DIAERESIS 163 {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH ACUTE 164 {{'t', 'h', 0, 0}}, // LATIN SMALL LETTER THORN (Icelandic) 165 {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH DIAERESIS 166}; 167 168CF_INLINE CFIndex __CFToASCIILatin1Fallback(UniChar character, uint8_t *bytes, CFIndex maxByteLen) { 169 const uint8_t *losChars = (const uint8_t*)_toLossyASCIITable + (character - 0xA0) * sizeof(uint8_t[4]); 170 CFIndex numBytes = 0; 171 CFIndex idx, max = (maxByteLen && (maxByteLen < 4) ? maxByteLen : 4); 172 173 for (idx = 0;idx < max;idx++) { 174 if (losChars[idx]) { 175 if (maxByteLen) bytes[idx] = losChars[idx]; 176 ++numBytes; 177 } else { 178 break; 179 } 180 } 181 182 return numBytes; 183} 184 185static CFIndex __CFDefaultToBytesFallbackProc(const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { 186 CFIndex processCharLen = 1, filledBytesLen = 1; 187 uint8_t byte = '?'; 188 189 if (*characters < 0xA0) { // 0x80 to 0x9F maps to ASCII C0 range 190 byte = (uint8_t)(*characters - 0x80); 191 } else if (*characters < 0x100) { 192 *usedByteLen = __CFToASCIILatin1Fallback(*characters, bytes, maxByteLen); 193 return 1; 194 } else if (*characters >= kSurrogateHighStart && *characters <= kSurrogateLowEnd) { 195 processCharLen = (numChars > 1 && *characters <= kSurrogateLowStart && *(characters + 1) >= kSurrogateLowStart && *(characters + 1) <= kSurrogateLowEnd ? 2 : 1); 196 } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceCharacterSet)) { 197 byte = ' '; 198 } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceAndNewlineCharacterSet)) { 199 byte = ASCIINewLine; 200 } else if (*characters == 0x2026) { // ellipsis 201 if (0 == maxByteLen) { 202 filledBytesLen = 3; 203 } else if (maxByteLen > 2) { 204 memset(bytes, '.', 3); 205 *usedByteLen = 3; 206 return processCharLen; 207 } 208 } else if (CFUniCharIsMemberOf(*characters, kCFUniCharDecomposableCharacterSet)) { 209 UTF32Char decomposed[MAX_DECOMPOSED_LENGTH]; 210 211 (void)CFUniCharDecomposeCharacter(*characters, decomposed, MAX_DECOMPOSED_LENGTH); 212 if (*decomposed < 0x80) { 213 byte = (uint8_t)(*decomposed); 214 } else { 215 UTF16Char theChar = *decomposed; 216 217 return __CFDefaultToBytesFallbackProc(&theChar, 1, bytes, maxByteLen, usedByteLen); 218 } 219 } 220 221 if (maxByteLen) *bytes = byte; 222 *usedByteLen = filledBytesLen; 223 return processCharLen; 224} 225 226static CFIndex __CFDefaultToUnicodeFallbackProc(const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { 227 if (maxCharLen) *characters = (UniChar)'?'; 228 *usedCharLen = 1; 229 return 1; 230} 231 232#define TO_BYTE_FALLBACK(conv,chars,numChars,bytes,max,used) (conv->toBytesFallback(chars,numChars,bytes,max,used)) 233#define TO_UNICODE_FALLBACK(conv,bytes,numBytes,chars,max,used) (conv->toUnicodeFallback(bytes,numBytes,chars,max,used)) 234 235#define EXTRA_BASE (0x0F00) 236 237/* Wrapper funcs for non-standard converters 238*/ 239static CFIndex __CFToBytesCheapEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { 240 CFIndex processedCharLen = 0; 241 CFIndex length = (maxByteLen && (maxByteLen < numChars) ? maxByteLen : numChars); 242 uint8_t byte; 243 244 while (processedCharLen < length) { 245 if (!((CFStringEncodingCheapEightBitToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters[processedCharLen], &byte)) break; 246 247 if (maxByteLen) bytes[processedCharLen] = byte; 248 processedCharLen++; 249 } 250 251 *usedByteLen = processedCharLen; 252 return processedCharLen; 253} 254 255static CFIndex __CFToUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { 256 CFIndex processedByteLen = 0; 257 CFIndex length = (maxCharLen && (maxCharLen < numBytes) ? maxCharLen : numBytes); 258 UniChar character; 259 260 while (processedByteLen < length) { 261 if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], &character)) break; 262 263 if (maxCharLen) characters[processedByteLen] = character; 264 processedByteLen++; 265 } 266 267 *usedCharLen = processedByteLen; 268 return processedByteLen; 269} 270 271static CFIndex __CFToCanonicalUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { 272 CFIndex processedByteLen = 0; 273 CFIndex theUsedCharLen = 0; 274 UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH]; 275 CFIndex usedLen; 276 UniChar character; 277 bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); 278 279 while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { 280 if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], &character)) break; 281 282 if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) { 283 CFIndex idx; 284 285 usedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH); 286 *usedCharLen = theUsedCharLen; 287 288 for (idx = 0;idx < usedLen;idx++) { 289 if (charBuffer[idx] > 0xFFFF) { // Non-BMP 290 if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; 291 theUsedCharLen += 2; 292 if (maxCharLen) { 293 charBuffer[idx] = charBuffer[idx] - 0x10000; 294 *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL; 295 *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL; 296 } 297 } else { 298 if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; 299 ++theUsedCharLen; 300 *(characters++) = charBuffer[idx]; 301 } 302 } 303 } else { 304 if (maxCharLen) *(characters++) = character; 305 ++theUsedCharLen; 306 } 307 processedByteLen++; 308 } 309 310 *usedCharLen = theUsedCharLen; 311 return processedByteLen; 312} 313 314static CFIndex __CFToBytesStandardEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { 315 CFIndex processedCharLen = 0; 316 uint8_t byte; 317 CFIndex usedLen; 318 319 *usedByteLen = 0; 320 321 while (numChars && (!maxByteLen || (*usedByteLen < maxByteLen))) { 322 if (!(usedLen = ((CFStringEncodingStandardEightBitToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters, numChars, &byte))) break; 323 324 if (maxByteLen) bytes[*usedByteLen] = byte; 325 (*usedByteLen)++; 326 characters += usedLen; 327 numChars -= usedLen; 328 processedCharLen += usedLen; 329 } 330 331 return processedCharLen; 332} 333 334static CFIndex __CFToUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { 335 CFIndex processedByteLen = 0; 336 UniChar charBuffer[__CFMaximumConvertedLength]; 337 CFIndex usedLen; 338 339 *usedCharLen = 0; 340 341 while ((processedByteLen < numBytes) && (!maxCharLen || (*usedCharLen < maxCharLen))) { 342 if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], charBuffer))) break; 343 344 if (maxCharLen) { 345 CFIndex idx; 346 347 if (*usedCharLen + usedLen > maxCharLen) break; 348 349 for (idx = 0;idx < usedLen;idx++) { 350 characters[*usedCharLen + idx] = charBuffer[idx]; 351 } 352 } 353 *usedCharLen += usedLen; 354 processedByteLen++; 355 } 356 357 return processedByteLen; 358} 359 360static CFIndex __CFToCanonicalUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { 361 CFIndex processedByteLen = 0; 362 UniChar charBuffer[__CFMaximumConvertedLength]; 363 UTF32Char decompBuffer[MAX_DECOMPOSED_LENGTH]; 364 CFIndex usedLen; 365 CFIndex decompedLen; 366 CFIndex idx, decompIndex; 367 bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); 368 CFIndex theUsedCharLen = 0; 369 370 while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { 371 if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], charBuffer))) break; 372 373 for (idx = 0;idx < usedLen;idx++) { 374 if (CFUniCharIsDecomposableCharacter(charBuffer[idx], isHFSPlus)) { 375 decompedLen = CFUniCharDecomposeCharacter(charBuffer[idx], decompBuffer, MAX_DECOMPOSED_LENGTH); 376 *usedCharLen = theUsedCharLen; 377 378 for (decompIndex = 0;decompIndex < decompedLen;decompIndex++) { 379 if (decompBuffer[decompIndex] > 0xFFFF) { // Non-BMP 380 if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; 381 theUsedCharLen += 2; 382 if (maxCharLen) { 383 charBuffer[idx] = charBuffer[idx] - 0x10000; 384 *(characters++) = (charBuffer[idx] >> 10) + 0xD800UL; 385 *(characters++) = (charBuffer[idx] & 0x3FF) + 0xDC00UL; 386 } 387 } else { 388 if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; 389 ++theUsedCharLen; 390 *(characters++) = charBuffer[idx]; 391 } 392 } 393 } else { 394 if (maxCharLen) *(characters++) = charBuffer[idx]; 395 ++theUsedCharLen; 396 } 397 } 398 processedByteLen++; 399 } 400 401 *usedCharLen = theUsedCharLen; 402 return processedByteLen; 403} 404 405static CFIndex __CFToBytesCheapMultiByteWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { 406 CFIndex processedCharLen = 0; 407 uint8_t byteBuffer[__CFMaximumConvertedLength]; 408 CFIndex usedLen; 409 410 *usedByteLen = 0; 411 412 while ((processedCharLen < numChars) && (!maxByteLen || (*usedByteLen < maxByteLen))) { 413 if (!(usedLen = ((CFStringEncodingCheapMultiByteToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters[processedCharLen], byteBuffer))) break; 414 415 if (maxByteLen) { 416 CFIndex idx; 417 418 if (*usedByteLen + usedLen > maxByteLen) break; 419 420 for (idx = 0;idx <usedLen;idx++) { 421 bytes[*usedByteLen + idx] = byteBuffer[idx]; 422 } 423 } 424 425 *usedByteLen += usedLen; 426 processedCharLen++; 427 } 428 429 return processedCharLen; 430} 431 432static CFIndex __CFToUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { 433 CFIndex processedByteLen = 0; 434 UniChar character; 435 CFIndex usedLen; 436 437 *usedCharLen = 0; 438 439 while (numBytes && (!maxCharLen || (*usedCharLen < maxCharLen))) { 440 if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes, numBytes, &character))) break; 441 442 if (maxCharLen) *(characters++) = character; 443 (*usedCharLen)++; 444 processedByteLen += usedLen; 445 bytes += usedLen; 446 numBytes -= usedLen; 447 } 448 449 return processedByteLen; 450} 451 452static CFIndex __CFToCanonicalUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { 453 CFIndex processedByteLen = 0; 454 UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH]; 455 UniChar character; 456 CFIndex usedLen; 457 CFIndex decomposedLen; 458 CFIndex theUsedCharLen = 0; 459 bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); 460 461 while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) { 462 if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes, numBytes, &character))) break; 463 464 if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) { 465 CFIndex idx; 466 467 decomposedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH); 468 *usedCharLen = theUsedCharLen; 469 470 for (idx = 0;idx < decomposedLen;idx++) { 471 if (charBuffer[idx] > 0xFFFF) { // Non-BMP 472 if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; 473 theUsedCharLen += 2; 474 if (maxCharLen) { 475 charBuffer[idx] = charBuffer[idx] - 0x10000; 476 *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL; 477 *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL; 478 } 479 } else { 480 if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; 481 ++theUsedCharLen; 482 *(characters++) = charBuffer[idx]; 483 } 484 } 485 } else { 486 if (maxCharLen) *(characters++) = character; 487 ++theUsedCharLen; 488 } 489 490 processedByteLen += usedLen; 491 bytes += usedLen; 492 numBytes -= usedLen; 493 } 494 *usedCharLen = theUsedCharLen; 495 return processedByteLen; 496} 497 498/* static functions 499*/ 500CF_INLINE _CFEncodingConverter *__CFEncodingConverterFromDefinition(const CFStringEncodingConverter *definition, CFStringEncoding encoding) { 501#define NUM_OF_ENTRIES_CYCLE (10) 502 static uint32_t _currentIndex = 0; 503 static uint32_t _allocatedSize = 0; 504 static _CFEncodingConverter *_allocatedEntries = NULL; 505 _CFEncodingConverter *converter; 506 507 508 if ((_currentIndex + 1) >= _allocatedSize) { 509 _currentIndex = 0; 510 _allocatedSize = 0; 511 _allocatedEntries = NULL; 512 } 513 if (_allocatedEntries == NULL) { // Not allocated yet 514 _allocatedEntries = (_CFEncodingConverter *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(_CFEncodingConverter) * NUM_OF_ENTRIES_CYCLE, 0); 515 _allocatedSize = NUM_OF_ENTRIES_CYCLE; 516 converter = &(_allocatedEntries[_currentIndex]); 517 } else { 518 converter = &(_allocatedEntries[++_currentIndex]); 519 } 520 521 memset(converter, 0, sizeof(_CFEncodingConverter)); 522 523 converter->definition = definition; 524 525 switch (definition->encodingClass) { 526 case kCFStringEncodingConverterStandard: 527 converter->toBytes = NULL; 528 converter->toUnicode = NULL; 529 converter->toCanonicalUnicode = NULL; 530 break; 531 532 case kCFStringEncodingConverterCheapEightBit: 533 converter->toBytes = __CFToBytesCheapEightBitWrapper; 534 converter->toUnicode = __CFToUnicodeCheapEightBitWrapper; 535 converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapEightBitWrapper; 536 break; 537 538 case kCFStringEncodingConverterStandardEightBit: 539 converter->toBytes = __CFToBytesStandardEightBitWrapper; 540 converter->toUnicode = __CFToUnicodeStandardEightBitWrapper; 541 converter->toCanonicalUnicode = __CFToCanonicalUnicodeStandardEightBitWrapper; 542 break; 543 544 case kCFStringEncodingConverterCheapMultiByte: 545 converter->toBytes = __CFToBytesCheapMultiByteWrapper; 546 converter->toUnicode = __CFToUnicodeCheapMultiByteWrapper; 547 converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapMultiByteWrapper; 548 break; 549 550#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 551 case kCFStringEncodingConverterICU: 552 converter->toBytes = (_CFToBytesProc)__CFStringEncodingGetICUName(encoding); 553 break; 554#endif 555 556 case kCFStringEncodingConverterPlatformSpecific: 557 break; 558 559 default: // Shouln't be here 560 return NULL; 561 } 562 563 converter->toBytesFallback = (definition->toBytesFallback ? definition->toBytesFallback : __CFDefaultToBytesFallbackProc); 564 converter->toUnicodeFallback = (definition->toUnicodeFallback ? definition->toUnicodeFallback : __CFDefaultToUnicodeFallbackProc); 565 566 return converter; 567} 568 569CF_INLINE const CFStringEncodingConverter *__CFStringEncodingConverterGetDefinition(CFStringEncoding encoding) { 570 switch (encoding) { 571 case kCFStringEncodingUTF8: 572 return &__CFConverterUTF8; 573 574 case kCFStringEncodingMacRoman: 575 return &__CFConverterMacRoman; 576 577 case kCFStringEncodingWindowsLatin1: 578 return &__CFConverterWinLatin1; 579 580 case kCFStringEncodingASCII: 581 return &__CFConverterASCII; 582 583 case kCFStringEncodingISOLatin1: 584 return &__CFConverterISOLatin1; 585 586 587 case kCFStringEncodingNextStepLatin: 588 return &__CFConverterNextStepLatin; 589 590 591 default: 592 return __CFStringEncodingGetExternalConverter(encoding); 593 } 594} 595 596static const _CFEncodingConverter *__CFGetConverter(uint32_t encoding) { 597 const _CFEncodingConverter *converter = NULL; 598 const _CFEncodingConverter **commonConverterSlot = NULL; 599 static _CFEncodingConverter *commonConverters[3] = {NULL, NULL, NULL}; // UTF8, MacRoman/WinLatin1, and the default encoding* 600 static CFMutableDictionaryRef mappingTable = NULL; 601 static CFSpinLock_t lock = CFSpinLockInit; 602 603 switch (encoding) { 604 case kCFStringEncodingUTF8: commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[0]); break; 605 606 /* the swith here should avoid possible bootstrap issues in the default: case below when invoked from CFStringGetSystemEncoding() */ 607#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI || DEPLOYMENT_TARGET_LINUX 608 case kCFStringEncodingMacRoman: commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[1]); break; 609#elif DEPLOYMENT_TARGET_WINDOWS 610 case kCFStringEncodingWindowsLatin1: commonConverterSlot = (const _CFEncodingConverter **)(&(commonConverters[1])); break; 611#else 612#warning This case must match __defaultEncoding value defined in CFString.c 613 case kCFStringEncodingISOLatin1: commonConverterSlot = (const _CFEncodingConverter **)(&(commonConverters[1])); break; 614#endif 615 616 default: if (CFStringGetSystemEncoding() == encoding) commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[2]); break; 617 } 618 619 __CFSpinLock(&lock); 620 converter = ((NULL == commonConverterSlot) ? ((NULL == mappingTable) ? NULL : (const _CFEncodingConverter *)CFDictionaryGetValue(mappingTable, (const void *)(uintptr_t)encoding)) : *commonConverterSlot); 621 __CFSpinUnlock(&lock); 622 623 if (NULL == converter) { 624 const CFStringEncodingConverter *definition = __CFStringEncodingConverterGetDefinition(encoding); 625 626 if (NULL != definition) { 627 __CFSpinLock(&lock); 628 converter = ((NULL == commonConverterSlot) ? ((NULL == mappingTable) ? NULL : (const _CFEncodingConverter *)CFDictionaryGetValue(mappingTable, (const void *)(uintptr_t)encoding)) : *commonConverterSlot); 629 630 if (NULL == converter) { 631 converter = __CFEncodingConverterFromDefinition(definition, encoding); 632 633 if (NULL == commonConverterSlot) { 634 if (NULL == mappingTable) mappingTable = CFDictionaryCreateMutable(NULL, 0, NULL, NULL); 635 636 CFDictionarySetValue(mappingTable, (const void *)(uintptr_t)encoding, converter); 637 } else { 638 *commonConverterSlot = converter; 639 } 640 } 641 __CFSpinUnlock(&lock); 642 } 643 } 644 645 return converter; 646} 647 648/* Public API 649*/ 650uint32_t CFStringEncodingUnicodeToBytes(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { 651 if (encoding == kCFStringEncodingUTF8) { 652 static CFStringEncodingToBytesProc __CFToUTF8 = NULL; 653 CFIndex convertedCharLen; 654 CFIndex usedLen; 655 656 657 if ((flags & kCFStringEncodingUseCanonical) || (flags & kCFStringEncodingUseHFSPlusCanonical)) { 658 (void)CFUniCharDecompose(characters, numChars, &convertedCharLen, (void *)bytes, maxByteLen, &usedLen, true, kCFUniCharUTF8Format, (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false)); 659 } else { 660 if (!__CFToUTF8) { 661 const CFStringEncodingConverter *utf8Converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8); 662 __CFToUTF8 = (CFStringEncodingToBytesProc)utf8Converter->toBytes; 663 } 664 convertedCharLen = __CFToUTF8(0, characters, numChars, bytes, maxByteLen, &usedLen); 665 } 666 if (usedCharLen) *usedCharLen = convertedCharLen; 667 if (usedByteLen) *usedByteLen = usedLen; 668 669 if (convertedCharLen == numChars) { 670 return kCFStringEncodingConversionSuccess; 671 } else if ((maxByteLen > 0) && ((maxByteLen - usedLen) < 10)) { // could be filled outbuf 672 UTF16Char character = characters[convertedCharLen]; 673 674 if (((character >= kSurrogateLowStart) && (character <= kSurrogateLowEnd)) || ((character >= kSurrogateHighStart) && (character <= kSurrogateHighEnd) && ((1 == (numChars - convertedCharLen)) || (characters[convertedCharLen + 1] < kSurrogateLowStart) || (characters[convertedCharLen + 1] > kSurrogateLowEnd)))) return kCFStringEncodingInvalidInputStream; 675 676 return kCFStringEncodingInsufficientOutputBufferLength; 677 } else { 678 return kCFStringEncodingInvalidInputStream; 679 } 680 } else { 681 const _CFEncodingConverter *converter = __CFGetConverter(encoding); 682 CFIndex usedLen = 0; 683 CFIndex localUsedByteLen; 684 CFIndex theUsedByteLen = 0; 685 uint32_t theResult = kCFStringEncodingConversionSuccess; 686 CFStringEncodingToBytesPrecomposeProc toBytesPrecompose = NULL; 687 CFStringEncodingIsValidCombiningCharacterProc isValidCombiningChar = NULL; 688 689 if (!converter) return kCFStringEncodingConverterUnavailable; 690 691 if (flags & kCFStringEncodingSubstituteCombinings) { 692 if (!(flags & kCFStringEncodingAllowLossyConversion)) isValidCombiningChar = converter->definition->isValidCombiningChar; 693 } else { 694 isValidCombiningChar = converter->definition->isValidCombiningChar; 695 if (!(flags & kCFStringEncodingIgnoreCombinings)) { 696 toBytesPrecompose = converter->definition->toBytesPrecompose; 697 flags |= kCFStringEncodingComposeCombinings; 698 } 699 } 700 701#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 702 if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUToBytes((const char *)converter->toBytes, flags, characters, numChars, usedCharLen, bytes, maxByteLen, usedByteLen); 703#endif 704 705 /* Platform converter */ 706 if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformUnicodeToBytes(encoding, flags, characters, numChars, usedCharLen, bytes, maxByteLen, usedByteLen); 707 708 while ((usedLen < numChars) && (!maxByteLen || (theUsedByteLen < maxByteLen))) { 709 if ((usedLen += TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) < numChars) { 710 CFIndex dummy; 711 712 if (isValidCombiningChar && (usedLen > 0) && isValidCombiningChar(characters[usedLen])) { 713 if (toBytesPrecompose) { 714 CFIndex localUsedLen = usedLen; 715 716 while (isValidCombiningChar(characters[--usedLen])); 717 theUsedByteLen += localUsedByteLen; 718 if (converter->definition->maxBytesPerChar > 1) { 719 TO_BYTE(converter, flags, characters + usedLen, localUsedLen - usedLen, NULL, 0, &localUsedByteLen); 720 theUsedByteLen -= localUsedByteLen; 721 } else { 722 theUsedByteLen--; 723 } 724 if ((localUsedLen = toBytesPrecompose(flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) > 0) { 725 usedLen += localUsedLen; 726 if ((usedLen < numChars) && isValidCombiningChar(characters[usedLen])) { // There is a non-base char not combined remaining 727 theUsedByteLen += localUsedByteLen; 728 theResult = kCFStringEncodingInvalidInputStream; 729 break; 730 } 731 } else if (flags & kCFStringEncodingAllowLossyConversion) { 732 uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); 733 734 if (lossyByte) { 735 while (isValidCombiningChar(characters[++usedLen])); 736 localUsedByteLen = 1; 737 if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; 738 } else { 739 ++usedLen; 740 usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); 741 } 742 } else { 743 theResult = kCFStringEncodingInvalidInputStream; 744 break; 745 } 746 } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up 747 theUsedByteLen += localUsedByteLen; 748 theResult = kCFStringEncodingInsufficientOutputBufferLength; 749 break; 750 } else if (flags & kCFStringEncodingIgnoreCombinings) { 751 while ((++usedLen < numChars) && isValidCombiningChar(characters[usedLen])); 752 } else { 753 uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); 754 755 theUsedByteLen += localUsedByteLen; 756 if (lossyByte) { 757 ++usedLen; 758 localUsedByteLen = 1; 759 if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; 760 } else { 761 usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); 762 } 763 } 764 } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up 765 theUsedByteLen += localUsedByteLen; 766 767 if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) { 768 CFIndex localUsedLen; 769 770 localUsedByteLen = 0; 771 while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen; 772 } 773 if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength; 774 break; 775 } else if (flags & kCFStringEncodingAllowLossyConversion) { 776 uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); 777 778 theUsedByteLen += localUsedByteLen; 779 if (lossyByte) { 780 ++usedLen; 781 localUsedByteLen = 1; 782 if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; 783 } else { 784 usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); 785 } 786 } else { 787 theUsedByteLen += localUsedByteLen; 788 theResult = kCFStringEncodingInvalidInputStream; 789 break; 790 } 791 } 792 theUsedByteLen += localUsedByteLen; 793 } 794 795 if (usedLen < numChars && maxByteLen && theResult == kCFStringEncodingConversionSuccess) { 796 if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) { 797 CFIndex localUsedLen; 798 799 localUsedByteLen = 0; 800 while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen; 801 } 802 if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength; 803 } 804 if (usedByteLen) *usedByteLen = theUsedByteLen; 805 if (usedCharLen) *usedCharLen = usedLen; 806 807 return theResult; 808 } 809} 810 811uint32_t CFStringEncodingBytesToUnicode(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { 812 const _CFEncodingConverter *converter = __CFGetConverter(encoding); 813 CFIndex usedLen = 0; 814 CFIndex theUsedCharLen = 0; 815 CFIndex localUsedCharLen; 816 uint32_t theResult = kCFStringEncodingConversionSuccess; 817 818 if (!converter) return kCFStringEncodingConverterUnavailable; 819 820#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 821 if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUToUnicode((const char *)converter->toBytes, flags, bytes, numBytes, usedByteLen, characters, maxCharLen, usedCharLen); 822#endif 823 824 /* Platform converter */ 825 if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformBytesToUnicode(encoding, flags, bytes, numBytes, usedByteLen, characters, maxCharLen, usedCharLen); 826 827 while ((usedLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { 828 if ((usedLen += TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen)) < numBytes) { 829 CFIndex tempUsedCharLen; 830 831 if (maxCharLen && ((maxCharLen == theUsedCharLen + localUsedCharLen) || (((flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical)) || (maxCharLen == theUsedCharLen + localUsedCharLen + 1)) && TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, NULL, 0, &tempUsedCharLen)))) { // buffer was filled up 832 theUsedCharLen += localUsedCharLen; 833 theResult = kCFStringEncodingInsufficientOutputBufferLength; 834 break; 835 } else if (flags & kCFStringEncodingAllowLossyConversion) { 836 theUsedCharLen += localUsedCharLen; 837 usedLen += TO_UNICODE_FALLBACK(converter, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen); 838 } else { 839 theUsedCharLen += localUsedCharLen; 840 theResult = kCFStringEncodingInvalidInputStream; 841 break; 842 } 843 } 844 theUsedCharLen += localUsedCharLen; 845 } 846 847 if (usedLen < numBytes && maxCharLen && theResult == kCFStringEncodingConversionSuccess) { 848 theResult = kCFStringEncodingInsufficientOutputBufferLength; 849 } 850 if (usedCharLen) *usedCharLen = theUsedCharLen; 851 if (usedByteLen) *usedByteLen = usedLen; 852 853 return theResult; 854} 855 856CF_PRIVATE bool CFStringEncodingIsValidEncoding(uint32_t encoding) { 857 return (CFStringEncodingGetConverter(encoding) ? true : false); 858} 859 860CF_PRIVATE CFIndex CFStringEncodingCharLengthForBytes(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) { 861 const _CFEncodingConverter *converter = __CFGetConverter(encoding); 862 863 if (converter) { 864#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 865 if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUCharLength((const char *)converter->toBytes, flags, bytes, numBytes); 866#endif 867 868 if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformCharLengthForBytes(encoding, flags, bytes, numBytes); 869 870 if (1 == converter->definition->maxBytesPerChar) return numBytes; 871 872 if (NULL == converter->definition->toUnicodeLen) { 873 CFIndex usedByteLen = 0; 874 CFIndex totalLength = 0; 875 CFIndex usedCharLen; 876 877 while (numBytes > 0) { 878 usedByteLen = TO_UNICODE(converter, flags, bytes, numBytes, NULL, 0, &usedCharLen); 879 880 bytes += usedByteLen; 881 numBytes -= usedByteLen; 882 totalLength += usedCharLen; 883 884 if (numBytes > 0) { 885 if (0 == (flags & kCFStringEncodingAllowLossyConversion)) return 0; 886 887 usedByteLen = TO_UNICODE_FALLBACK(converter, bytes, numBytes, NULL, 0, &usedCharLen); 888 889 bytes += usedByteLen; 890 numBytes -= usedByteLen; 891 totalLength += usedCharLen; 892 } 893 } 894 895 return totalLength; 896 } else { 897 return converter->definition->toUnicodeLen(flags, bytes, numBytes); 898 } 899 } 900 901 return 0; 902} 903 904CF_PRIVATE CFIndex CFStringEncodingByteLengthForCharacters(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars) { 905 const _CFEncodingConverter *converter = __CFGetConverter(encoding); 906 907 if (converter) { 908#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 909 if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUByteLength((const char *)converter->toBytes, flags, characters, numChars); 910#endif 911 912 if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformByteLengthForCharacters(encoding, flags, characters, numChars); 913 914 if (1 == converter->definition->maxBytesPerChar) return numChars; 915 916 if (NULL == converter->definition->toBytesLen) { 917 CFIndex usedByteLen; 918 919 return ((kCFStringEncodingConversionSuccess == CFStringEncodingUnicodeToBytes(encoding, flags, characters, numChars, NULL, NULL, 0, &usedByteLen)) ? usedByteLen : 0); 920 } else { 921 return converter->definition->toBytesLen(flags, characters, numChars); 922 } 923 } 924 925 return 0; 926} 927 928void CFStringEncodingRegisterFallbackProcedures(uint32_t encoding, CFStringEncodingToBytesFallbackProc toBytes, CFStringEncodingToUnicodeFallbackProc toUnicode) { 929 _CFEncodingConverter *converter = (_CFEncodingConverter *)__CFGetConverter(encoding); 930 931 if (NULL != converter) { 932 const CFStringEncodingConverter *body = CFStringEncodingGetConverter(encoding); 933 934 converter->toBytesFallback = ((NULL == toBytes) ? ((NULL == body) ? __CFDefaultToBytesFallbackProc : body->toBytesFallback) : toBytes); 935 converter->toUnicodeFallback = ((NULL == toUnicode) ? ((NULL == body) ? __CFDefaultToUnicodeFallbackProc : body->toUnicodeFallback) : toUnicode); 936 } 937} 938 939CF_PRIVATE const CFStringEncodingConverter *CFStringEncodingGetConverter(uint32_t encoding) { 940 const _CFEncodingConverter *converter = __CFGetConverter(encoding); 941 942 return ((NULL == converter) ? NULL : converter->definition); 943} 944 945static const CFStringEncoding __CFBuiltinEncodings[] = { 946 kCFStringEncodingMacRoman, 947 kCFStringEncodingWindowsLatin1, 948 kCFStringEncodingISOLatin1, 949 kCFStringEncodingNextStepLatin, 950 kCFStringEncodingASCII, 951 kCFStringEncodingUTF8, 952 /* These seven are available only in CFString-level */ 953 kCFStringEncodingNonLossyASCII, 954 955 kCFStringEncodingUTF16, 956 kCFStringEncodingUTF16BE, 957 kCFStringEncodingUTF16LE, 958 959 kCFStringEncodingUTF32, 960 kCFStringEncodingUTF32BE, 961 kCFStringEncodingUTF32LE, 962 963 kCFStringEncodingInvalidId, 964}; 965 966static CFComparisonResult __CFStringEncodingComparator(const void *v1, const void *v2, void *context) { 967 CFComparisonResult val1 = (*(const CFStringEncoding *)v1) & 0xFFFF; 968 CFComparisonResult val2 = (*(const CFStringEncoding *)v2) & 0xFFFF; 969 970 return ((val1 == val2) ? ((CFComparisonResult)(*(const CFStringEncoding *)v1) - (CFComparisonResult)(*(const CFStringEncoding *)v2)) : val1 - val2); 971} 972 973static void __CFStringEncodingFliterDupes(CFStringEncoding *encodings, CFIndex numSlots) { 974 CFStringEncoding last = kCFStringEncodingInvalidId; 975 const CFStringEncoding *limitEncodings = encodings + numSlots; 976 977 while (encodings < limitEncodings) { 978 if (last == *encodings) { 979 if ((encodings + 1) < limitEncodings) memmove(encodings, encodings + 1, sizeof(CFStringEncoding) * (limitEncodings - encodings - 1)); 980 --limitEncodings; 981 } else { 982 last = *(encodings++); 983 } 984 } 985} 986 987CF_PRIVATE const CFStringEncoding *CFStringEncodingListOfAvailableEncodings(void) { 988 static const CFStringEncoding *encodings = NULL; 989 990 if (NULL == encodings) { 991 CFStringEncoding *list = (CFStringEncoding *)__CFBuiltinEncodings; 992 CFIndex numICUConverters = 0, numPlatformConverters = 0; 993#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 994 CFStringEncoding *icuConverters = __CFStringEncodingCreateICUEncodings(NULL, &numICUConverters); 995#else 996 CFStringEncoding *icuConverters = NULL; 997#endif 998 CFStringEncoding *platformConverters = __CFStringEncodingCreateListOfAvailablePlatformConverters(NULL, &numPlatformConverters); 999 1000 if ((NULL != icuConverters) || (NULL != platformConverters)) { 1001 CFIndex numSlots = (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)) + numICUConverters + numPlatformConverters; 1002 1003 list = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * numSlots, 0); 1004 1005 memcpy(list, __CFBuiltinEncodings, sizeof(__CFBuiltinEncodings)); 1006 1007 if (NULL != icuConverters) { 1008 memcpy(list + (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)), icuConverters, sizeof(CFStringEncoding) * numICUConverters); 1009 CFAllocatorDeallocate(NULL, icuConverters); 1010 } 1011 1012 if (NULL != platformConverters) { 1013 memcpy(list + (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)) + numICUConverters, platformConverters, sizeof(CFStringEncoding) * numPlatformConverters); 1014 CFAllocatorDeallocate(NULL, platformConverters); 1015 } 1016 1017 CFQSortArray(list, numSlots, sizeof(CFStringEncoding), (CFComparatorFunction)__CFStringEncodingComparator, NULL); 1018 __CFStringEncodingFliterDupes(list, numSlots); 1019 } 1020 if (!OSAtomicCompareAndSwapPtrBarrier(NULL, list, (void * volatile *)&encodings) && (list != __CFBuiltinEncodings)) CFAllocatorDeallocate(NULL, list); 1021 } 1022 1023 return encodings; 1024} 1025 1026#undef TO_BYTE 1027#undef TO_UNICODE 1028#undef ASCIINewLine 1029#undef kSurrogateHighStart 1030#undef kSurrogateHighEnd 1031#undef kSurrogateLowStart 1032#undef kSurrogateLowEnd 1033#undef TO_BYTE_FALLBACK 1034#undef TO_UNICODE_FALLBACK 1035#undef EXTRA_BASE 1036#undef NUM_OF_ENTRIES_CYCLE 1037 1038