1/* 2 * Copyright (c) 2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. Please obtain a copy of the License at 10 * http://www.opensource.apple.com/apsl/ and read it before using this 11 * file. 12 * 13 * The Original Code and all software distributed under the License are 14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 18 * Please see the License for the specific language governing rights and 19 * limitations under the License. 20 * 21 * @APPLE_LICENSE_HEADER_END@ 22 */ 23 24/* CFBuiltinConverters.c 25 Copyright (c) 1999-2013, Apple Inc. All rights reserved. 26 Responsibility: Aki Inoue 27*/ 28 29#include "CFStringEncodingConverterExt.h" 30#include "CFUniChar.h" 31#include "CFUnicodeDecomposition.h" 32#include "CFUnicodePrecomposition.h" 33#include "CFStringEncodingConverterPriv.h" 34#include "CFInternal.h" 35 36#define ParagraphSeparator 0x2029 37#define ASCIINewLine 0x0a 38static int8_t __CFMapsParagraphSeparator = -1; 39 40CF_INLINE bool __CFIsParagraphSeparator(UTF16Char character) { 41 if (-1 == __CFMapsParagraphSeparator) __CFMapsParagraphSeparator = (1 ? false : true); 42 43 return ((__CFMapsParagraphSeparator && (ParagraphSeparator == character)) ? true : false); 44} 45 46/* Precomposition */ 47static const uint32_t __CFLatin1CombiningCharBitmap[] = { // 0x300 ~ 0x35FF 48 0xFBB94010, 0x01800000, 0x0000000, 49}; 50 51bool CFStringEncodingIsValidCombiningCharacterForLatin1(UniChar character) { 52 return ((character >= 0x300) && (character < 0x360) && (__CFLatin1CombiningCharBitmap[(character - 0x300) / 32] & (1 << (31 - ((character - 0x300) % 32)))) ? true : false); 53} 54 55UniChar CFStringEncodingPrecomposeLatinCharacter(const UniChar *character, CFIndex numChars, CFIndex *usedChars) { 56 if (numChars > 0) { 57 UTF32Char ch = *(character++), nextCh, composedChar; 58 CFIndex usedCharLen = 1; 59 60 if (CFUniCharIsSurrogateHighCharacter(ch) || CFUniCharIsSurrogateLowCharacter(ch)) { 61 if (usedChars) (*usedChars) = usedCharLen; 62 return ch; 63 } 64 65 while (usedCharLen < numChars) { 66 nextCh = *(character++); 67 68 if (CFUniCharIsSurrogateHighCharacter(nextCh) || CFUniCharIsSurrogateLowCharacter(nextCh)) break; 69 70 if (CFUniCharIsMemberOf(nextCh, kCFUniCharNonBaseCharacterSet) && ((composedChar = CFUniCharPrecomposeCharacter(ch, nextCh)) != 0xFFFD)) { 71 if (composedChar > 0xFFFF) { // Non-base 72 break; 73 } else { 74 ch = composedChar; 75 } 76 } else { 77 break; 78 } 79 ++usedCharLen; 80 } 81 if (usedChars) (*usedChars) = usedCharLen; 82 if (usedCharLen > 1) return ch; 83 } 84 return 0xFFFD; 85} 86 87/* ASCII */ 88static bool __CFToASCII(uint32_t flags, UniChar character, uint8_t *byte) { 89 if (character < 0x80) { 90 *byte = (uint8_t)character; 91 } else if (__CFIsParagraphSeparator(character)) { 92 *byte = ASCIINewLine; 93 } else { 94 return false; 95 } 96 return true; 97} 98 99static bool __CFFromASCII(uint32_t flags, uint8_t byte, UniChar *character) { 100 if (byte < 0x80) { 101 *character = (UniChar)byte; 102 return true; 103 } else { 104 return false; 105 } 106} 107 108 109CF_PRIVATE const CFStringEncodingConverter __CFConverterASCII = { 110 __CFToASCII, __CFFromASCII, 1, 1, kCFStringEncodingConverterCheapEightBit, 111 NULL, NULL, NULL, NULL, NULL, NULL, 112}; 113 114/* ISO Latin 1 (8859-1) */ 115static bool __CFToISOLatin1(uint32_t flags, UniChar character, uint8_t *byte) { 116 if (character <= 0xFF) { 117 *byte = (uint8_t)character; 118 } else if (__CFIsParagraphSeparator(character)) { 119 *byte = ASCIINewLine; 120 } else { 121 return false; 122 } 123 124 return true; 125} 126 127static bool __CFFromISOLatin1(uint32_t flags, uint8_t byte, UniChar *character) { 128 *character = (UniChar)byte; 129 return true; 130} 131 132static CFIndex __CFToISOLatin1Precompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { 133 uint8_t byte; 134 CFIndex usedCharLen; 135 136 if (__CFToISOLatin1(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) { 137 if (maxByteLen) *bytes = byte; 138 *usedByteLen = 1; 139 return usedCharLen; 140 } else { 141 return 0; 142 } 143} 144 145CF_PRIVATE const CFStringEncodingConverter __CFConverterISOLatin1 = { 146 __CFToISOLatin1, __CFFromISOLatin1, 1, 1, kCFStringEncodingConverterCheapEightBit, 147 NULL, NULL, NULL, NULL, __CFToISOLatin1Precompose, CFStringEncodingIsValidCombiningCharacterForLatin1, 148}; 149 150/* Mac Roman */ 151#define NUM_MACROMAN_FROM_UNI 129 152static const CFStringEncodingUnicodeTo8BitCharMap macRoman_from_uni[NUM_MACROMAN_FROM_UNI] = { 153 { 0x00A0, 0xCA }, /* NO-BREAK SPACE */ 154 { 0x00A1, 0xC1 }, /* INVERTED EXCLAMATION MARK */ 155 { 0x00A2, 0xA2 }, /* CENT SIGN */ 156 { 0x00A3, 0xA3 }, /* POUND SIGN */ 157 { 0x00A5, 0xB4 }, /* YEN SIGN */ 158 { 0x00A7, 0xA4 }, /* SECTION SIGN */ 159 { 0x00A8, 0xAC }, /* DIAERESIS */ 160 { 0x00A9, 0xA9 }, /* COPYRIGHT SIGN */ 161 { 0x00AA, 0xBB }, /* FEMININE ORDINAL INDICATOR */ 162 { 0x00AB, 0xC7 }, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */ 163 { 0x00AC, 0xC2 }, /* NOT SIGN */ 164 { 0x00AE, 0xA8 }, /* REGISTERED SIGN */ 165 { 0x00AF, 0xF8 }, /* MACRON */ 166 { 0x00B0, 0xA1 }, /* DEGREE SIGN */ 167 { 0x00B1, 0xB1 }, /* PLUS-MINUS SIGN */ 168 { 0x00B4, 0xAB }, /* ACUTE ACCENT */ 169 { 0x00B5, 0xB5 }, /* MICRO SIGN */ 170 { 0x00B6, 0xA6 }, /* PILCROW SIGN */ 171 { 0x00B7, 0xE1 }, /* MIDDLE DOT */ 172 { 0x00B8, 0xFC }, /* CEDILLA */ 173 { 0x00BA, 0xBC }, /* MASCULINE ORDINAL INDICATOR */ 174 { 0x00BB, 0xC8 }, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */ 175 { 0x00BF, 0xC0 }, /* INVERTED QUESTION MARK */ 176 { 0x00C0, 0xCB }, /* LATIN CAPITAL LETTER A WITH GRAVE */ 177 { 0x00C1, 0xE7 }, /* LATIN CAPITAL LETTER A WITH ACUTE */ 178 { 0x00C2, 0xE5 }, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */ 179 { 0x00C3, 0xCC }, /* LATIN CAPITAL LETTER A WITH TILDE */ 180 { 0x00C4, 0x80 }, /* LATIN CAPITAL LETTER A WITH DIAERESIS */ 181 { 0x00C5, 0x81 }, /* LATIN CAPITAL LETTER A WITH RING ABOVE */ 182 { 0x00C6, 0xAE }, /* LATIN CAPITAL LIGATURE AE */ 183 { 0x00C7, 0x82 }, /* LATIN CAPITAL LETTER C WITH CEDILLA */ 184 { 0x00C8, 0xE9 }, /* LATIN CAPITAL LETTER E WITH GRAVE */ 185 { 0x00C9, 0x83 }, /* LATIN CAPITAL LETTER E WITH ACUTE */ 186 { 0x00CA, 0xE6 }, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */ 187 { 0x00CB, 0xE8 }, /* LATIN CAPITAL LETTER E WITH DIAERESIS */ 188 { 0x00CC, 0xED }, /* LATIN CAPITAL LETTER I WITH GRAVE */ 189 { 0x00CD, 0xEA }, /* LATIN CAPITAL LETTER I WITH ACUTE */ 190 { 0x00CE, 0xEB }, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */ 191 { 0x00CF, 0xEC }, /* LATIN CAPITAL LETTER I WITH DIAERESIS */ 192 { 0x00D1, 0x84 }, /* LATIN CAPITAL LETTER N WITH TILDE */ 193 { 0x00D2, 0xF1 }, /* LATIN CAPITAL LETTER O WITH GRAVE */ 194 { 0x00D3, 0xEE }, /* LATIN CAPITAL LETTER O WITH ACUTE */ 195 { 0x00D4, 0xEF }, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */ 196 { 0x00D5, 0xCD }, /* LATIN CAPITAL LETTER O WITH TILDE */ 197 { 0x00D6, 0x85 }, /* LATIN CAPITAL LETTER O WITH DIAERESIS */ 198 { 0x00D8, 0xAF }, /* LATIN CAPITAL LETTER O WITH STROKE */ 199 { 0x00D9, 0xF4 }, /* LATIN CAPITAL LETTER U WITH GRAVE */ 200 { 0x00DA, 0xF2 }, /* LATIN CAPITAL LETTER U WITH ACUTE */ 201 { 0x00DB, 0xF3 }, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */ 202 { 0x00DC, 0x86 }, /* LATIN CAPITAL LETTER U WITH DIAERESIS */ 203 { 0x00DF, 0xA7 }, /* LATIN SMALL LETTER SHARP S */ 204 { 0x00E0, 0x88 }, /* LATIN SMALL LETTER A WITH GRAVE */ 205 { 0x00E1, 0x87 }, /* LATIN SMALL LETTER A WITH ACUTE */ 206 { 0x00E2, 0x89 }, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */ 207 { 0x00E3, 0x8B }, /* LATIN SMALL LETTER A WITH TILDE */ 208 { 0x00E4, 0x8A }, /* LATIN SMALL LETTER A WITH DIAERESIS */ 209 { 0x00E5, 0x8C }, /* LATIN SMALL LETTER A WITH RING ABOVE */ 210 { 0x00E6, 0xBE }, /* LATIN SMALL LIGATURE AE */ 211 { 0x00E7, 0x8D }, /* LATIN SMALL LETTER C WITH CEDILLA */ 212 { 0x00E8, 0x8F }, /* LATIN SMALL LETTER E WITH GRAVE */ 213 { 0x00E9, 0x8E }, /* LATIN SMALL LETTER E WITH ACUTE */ 214 { 0x00EA, 0x90 }, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */ 215 { 0x00EB, 0x91 }, /* LATIN SMALL LETTER E WITH DIAERESIS */ 216 { 0x00EC, 0x93 }, /* LATIN SMALL LETTER I WITH GRAVE */ 217 { 0x00ED, 0x92 }, /* LATIN SMALL LETTER I WITH ACUTE */ 218 { 0x00EE, 0x94 }, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */ 219 { 0x00EF, 0x95 }, /* LATIN SMALL LETTER I WITH DIAERESIS */ 220 { 0x00F1, 0x96 }, /* LATIN SMALL LETTER N WITH TILDE */ 221 { 0x00F2, 0x98 }, /* LATIN SMALL LETTER O WITH GRAVE */ 222 { 0x00F3, 0x97 }, /* LATIN SMALL LETTER O WITH ACUTE */ 223 { 0x00F4, 0x99 }, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */ 224 { 0x00F5, 0x9B }, /* LATIN SMALL LETTER O WITH TILDE */ 225 { 0x00F6, 0x9A }, /* LATIN SMALL LETTER O WITH DIAERESIS */ 226 { 0x00F7, 0xD6 }, /* DIVISION SIGN */ 227 { 0x00F8, 0xBF }, /* LATIN SMALL LETTER O WITH STROKE */ 228 { 0x00F9, 0x9D }, /* LATIN SMALL LETTER U WITH GRAVE */ 229 { 0x00FA, 0x9C }, /* LATIN SMALL LETTER U WITH ACUTE */ 230 { 0x00FB, 0x9E }, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */ 231 { 0x00FC, 0x9F }, /* LATIN SMALL LETTER U WITH DIAERESIS */ 232 { 0x00FF, 0xD8 }, /* LATIN SMALL LETTER Y WITH DIAERESIS */ 233 { 0x0131, 0xF5 }, /* LATIN SMALL LETTER DOTLESS I */ 234 { 0x0152, 0xCE }, /* LATIN CAPITAL LIGATURE OE */ 235 { 0x0153, 0xCF }, /* LATIN SMALL LIGATURE OE */ 236 { 0x0178, 0xD9 }, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */ 237 { 0x0192, 0xC4 }, /* LATIN SMALL LETTER F WITH HOOK */ 238 { 0x02C6, 0xF6 }, /* MODIFIER LETTER CIRCUMFLEX ACCENT */ 239 { 0x02C7, 0xFF }, /* CARON */ 240 { 0x02D8, 0xF9 }, /* BREVE */ 241 { 0x02D9, 0xFA }, /* DOT ABOVE */ 242 { 0x02DA, 0xFB }, /* RING ABOVE */ 243 { 0x02DB, 0xFE }, /* OGONEK */ 244 { 0x02DC, 0xF7 }, /* SMALL TILDE */ 245 { 0x02DD, 0xFD }, /* DOUBLE ACUTE ACCENT */ 246 { 0x03A9, 0xBD }, /* OHM SIGN (Canonical ?) */ 247 { 0x03C0, 0xB9 }, /* GREEK SMALL LETTER PI */ 248 { 0x2013, 0xD0 }, /* EN DASH */ 249 { 0x2014, 0xD1 }, /* EM DASH */ 250 { 0x2018, 0xD4 }, /* LEFT SINGLE QUOTATION MARK */ 251 { 0x2019, 0xD5 }, /* RIGHT SINGLE QUOTATION MARK */ 252 { 0x201A, 0xE2 }, /* SINGLE LOW-9 QUOTATION MARK */ 253 { 0x201C, 0xD2 }, /* LEFT DOUBLE QUOTATION MARK */ 254 { 0x201D, 0xD3 }, /* RIGHT DOUBLE QUOTATION MARK */ 255 { 0x201E, 0xE3 }, /* DOUBLE LOW-9 QUOTATION MARK */ 256 { 0x2020, 0xA0 }, /* DAGGER */ 257 { 0x2021, 0xE0 }, /* DOUBLE DAGGER */ 258 { 0x2022, 0xA5 }, /* BULLET */ 259 { 0x2026, 0xC9 }, /* HORIZONTAL ELLIPSIS */ 260 { 0x2030, 0xE4 }, /* PER MILLE SIGN */ 261 { 0x2039, 0xDC }, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */ 262 { 0x203A, 0xDD }, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */ 263 { 0x2044, 0xDA }, /* FRACTION SLASH */ 264 { 0x20AC, 0xDB }, /* EURO SIGN */ 265 { 0x2122, 0xAA }, /* TRADE MARK SIGN */ 266 { 0x2126, 0xBD }, /* OHM SIGN */ 267 { 0x2202, 0xB6 }, /* PARTIAL DIFFERENTIAL */ 268 { 0x2206, 0xC6 }, /* INCREMENT */ 269 { 0x220F, 0xB8 }, /* N-ARY PRODUCT */ 270 { 0x2211, 0xB7 }, /* N-ARY SUMMATION */ 271 { 0x221A, 0xC3 }, /* SQUARE ROOT */ 272 { 0x221E, 0xB0 }, /* INFINITY */ 273 { 0x222B, 0xBA }, /* INTEGRAL */ 274 { 0x2248, 0xC5 }, /* ALMOST EQUAL TO */ 275 { 0x2260, 0xAD }, /* NOT EQUAL TO */ 276 { 0x2264, 0xB2 }, /* LESS-THAN OR EQUAL TO */ 277 { 0x2265, 0xB3 }, /* GREATER-THAN OR EQUAL TO */ 278 { 0x25CA, 0xD7 }, /* LOZENGE */ 279 { 0xF8FF, 0xF0 }, /* Apple logo */ 280 { 0xFB01, 0xDE }, /* LATIN SMALL LIGATURE FI */ 281 { 0xFB02, 0xDF }, /* LATIN SMALL LIGATURE FL */ 282}; 283 284static bool __CFToMacRoman(uint32_t flags, UniChar character, uint8_t *byte) { 285 if (character < 0x80) { 286 *byte = (uint8_t)character; 287 return true; 288 } else { 289 return CFStringEncodingUnicodeTo8BitEncoding(macRoman_from_uni, NUM_MACROMAN_FROM_UNI, character, byte); 290 } 291} 292 293static const UniChar macRoman_to_uni[128] = { 294 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */ 295 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */ 296 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */ 297 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */ 298 0x00D1, /* LATIN CAPITAL LETTER N WITH TILDE */ 299 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */ 300 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */ 301 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */ 302 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */ 303 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */ 304 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */ 305 0x00E3, /* LATIN SMALL LETTER A WITH TILDE */ 306 0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */ 307 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */ 308 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */ 309 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */ 310 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */ 311 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */ 312 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */ 313 0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */ 314 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */ 315 0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */ 316 0x00F1, /* LATIN SMALL LETTER N WITH TILDE */ 317 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */ 318 0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */ 319 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */ 320 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */ 321 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */ 322 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */ 323 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */ 324 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */ 325 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */ 326 0x2020, /* DAGGER */ 327 0x00B0, /* DEGREE SIGN */ 328 0x00A2, /* CENT SIGN */ 329 0x00A3, /* POUND SIGN */ 330 0x00A7, /* SECTION SIGN */ 331 0x2022, /* BULLET */ 332 0x00B6, /* PILCROW SIGN */ 333 0x00DF, /* LATIN SMALL LETTER SHARP S */ 334 0x00AE, /* REGISTERED SIGN */ 335 0x00A9, /* COPYRIGHT SIGN */ 336 0x2122, /* TRADE MARK SIGN */ 337 0x00B4, /* ACUTE ACCENT */ 338 0x00A8, /* DIAERESIS */ 339 0x2260, /* NOT EQUAL TO */ 340 0x00C6, /* LATIN CAPITAL LIGATURE AE */ 341 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */ 342 0x221E, /* INFINITY */ 343 0x00B1, /* PLUS-MINUS SIGN */ 344 0x2264, /* LESS-THAN OR EQUAL TO */ 345 0x2265, /* GREATER-THAN OR EQUAL TO */ 346 0x00A5, /* YEN SIGN */ 347 0x00B5, /* MICRO SIGN */ 348 0x2202, /* PARTIAL DIFFERENTIAL */ 349 0x2211, /* N-ARY SUMMATION */ 350 0x220F, /* N-ARY PRODUCT */ 351 0x03C0, /* GREEK SMALL LETTER PI */ 352 0x222B, /* INTEGRAL */ 353 0x00AA, /* FEMININE ORDINAL INDICATOR */ 354 0x00BA, /* MASCULINE ORDINAL INDICATOR */ 355 0x03A9, /* OHM SIGN (Canonical mapping) */ 356 0x00E6, /* LATIN SMALL LIGATURE AE */ 357 0x00F8, /* LATIN SMALL LETTER O WITH STROKE */ 358 0x00BF, /* INVERTED QUESTION MARK */ 359 0x00A1, /* INVERTED EXCLAMATION MARK */ 360 0x00AC, /* NOT SIGN */ 361 0x221A, /* SQUARE ROOT */ 362 0x0192, /* LATIN SMALL LETTER F WITH HOOK */ 363 0x2248, /* ALMOST EQUAL TO */ 364 0x2206, /* INCREMENT */ 365 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */ 366 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */ 367 0x2026, /* HORIZONTAL ELLIPSIS */ 368 0x00A0, /* NO-BREAK SPACE */ 369 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */ 370 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */ 371 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */ 372 0x0152, /* LATIN CAPITAL LIGATURE OE */ 373 0x0153, /* LATIN SMALL LIGATURE OE */ 374 0x2013, /* EN DASH */ 375 0x2014, /* EM DASH */ 376 0x201C, /* LEFT DOUBLE QUOTATION MARK */ 377 0x201D, /* RIGHT DOUBLE QUOTATION MARK */ 378 0x2018, /* LEFT SINGLE QUOTATION MARK */ 379 0x2019, /* RIGHT SINGLE QUOTATION MARK */ 380 0x00F7, /* DIVISION SIGN */ 381 0x25CA, /* LOZENGE */ 382 0x00FF, /* LATIN SMALL LETTER Y WITH DIAERESIS */ 383 0x0178, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */ 384 0x2044, /* FRACTION SLASH */ 385 0x20AC, /* EURO SIGN */ 386 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */ 387 0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */ 388 0xFB01, /* LATIN SMALL LIGATURE FI */ 389 0xFB02, /* LATIN SMALL LIGATURE FL */ 390 0x2021, /* DOUBLE DAGGER */ 391 0x00B7, /* MIDDLE DOT */ 392 0x201A, /* SINGLE LOW-9 QUOTATION MARK */ 393 0x201E, /* DOUBLE LOW-9 QUOTATION MARK */ 394 0x2030, /* PER MILLE SIGN */ 395 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */ 396 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */ 397 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */ 398 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */ 399 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */ 400 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */ 401 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */ 402 0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */ 403 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */ 404 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */ 405 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */ 406 0xF8FF, /* Apple logo */ 407 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */ 408 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */ 409 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */ 410 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */ 411 0x0131, /* LATIN SMALL LETTER DOTLESS I */ 412 0x02C6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */ 413 0x02DC, /* SMALL TILDE */ 414 0x00AF, /* MACRON */ 415 0x02D8, /* BREVE */ 416 0x02D9, /* DOT ABOVE */ 417 0x02DA, /* RING ABOVE */ 418 0x00B8, /* CEDILLA */ 419 0x02DD, /* DOUBLE ACUTE ACCENT */ 420 0x02DB, /* OGONEK */ 421 0x02C7, /* CARON */ 422}; 423 424static bool __CFFromMacRoman(uint32_t flags, uint8_t byte, UniChar *character) { 425 *character = (byte < 0x80 ? (UniChar)byte : macRoman_to_uni[byte - 0x80]); 426 return true; 427} 428 429static CFIndex __CFToMacRomanPrecompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { 430 uint8_t byte; 431 CFIndex usedCharLen; 432 433 if (__CFToMacRoman(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) { 434 if (maxByteLen) *bytes = byte; 435 *usedByteLen = 1; 436 return usedCharLen; 437 } else { 438 return 0; 439 } 440} 441 442CF_PRIVATE const CFStringEncodingConverter __CFConverterMacRoman = { 443 __CFToMacRoman, __CFFromMacRoman, 1, 1, kCFStringEncodingConverterCheapEightBit, 444 NULL, NULL, NULL, NULL, __CFToMacRomanPrecompose, CFStringEncodingIsValidCombiningCharacterForLatin1, 445}; 446 447/* Win Latin1 (ANSI CodePage 1252) */ 448#define NUM_1252_FROM_UNI 27 449static const CFStringEncodingUnicodeTo8BitCharMap cp1252_from_uni[NUM_1252_FROM_UNI] = { 450 {0x0152, 0x8C}, // LATIN CAPITAL LIGATURE OE 451 {0x0153, 0x9C}, // LATIN SMALL LIGATURE OE 452 {0x0160, 0x8A}, // LATIN CAPITAL LETTER S WITH CARON 453 {0x0161, 0x9A}, // LATIN SMALL LETTER S WITH CARON 454 {0x0178, 0x9F}, // LATIN CAPITAL LETTER Y WITH DIAERESIS 455 {0x017D, 0x8E}, // LATIN CAPITAL LETTER Z WITH CARON 456 {0x017E, 0x9E}, // LATIN SMALL LETTER Z WITH CARON 457 {0x0192, 0x83}, // LATIN SMALL LETTER F WITH HOOK 458 {0x02C6, 0x88}, // MODIFIER LETTER CIRCUMFLEX ACCENT 459 {0x02DC, 0x98}, // SMALL TILDE 460 {0x2013, 0x96}, // EN DASH 461 {0x2014, 0x97}, // EM DASH 462 {0x2018, 0x91}, // LEFT SINGLE QUOTATION MARK 463 {0x2019, 0x92}, // RIGHT SINGLE QUOTATION MARK 464 {0x201A, 0x82}, // SINGLE LOW-9 QUOTATION MARK 465 {0x201C, 0x93}, // LEFT DOUBLE QUOTATION MARK 466 {0x201D, 0x94}, // RIGHT DOUBLE QUOTATION MARK 467 {0x201E, 0x84}, // DOUBLE LOW-9 QUOTATION MARK 468 {0x2020, 0x86}, // DAGGER 469 {0x2021, 0x87}, // DOUBLE DAGGER 470 {0x2022, 0x95}, // BULLET 471 {0x2026, 0x85}, // HORIZONTAL ELLIPSIS 472 {0x2030, 0x89}, // PER MILLE SIGN 473 {0x2039, 0x8B}, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK 474 {0x203A, 0x9B}, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 475 {0x20AC, 0x80}, // EURO SIGN 476 {0x2122, 0x99}, // TRADE MARK SIGN 477}; 478 479static bool __CFToWinLatin1(uint32_t flags, UniChar character, uint8_t *byte) { 480 if ((character < 0x80) || ((character > 0x9F) && (character <= 0x00FF))) { 481 *byte = (uint8_t)character; 482 return true; 483 } 484 return CFStringEncodingUnicodeTo8BitEncoding(cp1252_from_uni, NUM_1252_FROM_UNI, character, byte); 485} 486 487static const uint16_t cp1252_to_uni[32] = { 488 0x20AC, // EURO SIGN 489 0xFFFD, // NOT USED 490 0x201A, // SINGLE LOW-9 QUOTATION MARK 491 0x0192, // LATIN SMALL LETTER F WITH HOOK 492 0x201E, // DOUBLE LOW-9 QUOTATION MARK 493 0x2026, // HORIZONTAL ELLIPSIS 494 0x2020, // DAGGER 495 0x2021, // DOUBLE DAGGER 496 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT 497 0x2030, // PER MILLE SIGN 498 0x0160, // LATIN CAPITAL LETTER S WITH CARON 499 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK 500 0x0152, // LATIN CAPITAL LIGATURE OE 501 0xFFFD, // NOT USED 502 0x017D, // LATIN CAPITAL LETTER Z WITH CARON 503 0xFFFD, // NOT USED 504 0xFFFD, // NOT USED 505 0x2018, // LEFT SINGLE QUOTATION MARK 506 0x2019, // RIGHT SINGLE QUOTATION MARK 507 0x201C, // LEFT DOUBLE QUOTATION MARK 508 0x201D, // RIGHT DOUBLE QUOTATION MARK 509 0x2022, // BULLET 510 0x2013, // EN DASH 511 0x2014, // EM DASH 512 0x02DC, // SMALL TILDE 513 0x2122, // TRADE MARK SIGN 514 0x0161, // LATIN SMALL LETTER S WITH CARON 515 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 516 0x0153, // LATIN SMALL LIGATURE OE 517 0xFFFD, // NOT USED 518 0x017E, // LATIN SMALL LETTER Z WITH CARON 519 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS 520}; 521 522static bool __CFFromWinLatin1(uint32_t flags, uint8_t byte, UniChar *character) { 523 *character = (byte < 0x80 || byte > 0x9F ? (UniChar)byte : cp1252_to_uni[byte - 0x80]); 524 return (*character != 0xFFFD); 525} 526 527static CFIndex __CFToWinLatin1Precompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { 528 uint8_t byte; 529 CFIndex usedCharLen; 530 531 if (__CFToWinLatin1(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) { 532 if (maxByteLen) *bytes = byte; 533 *usedByteLen = 1; 534 return usedCharLen; 535 } else { 536 return 0; 537 } 538} 539 540CF_PRIVATE const CFStringEncodingConverter __CFConverterWinLatin1 = { 541 __CFToWinLatin1, __CFFromWinLatin1, 1, 1, kCFStringEncodingConverterCheapEightBit, 542 NULL, NULL, NULL, NULL, __CFToWinLatin1Precompose, CFStringEncodingIsValidCombiningCharacterForLatin1, 543}; 544 545/* NEXTSTEP Encoding */ 546#define NUM_NEXTSTEP_FROM_UNI 127 547 548static const CFStringEncodingUnicodeTo8BitCharMap nextstep_from_tab[NUM_NEXTSTEP_FROM_UNI] = { 549 { 0x00a0, 0x80 }, 550 { 0x00a1, 0xa1 }, 551 { 0x00a2, 0xa2 }, 552 { 0x00a3, 0xa3 }, 553 { 0x00a4, 0xa8 }, 554 { 0x00a5, 0xa5 }, 555 { 0x00a6, 0xb5 }, 556 { 0x00a7, 0xa7 }, 557 { 0x00a8, 0xc8 }, 558 { 0x00a9, 0xa0 }, 559 { 0x00aa, 0xe3 }, 560 { 0x00ab, 0xab }, 561 { 0x00ac, 0xbe }, 562/* { 0x00ad, 0x2d }, <= 96/10/25 rick removed; converts soft-hyphen to hyphen! */ 563 { 0x00ae, 0xb0 }, 564 { 0x00af, 0xc5 }, 565 { 0x00b1, 0xd1 }, 566 { 0x00b2, 0xc9 }, 567 { 0x00b3, 0xcc }, 568 { 0x00b4, 0xc2 }, 569 { 0x00b5, 0x9d }, 570 { 0x00b6, 0xb6 }, 571 { 0x00b7, 0xb4 }, 572 { 0x00b8, 0xcb }, 573 { 0x00b9, 0xc0 }, 574 { 0x00ba, 0xeb }, 575 { 0x00bb, 0xbb }, 576 { 0x00bc, 0xd2 }, 577 { 0x00bd, 0xd3 }, 578 { 0x00be, 0xd4 }, 579 { 0x00bf, 0xbf }, 580 { 0x00c0, 0x81 }, 581 { 0x00c1, 0x82 }, 582 { 0x00c2, 0x83 }, 583 { 0x00c3, 0x84 }, 584 { 0x00c4, 0x85 }, 585 { 0x00c5, 0x86 }, 586 { 0x00c6, 0xe1 }, 587 { 0x00c7, 0x87 }, 588 { 0x00c8, 0x88 }, 589 { 0x00c9, 0x89 }, 590 { 0x00ca, 0x8a }, 591 { 0x00cb, 0x8b }, 592 { 0x00cc, 0x8c }, 593 { 0x00cd, 0x8d }, 594 { 0x00ce, 0x8e }, 595 { 0x00cf, 0x8f }, 596 { 0x00d0, 0x90 }, 597 { 0x00d1, 0x91 }, 598 { 0x00d2, 0x92 }, 599 { 0x00d3, 0x93 }, 600 { 0x00d4, 0x94 }, 601 { 0x00d5, 0x95 }, 602 { 0x00d6, 0x96 }, 603 { 0x00d7, 0x9e }, 604 { 0x00d8, 0xe9 }, 605 { 0x00d9, 0x97 }, 606 { 0x00da, 0x98 }, 607 { 0x00db, 0x99 }, 608 { 0x00dc, 0x9a }, 609 { 0x00dd, 0x9b }, 610 { 0x00de, 0x9c }, 611 { 0x00df, 0xfb }, 612 { 0x00e0, 0xd5 }, 613 { 0x00e1, 0xd6 }, 614 { 0x00e2, 0xd7 }, 615 { 0x00e3, 0xd8 }, 616 { 0x00e4, 0xd9 }, 617 { 0x00e5, 0xda }, 618 { 0x00e6, 0xf1 }, 619 { 0x00e7, 0xdb }, 620 { 0x00e8, 0xdc }, 621 { 0x00e9, 0xdd }, 622 { 0x00ea, 0xde }, 623 { 0x00eb, 0xdf }, 624 { 0x00ec, 0xe0 }, 625 { 0x00ed, 0xe2 }, 626 { 0x00ee, 0xe4 }, 627 { 0x00ef, 0xe5 }, 628 { 0x00f0, 0xe6 }, 629 { 0x00f1, 0xe7 }, 630 { 0x00f2, 0xec }, 631 { 0x00f3, 0xed }, 632 { 0x00f4, 0xee }, 633 { 0x00f5, 0xef }, 634 { 0x00f6, 0xf0 }, 635 { 0x00f7, 0x9f }, 636 { 0x00f8, 0xf9 }, 637 { 0x00f9, 0xf2 }, 638 { 0x00fa, 0xf3 }, 639 { 0x00fb, 0xf4 }, 640 { 0x00fc, 0xf6 }, 641 { 0x00fd, 0xf7 }, 642 { 0x00fe, 0xfc }, 643 { 0x00ff, 0xfd }, 644 { 0x0131, 0xf5 }, 645 { 0x0141, 0xe8 }, 646 { 0x0142, 0xf8 }, 647 { 0x0152, 0xea }, 648 { 0x0153, 0xfa }, 649 { 0x0192, 0xa6 }, 650 { 0x02c6, 0xc3 }, 651 { 0x02c7, 0xcf }, 652 { 0x02cb, 0xc1 }, 653 { 0x02d8, 0xc6 }, 654 { 0x02d9, 0xc7 }, 655 { 0x02da, 0xca }, 656 { 0x02db, 0xce }, 657 { 0x02dc, 0xc4 }, 658 { 0x02dd, 0xcd }, 659 { 0x2013, 0xb1 }, 660 { 0x2014, 0xd0 }, 661 { 0x2019, 0xa9 }, 662 { 0x201a, 0xb8 }, 663 { 0x201c, 0xaa }, 664 { 0x201d, 0xba }, 665 { 0x201e, 0xb9 }, 666 { 0x2020, 0xb2 }, 667 { 0x2021, 0xb3 }, 668 { 0x2022, 0xb7 }, 669 { 0x2026, 0xbc }, 670 { 0x2030, 0xbd }, 671 { 0x2039, 0xac }, 672 { 0x203a, 0xad }, 673 { 0x2044, 0xa4 }, 674 { 0xfb01, 0xae }, 675 { 0xfb02, 0xaf }, 676 { 0xfffd, 0xff }, 677}; 678 679static bool __CFToNextStepLatin(uint32_t flags, UniChar character, uint8_t *byte) { 680 if (character < 0x80) { 681 *byte = (uint8_t)character; 682 return true; 683 } else if (__CFIsParagraphSeparator(character)) { 684 *byte = ASCIINewLine; 685 return true; 686 } else { 687 return CFStringEncodingUnicodeTo8BitEncoding(nextstep_from_tab, NUM_NEXTSTEP_FROM_UNI, character, byte); 688 } 689}; 690 691static const UniChar NSToPrecompUnicodeTable[128] = { 692 /* NextStep Encoding Unicode */ 693 /* 128 figspace */ 0x00a0, /* 0x2007 is fig space */ 694 /* 129 Agrave */ 0x00c0, 695 /* 130 Aacute */ 0x00c1, 696 /* 131 Acircumflex */ 0x00c2, 697 /* 132 Atilde */ 0x00c3, 698 /* 133 Adieresis */ 0x00c4, 699 /* 134 Aring */ 0x00c5, 700 /* 135 Ccedilla */ 0x00c7, 701 /* 136 Egrave */ 0x00c8, 702 /* 137 Eacute */ 0x00c9, 703 /* 138 Ecircumflex */ 0x00ca, 704 /* 139 Edieresis */ 0x00cb, 705 /* 140 Igrave */ 0x00cc, 706 /* 141 Iacute */ 0x00cd, 707 /* 142 Icircumflex */ 0x00ce, 708 /* 143 Idieresis */ 0x00cf, 709 /* 144 Eth */ 0x00d0, 710 /* 145 Ntilde */ 0x00d1, 711 /* 146 Ograve */ 0x00d2, 712 /* 147 Oacute */ 0x00d3, 713 /* 148 Ocircumflex */ 0x00d4, 714 /* 149 Otilde */ 0x00d5, 715 /* 150 Odieresis */ 0x00d6, 716 /* 151 Ugrave */ 0x00d9, 717 /* 152 Uacute */ 0x00da, 718 /* 153 Ucircumflex */ 0x00db, 719 /* 154 Udieresis */ 0x00dc, 720 /* 155 Yacute */ 0x00dd, 721 /* 156 Thorn */ 0x00de, 722 /* 157 mu */ 0x00b5, 723 /* 158 multiply */ 0x00d7, 724 /* 159 divide */ 0x00f7, 725 /* 160 copyright */ 0x00a9, 726 /* 161 exclamdown */ 0x00a1, 727 /* 162 cent */ 0x00a2, 728 /* 163 sterling */ 0x00a3, 729 /* 164 fraction */ 0x2044, 730 /* 165 yen */ 0x00a5, 731 /* 166 florin */ 0x0192, 732 /* 167 section */ 0x00a7, 733 /* 168 currency */ 0x00a4, 734 /* 169 quotesingle */ 0x2019, 735 /* 170 quotedblleft */ 0x201c, 736 /* 171 guillemotleft */ 0x00ab, 737 /* 172 guilsinglleft */ 0x2039, 738 /* 173 guilsinglright */ 0x203a, 739 /* 174 fi */ 0xFB01, 740 /* 175 fl */ 0xFB02, 741 /* 176 registered */ 0x00ae, 742 /* 177 endash */ 0x2013, 743 /* 178 dagger */ 0x2020, 744 /* 179 daggerdbl */ 0x2021, 745 /* 180 periodcentered */ 0x00b7, 746 /* 181 brokenbar */ 0x00a6, 747 /* 182 paragraph */ 0x00b6, 748 /* 183 bullet */ 0x2022, 749 /* 184 quotesinglbase */ 0x201a, 750 /* 185 quotedblbase */ 0x201e, 751 /* 186 quotedblright */ 0x201d, 752 /* 187 guillemotright */ 0x00bb, 753 /* 188 ellipsis */ 0x2026, 754 /* 189 perthousand */ 0x2030, 755 /* 190 logicalnot */ 0x00ac, 756 /* 191 questiondown */ 0x00bf, 757 /* 192 onesuperior */ 0x00b9, 758 /* 193 grave */ 0x02cb, 759 /* 194 acute */ 0x00b4, 760 /* 195 circumflex */ 0x02c6, 761 /* 196 tilde */ 0x02dc, 762 /* 197 macron */ 0x00af, 763 /* 198 breve */ 0x02d8, 764 /* 199 dotaccent */ 0x02d9, 765 /* 200 dieresis */ 0x00a8, 766 /* 201 twosuperior */ 0x00b2, 767 /* 202 ring */ 0x02da, 768 /* 203 cedilla */ 0x00b8, 769 /* 204 threesuperior */ 0x00b3, 770 /* 205 hungarumlaut */ 0x02dd, 771 /* 206 ogonek */ 0x02db, 772 /* 207 caron */ 0x02c7, 773 /* 208 emdash */ 0x2014, 774 /* 209 plusminus */ 0x00b1, 775 /* 210 onequarter */ 0x00bc, 776 /* 211 onehalf */ 0x00bd, 777 /* 212 threequarters */ 0x00be, 778 /* 213 agrave */ 0x00e0, 779 /* 214 aacute */ 0x00e1, 780 /* 215 acircumflex */ 0x00e2, 781 /* 216 atilde */ 0x00e3, 782 /* 217 adieresis */ 0x00e4, 783 /* 218 aring */ 0x00e5, 784 /* 219 ccedilla */ 0x00e7, 785 /* 220 egrave */ 0x00e8, 786 /* 221 eacute */ 0x00e9, 787 /* 222 ecircumflex */ 0x00ea, 788 /* 223 edieresis */ 0x00eb, 789 /* 224 igrave */ 0x00ec, 790 /* 225 AE */ 0x00c6, 791 /* 226 iacute */ 0x00ed, 792 /* 227 ordfeminine */ 0x00aa, 793 /* 228 icircumflex */ 0x00ee, 794 /* 229 idieresis */ 0x00ef, 795 /* 230 eth */ 0x00f0, 796 /* 231 ntilde */ 0x00f1, 797 /* 232 Lslash */ 0x0141, 798 /* 233 Oslash */ 0x00d8, 799 /* 234 OE */ 0x0152, 800 /* 235 ordmasculine */ 0x00ba, 801 /* 236 ograve */ 0x00f2, 802 /* 237 oacute */ 0x00f3, 803 /* 238 ocircumflex */ 0x00f4, 804 /* 239 otilde */ 0x00f5, 805 /* 240 odieresis */ 0x00f6, 806 /* 241 ae */ 0x00e6, 807 /* 242 ugrave */ 0x00f9, 808 /* 243 uacute */ 0x00fa, 809 /* 244 ucircumflex */ 0x00fb, 810 /* 245 dotlessi */ 0x0131, 811 /* 246 udieresis */ 0x00fc, 812 /* 247 yacute */ 0x00fd, 813 /* 248 lslash */ 0x0142, 814 /* 249 oslash */ 0x00f8, 815 /* 250 oe */ 0x0153, 816 /* 251 germandbls */ 0x00df, 817 /* 252 thorn */ 0x00fe, 818 /* 253 ydieresis */ 0x00ff, 819 /* 254 .notdef */ 0xFFFD, 820 /* 255 .notdef */ 0xFFFD 821}; 822 823static bool __CFFromNextStepLatin(uint32_t flags, uint8_t byte, UniChar *character) { 824 return ((*character = (byte < 0x80 ? (UniChar)byte : NSToPrecompUnicodeTable[byte - 0x80])) != 0xFFFD); 825} 826 827static CFIndex __CFToNextStepLatinPrecompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { 828 uint8_t byte; 829 CFIndex usedCharLen; 830 831 if (__CFToNextStepLatin(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) { 832 if (maxByteLen) *bytes = byte; 833 *usedByteLen = 1; 834 return usedCharLen; 835 } else { 836 return 0; 837 } 838} 839 840CF_PRIVATE const CFStringEncodingConverter __CFConverterNextStepLatin = { 841 __CFToNextStepLatin, __CFFromNextStepLatin, 1, 1, kCFStringEncodingConverterCheapEightBit, 842 NULL, NULL, NULL, NULL, __CFToNextStepLatinPrecompose, CFStringEncodingIsValidCombiningCharacterForLatin1, 843}; 844 845/* UTF8 */ 846/* 847 * Copyright 2001 Unicode, Inc. 848 * 849 * Disclaimer 850 * 851 * This source code is provided as is by Unicode, Inc. No claims are 852 * made as to fitness for any particular purpose. No warranties of any 853 * kind are expressed or implied. The recipient agrees to determine 854 * applicability of information provided. If this file has been 855 * purchased on magnetic or optical media from Unicode, Inc., the 856 * sole remedy for any claim will be exchange of defective media 857 * within 90 days of receipt. 858 * 859 * Limitations on Rights to Redistribute This Code 860 * 861 * Unicode, Inc. hereby grants the right to freely use the information 862 * supplied in this file in the creation of products supporting the 863 * Unicode Standard, and to make copies of this file in any form 864 * for internal or external distribution as long as this notice 865 * remains attached. 866 */ 867 868static const uint32_t kReplacementCharacter = 0x0000FFFDUL; 869static const uint32_t kMaximumUCS2 = 0x0000FFFFUL; 870static const uint32_t kMaximumUTF16 = 0x0010FFFFUL; 871static const uint32_t kMaximumUCS4 = 0x7FFFFFFFUL; 872 873static const int halfShift = 10; 874static const uint32_t halfBase = 0x0010000UL; 875static const uint32_t halfMask = 0x3FFUL; 876static const uint32_t kSurrogateHighStart = 0xD800UL; 877static const uint32_t kSurrogateHighEnd = 0xDBFFUL; 878static const uint32_t kSurrogateLowStart = 0xDC00UL; 879static const uint32_t kSurrogateLowEnd = 0xDFFFUL; 880 881/* 882 * Index into the table below with the first byte of a UTF-8 sequence to 883 * get the number of trailing bytes that are supposed to follow it. 884 */ 885static const char trailingBytesForUTF8[256] = { 886 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 887 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 888 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 889 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 890 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 891 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 892 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 893 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 894}; 895 896/* 897 * Magic values subtracted from a buffer value during UTF8 conversion. 898 * This table contains as many values as there might be trailing bytes 899 * in a UTF-8 sequence. 900 */ 901static const UTF32Char offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 902 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; 903 904static const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 905 906/* This code is similar in effect to making successive calls on the mbtowc and wctomb routines in FSS-UTF. However, it is considerably different in code: 907 * it is adapted to be consistent with UTF16, 908 * constants have been gathered. 909 * loops & conditionals have been removed as much as possible for 910 * efficiency, in favor of drop-through switch statements. 911*/ 912 913CF_INLINE uint16_t __CFUTF8BytesToWriteForCharacter(uint32_t ch) { 914 if (ch < 0x80) return 1; 915 else if (ch < 0x800) return 2; 916 else if (ch < 0x10000) return 3; 917 else if (ch < 0x200000) return 4; 918 else if (ch < 0x4000000) return 5; 919 else if (ch <= kMaximumUCS4) return 6; 920 else return 0; 921} 922 923CF_INLINE uint16_t __CFToUTF8Core(uint32_t ch, uint8_t *bytes, uint32_t maxByteLen) { 924 uint16_t bytesToWrite = __CFUTF8BytesToWriteForCharacter(ch); 925 const uint32_t byteMask = 0xBF; 926 const uint32_t byteMark = 0x80; 927 928 if (!bytesToWrite) { 929 bytesToWrite = 2; 930 ch = kReplacementCharacter; 931 } 932 933 if (maxByteLen < bytesToWrite) return 0; 934 935 switch (bytesToWrite) { /* note: code falls through cases! */ 936 case 6: bytes[5] = (ch | byteMark) & byteMask; ch >>= 6; 937 case 5: bytes[4] = (ch | byteMark) & byteMask; ch >>= 6; 938 case 4: bytes[3] = (ch | byteMark) & byteMask; ch >>= 6; 939 case 3: bytes[2] = (ch | byteMark) & byteMask; ch >>= 6; 940 case 2: bytes[1] = (ch | byteMark) & byteMask; ch >>= 6; 941 case 1: bytes[0] = ch | firstByteMark[bytesToWrite]; 942 } 943 return bytesToWrite; 944} 945 946static CFIndex __CFToUTF8(uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { 947 uint16_t bytesWritten; 948 uint32_t ch; 949 const UniChar *beginCharacter = characters; 950 const UniChar *endCharacter = characters + numChars; 951 const uint8_t *beginBytes = bytes; 952 const uint8_t *endBytes = bytes + maxByteLen; 953 bool isStrict = (flags & kCFStringEncodingUseHFSPlusCanonical ? false : true); 954 955 while ((characters < endCharacter) && (!maxByteLen || (bytes < endBytes))) { 956 ch = *(characters++); 957 958 if (ch < 0x80) { // ASCII 959 if (maxByteLen) *bytes = ch; 960 ++bytes; 961 } else { 962 if (ch >= kSurrogateHighStart) { 963 if (ch <= kSurrogateHighEnd) { 964 if ((characters < endCharacter) && ((*characters >= kSurrogateLowStart) && (*characters <= kSurrogateLowEnd))) { 965 ch = ((ch - kSurrogateHighStart) << halfShift) + (*(characters++) - kSurrogateLowStart) + halfBase; 966 } else if (isStrict) { 967 --characters; 968 break; 969 } 970 } else if (isStrict && (ch <= kSurrogateLowEnd)) { 971 --characters; 972 break; 973 } 974 } 975 976 if (!(bytesWritten = (maxByteLen ? __CFToUTF8Core(ch, bytes, endBytes - bytes) : __CFUTF8BytesToWriteForCharacter(ch)))) { 977 characters -= (ch < 0x10000 ? 1 : 2); 978 break; 979 } 980 bytes += bytesWritten; 981 } 982 } 983 984 if (usedByteLen) *usedByteLen = bytes - beginBytes; 985 return characters - beginCharacter; 986} 987 988/* 989 * Utility routine to tell whether a sequence of bytes is legal UTF-8. 990 * This must be called with the length pre-determined by the first byte. 991 * If not calling this from ConvertUTF8to*, then the length can be set by: 992 * length = trailingBytesForUTF8[*source]+1; 993 * and the sequence is illegal right away if there aren't that many bytes 994 * available. 995 * If presented with a length > 4, this returns false. The Unicode 996 * definition of UTF-8 goes up to 4-byte sequences. 997 */ 998 999CF_INLINE bool __CFIsLegalUTF8(const uint8_t *source, CFIndex length) { 1000 if (length > 4) return false; 1001 1002 const uint8_t *srcptr = source+length; 1003 uint8_t head = *source; 1004 1005 while (--srcptr > source) if ((*srcptr & 0xC0) != 0x80) return false; 1006 1007 if (((head >= 0x80) && (head < 0xC2)) || (head > 0xF4)) return false; 1008 1009 if (((head == 0xE0) && (*(source + 1) < 0xA0)) || ((head == 0xED) && (*(source + 1) > 0x9F)) || ((head == 0xF0) && (*(source + 1) < 0x90)) || ((head == 0xF4) && (*(source + 1) > 0x8F))) return false; 1010 return true; 1011} 1012 1013static CFIndex __CFFromUTF8(uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { 1014 const uint8_t *source = bytes; 1015 uint16_t extraBytesToRead; 1016 CFIndex theUsedCharLen = 0; 1017 uint32_t ch; 1018 bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); 1019 bool needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false); 1020 bool strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true); 1021 UTF32Char decomposed[MAX_DECOMPOSED_LENGTH]; 1022 CFIndex decompLength; 1023 bool isStrict = !isHFSPlus; 1024 1025 while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) { 1026 extraBytesToRead = trailingBytesForUTF8[*source]; 1027 1028 if (extraBytesToRead > --numBytes) break; 1029 numBytes -= extraBytesToRead; 1030 1031 /* Do this check whether lenient or strict */ 1032 // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps 1033 // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release 1034 if ((extraBytesToRead > 3) || (strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1))) { 1035 if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) { 1036 numBytes += extraBytesToRead; 1037 ++source; 1038 if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter; 1039 ++theUsedCharLen; 1040 continue; 1041 } else { 1042 break; 1043 } 1044 } 1045 1046 ch = 0; 1047 /* 1048 * The cases all fall through. See "Note A" below. 1049 */ 1050 switch (extraBytesToRead) { 1051 case 3: ch += *source++; ch <<= 6; 1052 case 2: ch += *source++; ch <<= 6; 1053 case 1: ch += *source++; ch <<= 6; 1054 case 0: ch += *source++; 1055 } 1056 ch -= offsetsFromUTF8[extraBytesToRead]; 1057 1058 if (ch <= kMaximumUCS2) { 1059 if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) { 1060 source -= (extraBytesToRead + 1); 1061 break; 1062 } 1063 if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) { 1064 decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH); 1065 1066 if (maxCharLen) { 1067 if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, &theUsedCharLen, kCFUniCharUTF16Format)) break; 1068 } else { 1069 theUsedCharLen += decompLength; 1070 } 1071 } else { 1072 if (maxCharLen) *(characters++) = (UTF16Char)ch; 1073 ++theUsedCharLen; 1074 } 1075 } else if (ch > kMaximumUTF16) { 1076 if (isStrict) { 1077 source -= (extraBytesToRead + 1); 1078 break; 1079 } 1080 if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter; 1081 ++theUsedCharLen; 1082 } else { 1083 if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) { 1084 decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH); 1085 1086 if (maxCharLen) { 1087 if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, &theUsedCharLen, kCFUniCharUTF16Format)) break; 1088 } else { 1089 while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2); 1090 } 1091 } else { 1092 if (maxCharLen) { 1093 if ((theUsedCharLen + 2) > maxCharLen) break; 1094 ch -= halfBase; 1095 *(characters++) = (ch >> halfShift) + kSurrogateHighStart; 1096 *(characters++) = (ch & halfMask) + kSurrogateLowStart; 1097 } 1098 theUsedCharLen += 2; 1099 } 1100 } 1101 } 1102 1103 if (usedCharLen) *usedCharLen = theUsedCharLen; 1104 1105 return source - bytes; 1106} 1107 1108static CFIndex __CFToUTF8Len(uint32_t flags, const UniChar *characters, CFIndex numChars) { 1109 uint32_t bytesToWrite = 0; 1110 uint32_t ch; 1111 1112 while (numChars) { 1113 ch = *characters++; 1114 numChars--; 1115 if ((ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd) && numChars && (*characters >= kSurrogateLowStart && *characters <= kSurrogateLowEnd)) { 1116 ch = ((ch - kSurrogateHighStart) << halfShift) + (*characters++ - kSurrogateLowStart) + halfBase; 1117 numChars--; 1118 } 1119 bytesToWrite += __CFUTF8BytesToWriteForCharacter(ch); 1120 } 1121 1122 return bytesToWrite; 1123} 1124 1125static CFIndex __CFFromUTF8Len(uint32_t flags, const uint8_t *source, CFIndex numBytes) { 1126 uint16_t extraBytesToRead; 1127 CFIndex theUsedCharLen = 0; 1128 uint32_t ch; 1129 bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); 1130 bool needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false); 1131 bool strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true); 1132 UTF32Char decomposed[MAX_DECOMPOSED_LENGTH]; 1133 CFIndex decompLength; 1134 bool isStrict = !isHFSPlus; 1135 1136 while (numBytes) { 1137 extraBytesToRead = trailingBytesForUTF8[*source]; 1138 1139 if (extraBytesToRead > --numBytes) break; 1140 numBytes -= extraBytesToRead; 1141 1142 /* Do this check whether lenient or strict */ 1143 // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps 1144 // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release 1145 if ((extraBytesToRead > 3) || (strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1))) { 1146 if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) { 1147 numBytes += extraBytesToRead; 1148 ++source; 1149 ++theUsedCharLen; 1150 continue; 1151 } else { 1152 break; 1153 } 1154 } 1155 1156 1157 ch = 0; 1158 /* 1159 * The cases all fall through. See "Note A" below. 1160 */ 1161 switch (extraBytesToRead) { 1162 case 3: ch += *source++; ch <<= 6; 1163 case 2: ch += *source++; ch <<= 6; 1164 case 1: ch += *source++; ch <<= 6; 1165 case 0: ch += *source++; 1166 } 1167 ch -= offsetsFromUTF8[extraBytesToRead]; 1168 1169 if (ch <= kMaximumUCS2) { 1170 if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) { 1171 break; 1172 } 1173 if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) { 1174 decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH); 1175 theUsedCharLen += decompLength; 1176 } else { 1177 ++theUsedCharLen; 1178 } 1179 } else if (ch > kMaximumUTF16) { 1180 ++theUsedCharLen; 1181 } else { 1182 if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) { 1183 decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH); 1184 while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2); 1185 } else { 1186 theUsedCharLen += 2; 1187 } 1188 } 1189 } 1190 1191 return theUsedCharLen; 1192} 1193 1194CF_PRIVATE const CFStringEncodingConverter __CFConverterUTF8 = { 1195 __CFToUTF8, __CFFromUTF8, 3, 2, kCFStringEncodingConverterStandard, 1196 __CFToUTF8Len, __CFFromUTF8Len, NULL, NULL, NULL, NULL, 1197}; 1198