1/*
2 * Copyright (c) 2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24/*	CFBuiltinConverters.c
25	Copyright (c) 1999-2013, Apple Inc. All rights reserved.
26	Responsibility: Aki Inoue
27*/
28
29#include "CFStringEncodingConverterExt.h"
30#include "CFUniChar.h"
31#include "CFUnicodeDecomposition.h"
32#include "CFUnicodePrecomposition.h"
33#include "CFStringEncodingConverterPriv.h"
34#include "CFInternal.h"
35
/* Unicode value 0x2029, the character the encoders below may fold into a newline byte. */
36#define ParagraphSeparator 0x2029
/* Byte (0x0a) written in place of a paragraph separator when that folding is active. */
37#define ASCIINewLine 0x0a
/*
 * Lazily resolved switch read by __CFIsParagraphSeparator():
 * -1 = not decided yet, 0 = do not map paragraph separators, non-zero = map them.
 */
38static int8_t __CFMapsParagraphSeparator = -1;
39
40CF_INLINE bool __CFIsParagraphSeparator(UTF16Char character) {
41    if (-1 == __CFMapsParagraphSeparator) __CFMapsParagraphSeparator = (1 ? false : true);
42
43    return ((__CFMapsParagraphSeparator && (ParagraphSeparator == character)) ? true : false);
44}
45
46/* Precomposition */
/*
 * One bit per character for the 96 values 0x300..0x35F (three 32-bit words).
 * CFStringEncodingIsValidCombiningCharacterForLatin1() reads it most-significant
 * bit first: bit 31 of word 0 corresponds to 0x300.
 */
47static const uint32_t __CFLatin1CombiningCharBitmap[] = { // 0x300 ~ 0x35F
48    0xFBB94010, 0x01800000, 0x0000000,
49};
50
51bool CFStringEncodingIsValidCombiningCharacterForLatin1(UniChar character) {
52    return ((character >= 0x300) && (character < 0x360) && (__CFLatin1CombiningCharBitmap[(character - 0x300) / 32] & (1 << (31 - ((character - 0x300) % 32)))) ? true : false);
53}
54
55UniChar CFStringEncodingPrecomposeLatinCharacter(const UniChar *character, CFIndex numChars, CFIndex *usedChars) {
56    if (numChars > 0) {
57        UTF32Char ch = *(character++), nextCh, composedChar;
58        CFIndex usedCharLen = 1;
59
60        if (CFUniCharIsSurrogateHighCharacter(ch) || CFUniCharIsSurrogateLowCharacter(ch)) {
61            if (usedChars) (*usedChars) = usedCharLen;
62            return ch;
63        }
64
65        while (usedCharLen < numChars) {
66            nextCh = *(character++);
67
68            if (CFUniCharIsSurrogateHighCharacter(nextCh) || CFUniCharIsSurrogateLowCharacter(nextCh)) break;
69
70            if (CFUniCharIsMemberOf(nextCh, kCFUniCharNonBaseCharacterSet) && ((composedChar = CFUniCharPrecomposeCharacter(ch, nextCh)) != 0xFFFD)) {
71                if (composedChar > 0xFFFF) { // Non-base
72                    break;
73                } else {
74                    ch = composedChar;
75                }
76            } else {
77                break;
78            }
79            ++usedCharLen;
80        }
81        if (usedChars) (*usedChars) = usedCharLen;
82        if (usedCharLen > 1) return ch;
83    }
84    return 0xFFFD;
85}
86
87/* ASCII */
88static bool __CFToASCII(uint32_t flags, UniChar character, uint8_t *byte) {
89    if (character < 0x80) {
90        *byte = (uint8_t)character;
91    } else if (__CFIsParagraphSeparator(character)) {
92        *byte = ASCIINewLine;
93    } else {
94        return false;
95    }
96    return true;
97}
98
99static bool __CFFromASCII(uint32_t flags, uint8_t byte, UniChar *character) {
100    if (byte < 0x80) {
101        *character = (UniChar)byte;
102        return true;
103    } else {
104        return false;
105    }
106}
107
108
/*
 * Converter descriptor for ASCII.  The initializer is positional: the first two
 * slots are the Unicode->byte and byte->Unicode callbacks, followed by two 1s
 * and the kCFStringEncodingConverterCheapEightBit tag; the remaining slots are
 * unused (NULL).
 * NOTE(review): slot meanings come from CFStringEncodingConverter's declaration,
 * which is not visible in this file — confirm there before editing.
 */
109CF_PRIVATE const CFStringEncodingConverter __CFConverterASCII = {
110    __CFToASCII, __CFFromASCII, 1, 1, kCFStringEncodingConverterCheapEightBit,
111    NULL, NULL, NULL, NULL, NULL, NULL,
112};
113
114/* ISO Latin 1 (8859-1) */
115static bool __CFToISOLatin1(uint32_t flags, UniChar character, uint8_t *byte) {
116    if (character <= 0xFF) {
117        *byte = (uint8_t)character;
118    } else if (__CFIsParagraphSeparator(character)) {
119        *byte = ASCIINewLine;
120    } else {
121        return false;
122    }
123
124    return true;
125}
126
127static bool __CFFromISOLatin1(uint32_t flags, uint8_t byte, UniChar *character) {
128    *character = (UniChar)byte;
129    return true;
130}
131
132static CFIndex __CFToISOLatin1Precompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
133    uint8_t byte;
134    CFIndex usedCharLen;
135
136    if (__CFToISOLatin1(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) {
137        if (maxByteLen) *bytes = byte;
138        *usedByteLen = 1;
139        return usedCharLen;
140    } else {
141        return 0;
142    }
143}
144
/*
 * Converter descriptor for ISO Latin 1.  The initializer is positional: the
 * first two slots are the Unicode->byte and byte->Unicode callbacks, followed
 * by two 1s and the kCFStringEncodingConverterCheapEightBit tag.  The last two
 * slots carry the precomposing encoder and the combining-character predicate.
 * NOTE(review): slot meanings come from CFStringEncodingConverter's declaration,
 * which is not visible in this file — confirm there before editing.
 */
145CF_PRIVATE const CFStringEncodingConverter __CFConverterISOLatin1 = {
146    __CFToISOLatin1, __CFFromISOLatin1, 1, 1, kCFStringEncodingConverterCheapEightBit,
147    NULL, NULL, NULL, NULL, __CFToISOLatin1Precompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
148};
149
150/* Mac Roman */
/*
 * Unicode -> Mac Roman table, passed by __CFToMacRoman() to
 * CFStringEncodingUnicodeTo8BitEncoding() for characters >= 0x80.
 * Each entry is { Unicode value, Mac Roman byte }.  Entries are listed in
 * ascending Unicode order and NUM_MACROMAN_FROM_UNI must match the entry count.
 * NOTE(review): the ascending order is presumably required by the lookup helper
 * (defined elsewhere) — confirm before reordering or inserting entries.
 */
151#define NUM_MACROMAN_FROM_UNI 129
152static const CFStringEncodingUnicodeTo8BitCharMap macRoman_from_uni[NUM_MACROMAN_FROM_UNI] = {
153    { 0x00A0, 0xCA }, /* NO-BREAK SPACE */
154    { 0x00A1, 0xC1 }, /* INVERTED EXCLAMATION MARK */
155    { 0x00A2, 0xA2 }, /* CENT SIGN */
156    { 0x00A3, 0xA3 }, /* POUND SIGN */
157    { 0x00A5, 0xB4 }, /* YEN SIGN */
158    { 0x00A7, 0xA4 }, /* SECTION SIGN */
159    { 0x00A8, 0xAC }, /* DIAERESIS */
160    { 0x00A9, 0xA9 }, /* COPYRIGHT SIGN */
161    { 0x00AA, 0xBB }, /* FEMININE ORDINAL INDICATOR */
162    { 0x00AB, 0xC7 }, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
163    { 0x00AC, 0xC2 }, /* NOT SIGN */
164    { 0x00AE, 0xA8 }, /* REGISTERED SIGN */
165    { 0x00AF, 0xF8 }, /* MACRON */
166    { 0x00B0, 0xA1 }, /* DEGREE SIGN */
167    { 0x00B1, 0xB1 }, /* PLUS-MINUS SIGN */
168    { 0x00B4, 0xAB }, /* ACUTE ACCENT */
169    { 0x00B5, 0xB5 }, /* MICRO SIGN */
170    { 0x00B6, 0xA6 }, /* PILCROW SIGN */
171    { 0x00B7, 0xE1 }, /* MIDDLE DOT */
172    { 0x00B8, 0xFC }, /* CEDILLA */
173    { 0x00BA, 0xBC }, /* MASCULINE ORDINAL INDICATOR */
174    { 0x00BB, 0xC8 }, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
175    { 0x00BF, 0xC0 }, /* INVERTED QUESTION MARK */
176    { 0x00C0, 0xCB }, /* LATIN CAPITAL LETTER A WITH GRAVE */
177    { 0x00C1, 0xE7 }, /* LATIN CAPITAL LETTER A WITH ACUTE */
178    { 0x00C2, 0xE5 }, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
179    { 0x00C3, 0xCC }, /* LATIN CAPITAL LETTER A WITH TILDE */
180    { 0x00C4, 0x80 }, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
181    { 0x00C5, 0x81 }, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
182    { 0x00C6, 0xAE }, /* LATIN CAPITAL LIGATURE AE */
183    { 0x00C7, 0x82 }, /* LATIN CAPITAL LETTER C WITH CEDILLA */
184    { 0x00C8, 0xE9 }, /* LATIN CAPITAL LETTER E WITH GRAVE */
185    { 0x00C9, 0x83 }, /* LATIN CAPITAL LETTER E WITH ACUTE */
186    { 0x00CA, 0xE6 }, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
187    { 0x00CB, 0xE8 }, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
188    { 0x00CC, 0xED }, /* LATIN CAPITAL LETTER I WITH GRAVE */
189    { 0x00CD, 0xEA }, /* LATIN CAPITAL LETTER I WITH ACUTE */
190    { 0x00CE, 0xEB }, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
191    { 0x00CF, 0xEC }, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
192    { 0x00D1, 0x84 }, /* LATIN CAPITAL LETTER N WITH TILDE */
193    { 0x00D2, 0xF1 }, /* LATIN CAPITAL LETTER O WITH GRAVE */
194    { 0x00D3, 0xEE }, /* LATIN CAPITAL LETTER O WITH ACUTE */
195    { 0x00D4, 0xEF }, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
196    { 0x00D5, 0xCD }, /* LATIN CAPITAL LETTER O WITH TILDE */
197    { 0x00D6, 0x85 }, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
198    { 0x00D8, 0xAF }, /* LATIN CAPITAL LETTER O WITH STROKE */
199    { 0x00D9, 0xF4 }, /* LATIN CAPITAL LETTER U WITH GRAVE */
200    { 0x00DA, 0xF2 }, /* LATIN CAPITAL LETTER U WITH ACUTE */
201    { 0x00DB, 0xF3 }, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
202    { 0x00DC, 0x86 }, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
203    { 0x00DF, 0xA7 }, /* LATIN SMALL LETTER SHARP S */
204    { 0x00E0, 0x88 }, /* LATIN SMALL LETTER A WITH GRAVE */
205    { 0x00E1, 0x87 }, /* LATIN SMALL LETTER A WITH ACUTE */
206    { 0x00E2, 0x89 }, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
207    { 0x00E3, 0x8B }, /* LATIN SMALL LETTER A WITH TILDE */
208    { 0x00E4, 0x8A }, /* LATIN SMALL LETTER A WITH DIAERESIS */
209    { 0x00E5, 0x8C }, /* LATIN SMALL LETTER A WITH RING ABOVE */
210    { 0x00E6, 0xBE }, /* LATIN SMALL LIGATURE AE */
211    { 0x00E7, 0x8D }, /* LATIN SMALL LETTER C WITH CEDILLA */
212    { 0x00E8, 0x8F }, /* LATIN SMALL LETTER E WITH GRAVE */
213    { 0x00E9, 0x8E }, /* LATIN SMALL LETTER E WITH ACUTE */
214    { 0x00EA, 0x90 }, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
215    { 0x00EB, 0x91 }, /* LATIN SMALL LETTER E WITH DIAERESIS */
216    { 0x00EC, 0x93 }, /* LATIN SMALL LETTER I WITH GRAVE */
217    { 0x00ED, 0x92 }, /* LATIN SMALL LETTER I WITH ACUTE */
218    { 0x00EE, 0x94 }, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
219    { 0x00EF, 0x95 }, /* LATIN SMALL LETTER I WITH DIAERESIS */
220    { 0x00F1, 0x96 }, /* LATIN SMALL LETTER N WITH TILDE */
221    { 0x00F2, 0x98 }, /* LATIN SMALL LETTER O WITH GRAVE */
222    { 0x00F3, 0x97 }, /* LATIN SMALL LETTER O WITH ACUTE */
223    { 0x00F4, 0x99 }, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
224    { 0x00F5, 0x9B }, /* LATIN SMALL LETTER O WITH TILDE */
225    { 0x00F6, 0x9A }, /* LATIN SMALL LETTER O WITH DIAERESIS */
226    { 0x00F7, 0xD6 }, /* DIVISION SIGN */
227    { 0x00F8, 0xBF }, /* LATIN SMALL LETTER O WITH STROKE */
228    { 0x00F9, 0x9D }, /* LATIN SMALL LETTER U WITH GRAVE */
229    { 0x00FA, 0x9C }, /* LATIN SMALL LETTER U WITH ACUTE */
230    { 0x00FB, 0x9E }, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
231    { 0x00FC, 0x9F }, /* LATIN SMALL LETTER U WITH DIAERESIS */
232    { 0x00FF, 0xD8 }, /* LATIN SMALL LETTER Y WITH DIAERESIS */
233    { 0x0131, 0xF5 }, /* LATIN SMALL LETTER DOTLESS I */
234    { 0x0152, 0xCE }, /* LATIN CAPITAL LIGATURE OE */
235    { 0x0153, 0xCF }, /* LATIN SMALL LIGATURE OE */
236    { 0x0178, 0xD9 }, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
237    { 0x0192, 0xC4 }, /* LATIN SMALL LETTER F WITH HOOK */
238    { 0x02C6, 0xF6 }, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
239    { 0x02C7, 0xFF }, /* CARON */
240    { 0x02D8, 0xF9 }, /* BREVE */
241    { 0x02D9, 0xFA }, /* DOT ABOVE */
242    { 0x02DA, 0xFB }, /* RING ABOVE */
243    { 0x02DB, 0xFE }, /* OGONEK */
244    { 0x02DC, 0xF7 }, /* SMALL TILDE */
245    { 0x02DD, 0xFD }, /* DOUBLE ACUTE ACCENT */
246    { 0x03A9, 0xBD }, /* GREEK CAPITAL LETTER OMEGA (canonical equivalent of OHM SIGN; shares byte 0xBD with 0x2126) */
247    { 0x03C0, 0xB9 }, /* GREEK SMALL LETTER PI */
248    { 0x2013, 0xD0 }, /* EN DASH */
249    { 0x2014, 0xD1 }, /* EM DASH */
250    { 0x2018, 0xD4 }, /* LEFT SINGLE QUOTATION MARK */
251    { 0x2019, 0xD5 }, /* RIGHT SINGLE QUOTATION MARK */
252    { 0x201A, 0xE2 }, /* SINGLE LOW-9 QUOTATION MARK */
253    { 0x201C, 0xD2 }, /* LEFT DOUBLE QUOTATION MARK */
254    { 0x201D, 0xD3 }, /* RIGHT DOUBLE QUOTATION MARK */
255    { 0x201E, 0xE3 }, /* DOUBLE LOW-9 QUOTATION MARK */
256    { 0x2020, 0xA0 }, /* DAGGER */
257    { 0x2021, 0xE0 }, /* DOUBLE DAGGER */
258    { 0x2022, 0xA5 }, /* BULLET */
259    { 0x2026, 0xC9 }, /* HORIZONTAL ELLIPSIS */
260    { 0x2030, 0xE4 }, /* PER MILLE SIGN */
261    { 0x2039, 0xDC }, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
262    { 0x203A, 0xDD }, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
263    { 0x2044, 0xDA }, /* FRACTION SLASH */
264    { 0x20AC, 0xDB }, /* EURO SIGN */
265    { 0x2122, 0xAA }, /* TRADE MARK SIGN */
266    { 0x2126, 0xBD }, /* OHM SIGN */
267    { 0x2202, 0xB6 }, /* PARTIAL DIFFERENTIAL */
268    { 0x2206, 0xC6 }, /* INCREMENT */
269    { 0x220F, 0xB8 }, /* N-ARY PRODUCT */
270    { 0x2211, 0xB7 }, /* N-ARY SUMMATION */
271    { 0x221A, 0xC3 }, /* SQUARE ROOT */
272    { 0x221E, 0xB0 }, /* INFINITY */
273    { 0x222B, 0xBA }, /* INTEGRAL */
274    { 0x2248, 0xC5 }, /* ALMOST EQUAL TO */
275    { 0x2260, 0xAD }, /* NOT EQUAL TO */
276    { 0x2264, 0xB2 }, /* LESS-THAN OR EQUAL TO */
277    { 0x2265, 0xB3 }, /* GREATER-THAN OR EQUAL TO */
278    { 0x25CA, 0xD7 }, /* LOZENGE */
279    { 0xF8FF, 0xF0 }, /* Apple logo */
280    { 0xFB01, 0xDE }, /* LATIN SMALL LIGATURE FI */
281    { 0xFB02, 0xDF }, /* LATIN SMALL LIGATURE FL */
282};
283
284static bool __CFToMacRoman(uint32_t flags, UniChar character, uint8_t *byte) {
285    if (character < 0x80) {
286        *byte = (uint8_t)character;
287        return true;
288    } else {
289        return CFStringEncodingUnicodeTo8BitEncoding(macRoman_from_uni, NUM_MACROMAN_FROM_UNI, character, byte);
290    }
291}
292
/*
 * Mac Roman -> Unicode table for the upper half of the byte range.
 * __CFFromMacRoman() indexes it with (byte - 0x80), so entry 0 is byte 0x80
 * and entry 127 is byte 0xFF.
 */
293static const UniChar macRoman_to_uni[128] = {
294    0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
295    0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
296    0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
297    0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
298    0x00D1, /* LATIN CAPITAL LETTER N WITH TILDE */
299    0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
300    0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
301    0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
302    0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
303    0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
304    0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
305    0x00E3, /* LATIN SMALL LETTER A WITH TILDE */
306    0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */
307    0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
308    0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
309    0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
310    0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
311    0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
312    0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
313    0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */
314    0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
315    0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */
316    0x00F1, /* LATIN SMALL LETTER N WITH TILDE */
317    0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
318    0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */
319    0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
320    0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
321    0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
322    0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
323    0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
324    0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
325    0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
326    0x2020, /* DAGGER */
327    0x00B0, /* DEGREE SIGN */
328    0x00A2, /* CENT SIGN */
329    0x00A3, /* POUND SIGN */
330    0x00A7, /* SECTION SIGN */
331    0x2022, /* BULLET */
332    0x00B6, /* PILCROW SIGN */
333    0x00DF, /* LATIN SMALL LETTER SHARP S */
334    0x00AE, /* REGISTERED SIGN */
335    0x00A9, /* COPYRIGHT SIGN */
336    0x2122, /* TRADE MARK SIGN */
337    0x00B4, /* ACUTE ACCENT */
338    0x00A8, /* DIAERESIS */
339    0x2260, /* NOT EQUAL TO */
340    0x00C6, /* LATIN CAPITAL LIGATURE AE */
341    0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */
342    0x221E, /* INFINITY */
343    0x00B1, /* PLUS-MINUS SIGN */
344    0x2264, /* LESS-THAN OR EQUAL TO */
345    0x2265, /* GREATER-THAN OR EQUAL TO */
346    0x00A5, /* YEN SIGN */
347    0x00B5, /* MICRO SIGN */
348    0x2202, /* PARTIAL DIFFERENTIAL */
349    0x2211, /* N-ARY SUMMATION */
350    0x220F, /* N-ARY PRODUCT */
351    0x03C0, /* GREEK SMALL LETTER PI */
352    0x222B, /* INTEGRAL */
353    0x00AA, /* FEMININE ORDINAL INDICATOR */
354    0x00BA, /* MASCULINE ORDINAL INDICATOR */
355    0x03A9, /* GREEK CAPITAL LETTER OMEGA (canonical mapping of OHM SIGN) */
356    0x00E6, /* LATIN SMALL LIGATURE AE */
357    0x00F8, /* LATIN SMALL LETTER O WITH STROKE */
358    0x00BF, /* INVERTED QUESTION MARK */
359    0x00A1, /* INVERTED EXCLAMATION MARK */
360    0x00AC, /* NOT SIGN */
361    0x221A, /* SQUARE ROOT */
362    0x0192, /* LATIN SMALL LETTER F WITH HOOK */
363    0x2248, /* ALMOST EQUAL TO */
364    0x2206, /* INCREMENT */
365    0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
366    0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
367    0x2026, /* HORIZONTAL ELLIPSIS */
368    0x00A0, /* NO-BREAK SPACE */
369    0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */
370    0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */
371    0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
372    0x0152, /* LATIN CAPITAL LIGATURE OE */
373    0x0153, /* LATIN SMALL LIGATURE OE */
374    0x2013, /* EN DASH */
375    0x2014, /* EM DASH */
376    0x201C, /* LEFT DOUBLE QUOTATION MARK */
377    0x201D, /* RIGHT DOUBLE QUOTATION MARK */
378    0x2018, /* LEFT SINGLE QUOTATION MARK */
379    0x2019, /* RIGHT SINGLE QUOTATION MARK */
380    0x00F7, /* DIVISION SIGN */
381    0x25CA, /* LOZENGE */
382    0x00FF, /* LATIN SMALL LETTER Y WITH DIAERESIS */
383    0x0178, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
384    0x2044, /* FRACTION SLASH */
385    0x20AC, /* EURO SIGN */
386    0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
387    0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
388    0xFB01, /* LATIN SMALL LIGATURE FI */
389    0xFB02, /* LATIN SMALL LIGATURE FL */
390    0x2021, /* DOUBLE DAGGER */
391    0x00B7, /* MIDDLE DOT */
392    0x201A, /* SINGLE LOW-9 QUOTATION MARK */
393    0x201E, /* DOUBLE LOW-9 QUOTATION MARK */
394    0x2030, /* PER MILLE SIGN */
395    0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
396    0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
397    0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
398    0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
399    0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */
400    0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
401    0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
402    0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
403    0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */
404    0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
405    0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
406    0xF8FF, /* Apple logo */
407    0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */
408    0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
409    0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
410    0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */
411    0x0131, /* LATIN SMALL LETTER DOTLESS I */
412    0x02C6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
413    0x02DC, /* SMALL TILDE */
414    0x00AF, /* MACRON */
415    0x02D8, /* BREVE */
416    0x02D9, /* DOT ABOVE */
417    0x02DA, /* RING ABOVE */
418    0x00B8, /* CEDILLA */
419    0x02DD, /* DOUBLE ACUTE ACCENT */
420    0x02DB, /* OGONEK */
421    0x02C7, /* CARON */
422};
423
424static bool __CFFromMacRoman(uint32_t flags, uint8_t byte, UniChar *character) {
425    *character = (byte < 0x80 ? (UniChar)byte : macRoman_to_uni[byte - 0x80]);
426    return true;
427}
428
429static CFIndex __CFToMacRomanPrecompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
430    uint8_t byte;
431    CFIndex usedCharLen;
432
433    if (__CFToMacRoman(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) {
434        if (maxByteLen) *bytes = byte;
435        *usedByteLen = 1;
436        return usedCharLen;
437    } else {
438        return 0;
439    }
440}
441
/*
 * Converter descriptor for Mac Roman.  The initializer is positional: the
 * first two slots are the Unicode->byte and byte->Unicode callbacks, followed
 * by two 1s and the kCFStringEncodingConverterCheapEightBit tag.  The last two
 * slots carry the precomposing encoder and the combining-character predicate.
 * NOTE(review): slot meanings come from CFStringEncodingConverter's declaration,
 * which is not visible in this file — confirm there before editing.
 */
442CF_PRIVATE const CFStringEncodingConverter __CFConverterMacRoman = {
443    __CFToMacRoman, __CFFromMacRoman, 1, 1, kCFStringEncodingConverterCheapEightBit,
444    NULL, NULL, NULL, NULL, __CFToMacRomanPrecompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
445};
446
447/* Win Latin1 (ANSI CodePage 1252) */
/*
 * Unicode -> code page 1252 table for the characters that do not map to their
 * own low byte.  __CFToWinLatin1() passes it to
 * CFStringEncodingUnicodeTo8BitEncoding() for everything it does not encode
 * directly.  Each entry is { Unicode value, CP1252 byte }; entries are listed in
 * ascending Unicode order and NUM_1252_FROM_UNI must match the entry count.
 * NOTE(review): the ascending order is presumably required by the lookup helper
 * (defined elsewhere) — confirm before reordering or inserting entries.
 */
448#define NUM_1252_FROM_UNI 27
449static const CFStringEncodingUnicodeTo8BitCharMap cp1252_from_uni[NUM_1252_FROM_UNI] = {
450    {0x0152, 0x8C}, // LATIN CAPITAL LIGATURE OE
451    {0x0153, 0x9C}, // LATIN SMALL LIGATURE OE
452    {0x0160, 0x8A}, // LATIN CAPITAL LETTER S WITH CARON
453    {0x0161, 0x9A}, // LATIN SMALL LETTER S WITH CARON
454    {0x0178, 0x9F}, // LATIN CAPITAL LETTER Y WITH DIAERESIS
455    {0x017D, 0x8E}, // LATIN CAPITAL LETTER Z WITH CARON
456    {0x017E, 0x9E}, // LATIN SMALL LETTER Z WITH CARON
457    {0x0192, 0x83}, // LATIN SMALL LETTER F WITH HOOK
458    {0x02C6, 0x88}, // MODIFIER LETTER CIRCUMFLEX ACCENT
459    {0x02DC, 0x98}, // SMALL TILDE
460    {0x2013, 0x96}, // EN DASH
461    {0x2014, 0x97}, // EM DASH
462    {0x2018, 0x91}, // LEFT SINGLE QUOTATION MARK
463    {0x2019, 0x92}, // RIGHT SINGLE QUOTATION MARK
464    {0x201A, 0x82}, // SINGLE LOW-9 QUOTATION MARK
465    {0x201C, 0x93}, // LEFT DOUBLE QUOTATION MARK
466    {0x201D, 0x94}, // RIGHT DOUBLE QUOTATION MARK
467    {0x201E, 0x84}, // DOUBLE LOW-9 QUOTATION MARK
468    {0x2020, 0x86}, // DAGGER
469    {0x2021, 0x87}, // DOUBLE DAGGER
470    {0x2022, 0x95}, // BULLET
471    {0x2026, 0x85}, // HORIZONTAL ELLIPSIS
472    {0x2030, 0x89}, // PER MILLE SIGN
473    {0x2039, 0x8B}, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
474    {0x203A, 0x9B}, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
475    {0x20AC, 0x80}, // EURO SIGN
476    {0x2122, 0x99}, // TRADE MARK SIGN
477};
478
479static bool __CFToWinLatin1(uint32_t flags, UniChar character, uint8_t *byte) {
480    if ((character < 0x80) || ((character > 0x9F) && (character <= 0x00FF))) {
481        *byte = (uint8_t)character;
482        return true;
483    }
484    return CFStringEncodingUnicodeTo8BitEncoding(cp1252_from_uni, NUM_1252_FROM_UNI, character, byte);
485}
486
/*
 * Code page 1252 -> Unicode table for bytes 0x80..0x9F.
 * __CFFromWinLatin1() indexes it with (byte - 0x80).  Entries holding 0xFFFD
 * mark unassigned bytes, for which __CFFromWinLatin1() reports failure.
 */
487static const uint16_t cp1252_to_uni[32] = {
488    0x20AC, //  EURO SIGN
489    0xFFFD, //  NOT USED
490    0x201A, //  SINGLE LOW-9 QUOTATION MARK
491    0x0192, //  LATIN SMALL LETTER F WITH HOOK
492    0x201E, //  DOUBLE LOW-9 QUOTATION MARK
493    0x2026, //  HORIZONTAL ELLIPSIS
494    0x2020, //  DAGGER
495    0x2021, //  DOUBLE DAGGER
496    0x02C6, //  MODIFIER LETTER CIRCUMFLEX ACCENT
497    0x2030, //  PER MILLE SIGN
498    0x0160, //  LATIN CAPITAL LETTER S WITH CARON
499    0x2039, //  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
500    0x0152, //  LATIN CAPITAL LIGATURE OE
501    0xFFFD, //  NOT USED
502    0x017D, //  LATIN CAPITAL LETTER Z WITH CARON
503    0xFFFD, //  NOT USED
504    0xFFFD, //  NOT USED
505    0x2018, //  LEFT SINGLE QUOTATION MARK
506    0x2019, //  RIGHT SINGLE QUOTATION MARK
507    0x201C, //  LEFT DOUBLE QUOTATION MARK
508    0x201D, //  RIGHT DOUBLE QUOTATION MARK
509    0x2022, //  BULLET
510    0x2013, //  EN DASH
511    0x2014, //  EM DASH
512    0x02DC, //  SMALL TILDE
513    0x2122, //  TRADE MARK SIGN
514    0x0161, //  LATIN SMALL LETTER S WITH CARON
515    0x203A, //  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
516    0x0153, //  LATIN SMALL LIGATURE OE
517    0xFFFD, //  NOT USED
518    0x017E, //  LATIN SMALL LETTER Z WITH CARON
519    0x0178, //  LATIN CAPITAL LETTER Y WITH DIAERESIS
520};
521
522static bool __CFFromWinLatin1(uint32_t flags, uint8_t byte, UniChar *character) {
523    *character = (byte < 0x80 || byte > 0x9F ? (UniChar)byte : cp1252_to_uni[byte - 0x80]);
524    return (*character != 0xFFFD);
525}
526
527static CFIndex __CFToWinLatin1Precompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
528    uint8_t byte;
529    CFIndex usedCharLen;
530
531    if (__CFToWinLatin1(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) {
532        if (maxByteLen) *bytes = byte;
533        *usedByteLen = 1;
534        return usedCharLen;
535    } else {
536        return 0;
537    }
538}
539
/*
 * Converter descriptor for Windows Latin 1 (code page 1252).  The initializer
 * is positional: the first two slots are the Unicode->byte and byte->Unicode
 * callbacks, followed by two 1s and the kCFStringEncodingConverterCheapEightBit
 * tag.  The last two slots carry the precomposing encoder and the
 * combining-character predicate.
 * NOTE(review): slot meanings come from CFStringEncodingConverter's declaration,
 * which is not visible in this file — confirm there before editing.
 */
540CF_PRIVATE const CFStringEncodingConverter __CFConverterWinLatin1 = {
541    __CFToWinLatin1, __CFFromWinLatin1, 1, 1, kCFStringEncodingConverterCheapEightBit,
542    NULL, NULL, NULL, NULL, __CFToWinLatin1Precompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
543};
544
545/* NEXTSTEP Encoding */
/*
 * Unicode -> NEXTSTEP encoding table, passed by __CFToNextStepLatin() to
 * CFStringEncodingUnicodeTo8BitEncoding() for characters >= 0x80 that are not
 * mapped paragraph separators.  Each entry is { Unicode value, NEXTSTEP byte }.
 * Entries are listed in ascending Unicode order.  NUM_NEXTSTEP_FROM_UNI must
 * match the number of live entries; the commented-out 0x00ad line is not counted.
 * NOTE(review): the ascending order is presumably required by the lookup helper
 * (defined elsewhere) — confirm before reordering or inserting entries.
 */
546#define NUM_NEXTSTEP_FROM_UNI	127
547
548static const CFStringEncodingUnicodeTo8BitCharMap nextstep_from_tab[NUM_NEXTSTEP_FROM_UNI] = {
549        { 0x00a0, 0x80 },
550        { 0x00a1, 0xa1 },
551        { 0x00a2, 0xa2 },
552        { 0x00a3, 0xa3 },
553        { 0x00a4, 0xa8 },
554        { 0x00a5, 0xa5 },
555        { 0x00a6, 0xb5 },
556        { 0x00a7, 0xa7 },
557        { 0x00a8, 0xc8 },
558        { 0x00a9, 0xa0 },
559        { 0x00aa, 0xe3 },
560        { 0x00ab, 0xab },
561        { 0x00ac, 0xbe },
562/*	{ 0x00ad, 0x2d }, <= 96/10/25 rick removed; converts soft-hyphen to hyphen! */
563        { 0x00ae, 0xb0 },
564        { 0x00af, 0xc5 },
565        { 0x00b1, 0xd1 },
566        { 0x00b2, 0xc9 },
567        { 0x00b3, 0xcc },
568        { 0x00b4, 0xc2 },
569        { 0x00b5, 0x9d },
570        { 0x00b6, 0xb6 },
571        { 0x00b7, 0xb4 },
572        { 0x00b8, 0xcb },
573        { 0x00b9, 0xc0 },
574        { 0x00ba, 0xeb },
575        { 0x00bb, 0xbb },
576        { 0x00bc, 0xd2 },
577        { 0x00bd, 0xd3 },
578        { 0x00be, 0xd4 },
579        { 0x00bf, 0xbf },
580        { 0x00c0, 0x81 },
581        { 0x00c1, 0x82 },
582        { 0x00c2, 0x83 },
583        { 0x00c3, 0x84 },
584        { 0x00c4, 0x85 },
585        { 0x00c5, 0x86 },
586        { 0x00c6, 0xe1 },
587        { 0x00c7, 0x87 },
588        { 0x00c8, 0x88 },
589        { 0x00c9, 0x89 },
590        { 0x00ca, 0x8a },
591        { 0x00cb, 0x8b },
592        { 0x00cc, 0x8c },
593        { 0x00cd, 0x8d },
594        { 0x00ce, 0x8e },
595        { 0x00cf, 0x8f },
596        { 0x00d0, 0x90 },
597        { 0x00d1, 0x91 },
598        { 0x00d2, 0x92 },
599        { 0x00d3, 0x93 },
600        { 0x00d4, 0x94 },
601        { 0x00d5, 0x95 },
602        { 0x00d6, 0x96 },
603        { 0x00d7, 0x9e },
604        { 0x00d8, 0xe9 },
605        { 0x00d9, 0x97 },
606        { 0x00da, 0x98 },
607        { 0x00db, 0x99 },
608        { 0x00dc, 0x9a },
609        { 0x00dd, 0x9b },
610        { 0x00de, 0x9c },
611        { 0x00df, 0xfb },
612        { 0x00e0, 0xd5 },
613        { 0x00e1, 0xd6 },
614        { 0x00e2, 0xd7 },
615        { 0x00e3, 0xd8 },
616        { 0x00e4, 0xd9 },
617        { 0x00e5, 0xda },
618        { 0x00e6, 0xf1 },
619        { 0x00e7, 0xdb },
620        { 0x00e8, 0xdc },
621        { 0x00e9, 0xdd },
622        { 0x00ea, 0xde },
623        { 0x00eb, 0xdf },
624        { 0x00ec, 0xe0 },
625        { 0x00ed, 0xe2 },
626        { 0x00ee, 0xe4 },
627        { 0x00ef, 0xe5 },
628        { 0x00f0, 0xe6 },
629        { 0x00f1, 0xe7 },
630        { 0x00f2, 0xec },
631        { 0x00f3, 0xed },
632        { 0x00f4, 0xee },
633        { 0x00f5, 0xef },
634        { 0x00f6, 0xf0 },
635        { 0x00f7, 0x9f },
636        { 0x00f8, 0xf9 },
637        { 0x00f9, 0xf2 },
638        { 0x00fa, 0xf3 },
639        { 0x00fb, 0xf4 },
640        { 0x00fc, 0xf6 },
641        { 0x00fd, 0xf7 },
642        { 0x00fe, 0xfc },
643        { 0x00ff, 0xfd },
644        { 0x0131, 0xf5 },
645        { 0x0141, 0xe8 },
646        { 0x0142, 0xf8 },
647        { 0x0152, 0xea },
648        { 0x0153, 0xfa },
649        { 0x0192, 0xa6 },
650        { 0x02c6, 0xc3 },
651        { 0x02c7, 0xcf },
652        { 0x02cb, 0xc1 },
653        { 0x02d8, 0xc6 },
654        { 0x02d9, 0xc7 },
655        { 0x02da, 0xca },
656        { 0x02db, 0xce },
657        { 0x02dc, 0xc4 },
658        { 0x02dd, 0xcd },
659        { 0x2013, 0xb1 },
660        { 0x2014, 0xd0 },
661        { 0x2019, 0xa9 },
662        { 0x201a, 0xb8 },
663        { 0x201c, 0xaa },
664        { 0x201d, 0xba },
665        { 0x201e, 0xb9 },
666        { 0x2020, 0xb2 },
667        { 0x2021, 0xb3 },
668        { 0x2022, 0xb7 },
669        { 0x2026, 0xbc },
670        { 0x2030, 0xbd },
671        { 0x2039, 0xac },
672        { 0x203a, 0xad },
673        { 0x2044, 0xa4 },
674        { 0xfb01, 0xae },
675        { 0xfb02, 0xaf },
676        { 0xfffd, 0xff },
677};
678
679static bool __CFToNextStepLatin(uint32_t flags, UniChar character, uint8_t *byte) {
680    if (character < 0x80) {
681        *byte = (uint8_t)character;
682        return true;
683    } else if (__CFIsParagraphSeparator(character)) {
684        *byte = ASCIINewLine;
685        return true;
686    } else {
687        return CFStringEncodingUnicodeTo8BitEncoding(nextstep_from_tab, NUM_NEXTSTEP_FROM_UNI, character, byte);
688    }
689};
690
/*
 * NEXTSTEP encoding -> Unicode table for bytes 0x80..0xFF, indexed with
 * (byte - 0x80) by __CFFromNextStepLatin().  The leading comment on each row
 * gives the decimal byte value and its glyph name.  Entries holding 0xFFFD
 * (.notdef) make __CFFromNextStepLatin() report failure.
 */
691static const UniChar NSToPrecompUnicodeTable[128] = {
692        /* NextStep Encoding	Unicode */
693        /*  128	figspace */	0x00a0,		/* 0x2007 is fig space */
694        /*  129	Agrave */	0x00c0,
695        /*  130	Aacute */	0x00c1,
696        /*  131	Acircumflex */	0x00c2,
697        /*  132	Atilde */	0x00c3,
698        /*  133	Adieresis */	0x00c4,
699        /*  134	Aring */	0x00c5,
700        /*  135	Ccedilla */	0x00c7,
701        /*  136	Egrave */	0x00c8,
702        /*  137	Eacute */	0x00c9,
703        /*  138	Ecircumflex */	0x00ca,
704        /*  139	Edieresis */	0x00cb,
705        /*  140	Igrave */	0x00cc,
706        /*  141	Iacute */	0x00cd,
707        /*  142	Icircumflex */	0x00ce,
708        /*  143	Idieresis */	0x00cf,
709        /*  144	Eth */		0x00d0,
710        /*  145	Ntilde */	0x00d1,
711        /*  146	Ograve */	0x00d2,
712        /*  147	Oacute */	0x00d3,
713        /*  148	Ocircumflex */	0x00d4,
714        /*  149	Otilde */	0x00d5,
715        /*  150	Odieresis */	0x00d6,
716        /*  151	Ugrave */	0x00d9,
717        /*  152	Uacute */	0x00da,
718        /*  153	Ucircumflex */	0x00db,
719        /*  154	Udieresis */	0x00dc,
720        /*  155	Yacute */	0x00dd,
721        /*  156	Thorn */	0x00de,
722        /*  157	mu */		0x00b5,
723        /*  158	multiply */	0x00d7,
724        /*  159	divide */	0x00f7,
725        /*  160	copyright */	0x00a9,
726        /*  161	exclamdown */	0x00a1,
727        /*  162	cent */		0x00a2,
728        /*  163	sterling */	0x00a3,
729        /*  164	fraction */	0x2044,
730        /*  165	yen */		0x00a5,
731        /*  166	florin */	0x0192,
732        /*  167	section */	0x00a7,
733        /*  168	currency */	0x00a4,
734        /*  169	quotesingle */	0x2019,
735        /*  170	quotedblleft */	0x201c,
736        /*  171	guillemotleft */ 0x00ab,
737        /*  172	guilsinglleft */ 0x2039,
738        /*  173	guilsinglright */ 0x203a,
739        /*  174	fi */		0xFB01,
740        /*  175	fl */		0xFB02,
741        /*  176	registered */	0x00ae,
742        /*  177	endash */	0x2013,
743        /*  178	dagger */	0x2020,
744        /*  179	daggerdbl */	0x2021,
745        /*  180	periodcentered */ 0x00b7,
746        /*  181	brokenbar */	0x00a6,
747        /*  182	paragraph */	0x00b6,
748        /*  183	bullet */	0x2022,
749        /*  184	quotesinglbase */ 0x201a,
750        /*  185	quotedblbase */	0x201e,
751        /*  186	quotedblright */ 0x201d,
752        /*  187	guillemotright */ 0x00bb,
753        /*  188	ellipsis */	0x2026,
754        /*  189	perthousand */	0x2030,
755        /*  190	logicalnot */	0x00ac,
756        /*  191	questiondown */	0x00bf,
757        /*  192	onesuperior */	0x00b9,
758        /*  193	grave */	0x02cb,
759        /*  194	acute */	0x00b4,
760        /*  195	circumflex */	0x02c6,
761        /*  196	tilde */	0x02dc,
762        /*  197	macron */	0x00af,
763        /*  198	breve */	0x02d8,
764        /*  199	dotaccent */	0x02d9,
765        /*  200	dieresis */	0x00a8,
766        /*  201	twosuperior */	0x00b2,
767        /*  202	ring */		0x02da,
768        /*  203	cedilla */	0x00b8,
769        /*  204	threesuperior */ 0x00b3,
770        /*  205	hungarumlaut */	0x02dd,
771        /*  206	ogonek */	0x02db,
772        /*  207	caron */	0x02c7,
773        /*  208	emdash */	0x2014,
774        /*  209	plusminus */	0x00b1,
775        /*  210	onequarter */	0x00bc,
776        /*  211	onehalf */	0x00bd,
777        /*  212	threequarters */ 0x00be,
778        /*  213	agrave */	0x00e0,
779        /*  214	aacute */	0x00e1,
780        /*  215	acircumflex */	0x00e2,
781        /*  216	atilde */	0x00e3,
782        /*  217	adieresis */	0x00e4,
783        /*  218	aring */	0x00e5,
784        /*  219	ccedilla */	0x00e7,
785        /*  220	egrave */	0x00e8,
786        /*  221	eacute */	0x00e9,
787        /*  222	ecircumflex */	0x00ea,
788        /*  223	edieresis */	0x00eb,
789        /*  224	igrave */	0x00ec,
790        /*  225	AE */		0x00c6,
791        /*  226	iacute */	0x00ed,
792        /*  227	ordfeminine */	0x00aa,
793        /*  228	icircumflex */	0x00ee,
794        /*  229	idieresis */	0x00ef,
795        /*  230	eth */		0x00f0,
796        /*  231	ntilde */	0x00f1,
797        /*  232	Lslash */	0x0141,
798        /*  233	Oslash */	0x00d8,
799        /*  234	OE */		0x0152,
800        /*  235	ordmasculine */	0x00ba,
801        /*  236	ograve */	0x00f2,
802        /*  237	oacute */	0x00f3,
803        /*  238	ocircumflex */	0x00f4,
804        /*  239	otilde */	0x00f5,
805        /*  240	odieresis */	0x00f6,
806        /*  241	ae */		0x00e6,
807        /*  242	ugrave */	0x00f9,
808        /*  243	uacute */	0x00fa,
809        /*  244	ucircumflex */	0x00fb,
810        /*  245	dotlessi */	0x0131,
811        /*  246	udieresis */	0x00fc,
812        /*  247	yacute */	0x00fd,
813        /*  248	lslash */	0x0142,
814        /*  249	oslash */	0x00f8,
815        /*  250	oe */		0x0153,
816        /*  251	germandbls */	0x00df,
817        /*  252	thorn */	0x00fe,
818        /*  253	ydieresis */	0x00ff,
819        /*  254	.notdef */	0xFFFD,
820        /*  255	.notdef */	0xFFFD
821};
822
823static bool __CFFromNextStepLatin(uint32_t flags, uint8_t byte, UniChar *character) {
824    return ((*character = (byte < 0x80 ? (UniChar)byte : NSToPrecompUnicodeTable[byte - 0x80])) != 0xFFFD);
825}
826
827static CFIndex __CFToNextStepLatinPrecompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
828    uint8_t byte;
829    CFIndex usedCharLen;
830
831    if (__CFToNextStepLatin(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) {
832        if (maxByteLen) *bytes = byte;
833        *usedByteLen = 1;
834        return usedCharLen;
835    } else {
836        return 0;
837    }
838}
839
840CF_PRIVATE const CFStringEncodingConverter __CFConverterNextStepLatin = {
841    __CFToNextStepLatin, __CFFromNextStepLatin, 1, 1, kCFStringEncodingConverterCheapEightBit,
842    NULL, NULL, NULL, NULL, __CFToNextStepLatinPrecompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
843};
844
845/* UTF8 */
846/*
847 * Copyright 2001 Unicode, Inc.
848 *
849 * Disclaimer
850 *
851 * This source code is provided as is by Unicode, Inc. No claims are
852 * made as to fitness for any particular purpose. No warranties of any
853 * kind are expressed or implied. The recipient agrees to determine
854 * applicability of information provided. If this file has been
855 * purchased on magnetic or optical media from Unicode, Inc., the
856 * sole remedy for any claim will be exchange of defective media
857 * within 90 days of receipt.
858 *
859 * Limitations on Rights to Redistribute This Code
860 *
861 * Unicode, Inc. hereby grants the right to freely use the information
862 * supplied in this file in the creation of products supporting the
863 * Unicode Standard, and to make copies of this file in any form
864 * for internal or external distribution as long as this notice
865 * remains attached.
866 */
867
/* Code point limits and the substitute used for unrepresentable values. */
static const uint32_t kReplacementCharacter = 0xFFFDu;      /* U+FFFD */
static const uint32_t kMaximumUCS2 = 0xFFFFu;               /* last value that fits one UTF-16 unit */
static const uint32_t kMaximumUTF16 = 0x10FFFFu;            /* last value reachable with a surrogate pair */
static const uint32_t kMaximumUCS4 = 0x7FFFFFFFu;           /* last value encodable in 6 UTF-8 bytes */

/* Surrogate pair arithmetic: scalar = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000. */
static const int halfShift = 10;
static const uint32_t halfBase = 0x10000u;
static const uint32_t halfMask = 0x3FFu;

/* Inclusive bounds of the high (leading) and low (trailing) surrogate ranges. */
static const uint32_t kSurrogateHighStart = 0xD800u;
static const uint32_t kSurrogateHighEnd = 0xDBFFu;
static const uint32_t kSurrogateLowStart = 0xDC00u;
static const uint32_t kSurrogateLowEnd = 0xDFFFu;
880
/*
 * Indexed by the first byte of a UTF-8 sequence; gives the number of
 * trailing bytes that are supposed to follow it:
 *   0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2,
 *   0xF0-0xF7 -> 3, 0xF8-0xFB -> 4, 0xFC-0xFF -> 5.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x10 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x20 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x30 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x40 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x50 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x60 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x70 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x80 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x90 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xA0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xB0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xC0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xD0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xE0 */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
    /* 0xF0 */ 3,3,3,3, 3,3,3,3, 4,4,4,4, 5,5,5,5
};
895
896/*
897 * Magic values subtracted from a buffer value during UTF8 conversion.
898 * This table contains as many values as there might be trailing bytes
899 * in a UTF-8 sequence.
900 */
901static const UTF32Char offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
902					 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
903
/*
 * Marker bits OR-ed into the first byte of a UTF-8 sequence, indexed by the
 * total number of bytes in the sequence (index 0 is a placeholder).
 */
static const uint8_t firstByteMark[7] = {
    0x00, /* 0: placeholder */
    0x00, /* 1 byte  */
    0xC0, /* 2 bytes */
    0xE0, /* 3 bytes */
    0xF0, /* 4 bytes */
    0xF8, /* 5 bytes */
    0xFC  /* 6 bytes */
};
905
/*
 * This code is similar in effect to making successive calls to the mbtowc and
 * wctomb routines in FSS-UTF. However, the code itself differs considerably:
 *   - it is adapted to be consistent with UTF-16;
 *   - constants have been gathered;
 *   - loops and conditionals have been removed as much as possible for
 *     efficiency, in favor of fall-through switch statements.
 */
912
913CF_INLINE uint16_t __CFUTF8BytesToWriteForCharacter(uint32_t ch) {
914    if (ch < 0x80) return  1;
915    else if (ch < 0x800) return 2;
916    else if (ch < 0x10000) return 3;
917    else if (ch < 0x200000) return 4;
918    else if (ch < 0x4000000) return 5;
919    else if (ch <= kMaximumUCS4) return 6;
920    else return 0;
921}
922
923CF_INLINE uint16_t __CFToUTF8Core(uint32_t ch, uint8_t *bytes, uint32_t maxByteLen) {
924    uint16_t bytesToWrite = __CFUTF8BytesToWriteForCharacter(ch);
925    const uint32_t byteMask = 0xBF;
926    const uint32_t byteMark = 0x80;
927
928    if (!bytesToWrite) {
929        bytesToWrite = 2;
930        ch = kReplacementCharacter;
931    }
932
933    if (maxByteLen < bytesToWrite) return 0;
934
935    switch (bytesToWrite) {	/* note: code falls through cases! */
936        case 6: bytes[5] = (ch | byteMark) & byteMask; ch >>= 6;
937        case 5: bytes[4] = (ch | byteMark) & byteMask; ch >>= 6;
938        case 4: bytes[3] = (ch | byteMark) & byteMask; ch >>= 6;
939        case 3: bytes[2] = (ch | byteMark) & byteMask; ch >>= 6;
940        case 2: bytes[1] = (ch | byteMark) & byteMask; ch >>= 6;
941        case 1: bytes[0] =  ch | firstByteMark[bytesToWrite];
942    }
943    return bytesToWrite;
944}
945
946static CFIndex __CFToUTF8(uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
947    uint16_t bytesWritten;
948    uint32_t ch;
949    const UniChar *beginCharacter = characters;
950    const UniChar *endCharacter = characters + numChars;
951    const uint8_t *beginBytes = bytes;
952    const uint8_t *endBytes = bytes + maxByteLen;
953    bool isStrict = (flags & kCFStringEncodingUseHFSPlusCanonical ? false : true);
954
955    while ((characters < endCharacter) && (!maxByteLen || (bytes < endBytes))) {
956        ch = *(characters++);
957
958        if (ch < 0x80) { // ASCII
959            if (maxByteLen) *bytes = ch;
960            ++bytes;
961        } else {
962            if (ch >= kSurrogateHighStart) {
963                if (ch <= kSurrogateHighEnd) {
964                    if ((characters < endCharacter) && ((*characters >= kSurrogateLowStart) && (*characters <= kSurrogateLowEnd))) {
965                        ch = ((ch - kSurrogateHighStart) << halfShift) + (*(characters++) - kSurrogateLowStart) + halfBase;
966                    } else if (isStrict) {
967                        --characters;
968                        break;
969                    }
970                } else if (isStrict && (ch <= kSurrogateLowEnd)) {
971                    --characters;
972                    break;
973                }
974            }
975
976            if (!(bytesWritten = (maxByteLen ? __CFToUTF8Core(ch, bytes, endBytes - bytes) : __CFUTF8BytesToWriteForCharacter(ch)))) {
977                characters -= (ch < 0x10000 ? 1 : 2);
978                break;
979            }
980            bytes += bytesWritten;
981        }
982    }
983
984    if (usedByteLen) *usedByteLen = bytes - beginBytes;
985    return characters - beginCharacter;
986}
987
988/*
989 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
990 * This must be called with the length pre-determined by the first byte.
991 * If not calling this from ConvertUTF8to*, then the length can be set by:
992 *	length = trailingBytesForUTF8[*source]+1;
993 * and the sequence is illegal right away if there aren't that many bytes
994 * available.
995 * If presented with a length > 4, this returns false.  The Unicode
996 * definition of UTF-8 goes up to 4-byte sequences.
997 */
998
999CF_INLINE bool __CFIsLegalUTF8(const uint8_t *source, CFIndex length) {
1000    if (length > 4) return false;
1001
1002    const uint8_t *srcptr = source+length;
1003    uint8_t head = *source;
1004
1005    while (--srcptr > source) if ((*srcptr & 0xC0) != 0x80) return false;
1006
1007    if (((head >= 0x80) && (head < 0xC2)) || (head > 0xF4)) return false;
1008
1009    if (((head == 0xE0) && (*(source + 1) < 0xA0)) || ((head == 0xED) && (*(source + 1) > 0x9F)) || ((head == 0xF0) && (*(source + 1) < 0x90)) || ((head == 0xF4) && (*(source + 1) > 0x8F))) return false;
1010    return true;
1011}
1012
1013static CFIndex __CFFromUTF8(uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
1014    const uint8_t *source = bytes;
1015    uint16_t extraBytesToRead;
1016    CFIndex theUsedCharLen = 0;
1017    uint32_t ch;
1018    bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
1019    bool needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false);
1020    bool strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true);
1021    UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
1022    CFIndex decompLength;
1023    bool isStrict = !isHFSPlus;
1024
1025    while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
1026        extraBytesToRead = trailingBytesForUTF8[*source];
1027
1028        if (extraBytesToRead > --numBytes) break;
1029        numBytes -= extraBytesToRead;
1030
1031        /* Do this check whether lenient or strict */
1032        // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps
1033        // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release
1034        if ((extraBytesToRead > 3) || (strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1))) {
1035            if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) {
1036                numBytes += extraBytesToRead;
1037                ++source;
1038                if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter;
1039                ++theUsedCharLen;
1040                continue;
1041            } else {
1042                break;
1043            }
1044        }
1045
1046        ch = 0;
1047        /*
1048         * The cases all fall through. See "Note A" below.
1049         */
1050        switch (extraBytesToRead) {
1051            case 3:	ch += *source++; ch <<= 6;
1052            case 2:	ch += *source++; ch <<= 6;
1053            case 1:	ch += *source++; ch <<= 6;
1054            case 0:	ch += *source++;
1055        }
1056        ch -= offsetsFromUTF8[extraBytesToRead];
1057
1058        if (ch <= kMaximumUCS2) {
1059            if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) {
1060                source -= (extraBytesToRead + 1);
1061                break;
1062            }
1063            if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
1064                decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
1065
1066                if (maxCharLen) {
1067                    if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, &theUsedCharLen, kCFUniCharUTF16Format)) break;
1068                } else {
1069                    theUsedCharLen += decompLength;
1070                }
1071            } else {
1072                if (maxCharLen) *(characters++) = (UTF16Char)ch;
1073                ++theUsedCharLen;
1074            }
1075        } else if (ch > kMaximumUTF16) {
1076            if (isStrict) {
1077                source -= (extraBytesToRead + 1);
1078                break;
1079            }
1080            if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter;
1081            ++theUsedCharLen;
1082        } else {
1083            if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
1084                decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
1085
1086                if (maxCharLen) {
1087                    if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, &theUsedCharLen, kCFUniCharUTF16Format)) break;
1088                } else {
1089                    while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2);
1090                }
1091            } else {
1092                if (maxCharLen) {
1093                    if ((theUsedCharLen + 2) > maxCharLen) break;
1094                    ch -= halfBase;
1095                    *(characters++) = (ch >> halfShift) + kSurrogateHighStart;
1096                    *(characters++) = (ch & halfMask) + kSurrogateLowStart;
1097                }
1098                theUsedCharLen += 2;
1099            }
1100        }
1101    }
1102
1103    if (usedCharLen) *usedCharLen = theUsedCharLen;
1104
1105    return source - bytes;
1106}
1107
1108static CFIndex __CFToUTF8Len(uint32_t flags, const UniChar *characters, CFIndex numChars) {
1109    uint32_t bytesToWrite = 0;
1110    uint32_t ch;
1111
1112    while (numChars) {
1113        ch = *characters++;
1114        numChars--;
1115        if ((ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd) && numChars && (*characters >= kSurrogateLowStart && *characters <= kSurrogateLowEnd)) {
1116            ch = ((ch - kSurrogateHighStart) << halfShift) + (*characters++ - kSurrogateLowStart) + halfBase;
1117            numChars--;
1118        }
1119        bytesToWrite += __CFUTF8BytesToWriteForCharacter(ch);
1120    }
1121
1122    return bytesToWrite;
1123}
1124
1125static CFIndex __CFFromUTF8Len(uint32_t flags, const uint8_t *source, CFIndex numBytes) {
1126    uint16_t extraBytesToRead;
1127    CFIndex theUsedCharLen = 0;
1128    uint32_t ch;
1129    bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
1130    bool needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false);
1131    bool strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true);
1132    UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
1133    CFIndex decompLength;
1134    bool isStrict = !isHFSPlus;
1135
1136    while (numBytes) {
1137        extraBytesToRead = trailingBytesForUTF8[*source];
1138
1139        if (extraBytesToRead > --numBytes) break;
1140        numBytes -= extraBytesToRead;
1141
1142        /* Do this check whether lenient or strict */
1143        // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps
1144        // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release
1145        if ((extraBytesToRead > 3) || (strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1))) {
1146            if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) {
1147                numBytes += extraBytesToRead;
1148                ++source;
1149                ++theUsedCharLen;
1150                continue;
1151            } else {
1152                break;
1153            }
1154        }
1155
1156
1157        ch = 0;
1158        /*
1159         * The cases all fall through. See "Note A" below.
1160         */
1161        switch (extraBytesToRead) {
1162            case 3:	ch += *source++; ch <<= 6;
1163            case 2:	ch += *source++; ch <<= 6;
1164            case 1:	ch += *source++; ch <<= 6;
1165            case 0:	ch += *source++;
1166        }
1167        ch -= offsetsFromUTF8[extraBytesToRead];
1168
1169        if (ch <= kMaximumUCS2) {
1170            if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) {
1171                break;
1172            }
1173            if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
1174                decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
1175                theUsedCharLen += decompLength;
1176            } else {
1177                ++theUsedCharLen;
1178            }
1179        } else if (ch > kMaximumUTF16) {
1180            ++theUsedCharLen;
1181        } else {
1182            if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
1183                decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
1184                while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2);
1185            } else {
1186                theUsedCharLen += 2;
1187            }
1188        }
1189    }
1190
1191    return theUsedCharLen;
1192}
1193
1194CF_PRIVATE const CFStringEncodingConverter __CFConverterUTF8 = {
1195    __CFToUTF8, __CFFromUTF8, 3, 2, kCFStringEncodingConverterStandard,
1196    __CFToUTF8Len, __CFFromUTF8Len, NULL, NULL, NULL, NULL,
1197};
1198