1/*
2 * Copyright (c) 2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24/*	CFUniChar.h
25	Copyright (c) 1998-2013, Apple Inc. All rights reserved.
26*/
27
28#if !defined(__COREFOUNDATION_CFUNICHAR__)
29#define __COREFOUNDATION_CFUNICHAR__ 1
30
31
32#include <CoreFoundation/CFByteOrder.h>
33#include <CoreFoundation/CFBase.h>
34
35CF_EXTERN_C_BEGIN
36
// Bitmap addressing helpers used by the inline bitmap accessors in this header:
// each bitmap byte holds 8 membership bits.
// Right-shift that turns a 16-bit character value into its byte index.
#define kCFUniCharBitShiftForByte (3)
// Despite the name, this is used as an AND-mask selecting the bit position (0-7) inside that byte.
#define kCFUniCharBitShiftForMask (7)
39
40CF_INLINE bool CFUniCharIsSurrogateHighCharacter(UniChar character) {
41    return ((character >= 0xD800UL) && (character <= 0xDBFFUL) ? true : false);
42}
43
44CF_INLINE bool CFUniCharIsSurrogateLowCharacter(UniChar character) {
45    return ((character >= 0xDC00UL) && (character <= 0xDFFFUL) ? true : false);
46}
47
48CF_INLINE UTF32Char CFUniCharGetLongCharacterForSurrogatePair(UniChar surrogateHigh, UniChar surrogateLow) {
49    return ((surrogateHigh - 0xD800UL) << 10) + (surrogateLow - 0xDC00UL) + 0x0010000UL;
50}
51
// Destination encoding formats.
// The following values coincide with the TextEncodingFormat defines in TextCommon.h,
// which is why the numbering is not contiguous (there is no value 1 here).
enum {
    kCFUniCharUTF16Format = 0,  // UTF-16
    kCFUniCharUTF8Format  = 2,  // UTF-8
    kCFUniCharUTF32Format = 3   // UTF-32
};
58
59CF_INLINE bool CFUniCharIsMemberOfBitmap(UTF16Char theChar, const uint8_t *bitmap) {
60    return (bitmap && (bitmap[(theChar) >> kCFUniCharBitShiftForByte] & (((uint32_t)1) << (theChar & kCFUniCharBitShiftForMask))) ? true : false);
61}
62
63CF_INLINE void CFUniCharAddCharacterToBitmap(UTF16Char theChar, uint8_t *bitmap) {
64    bitmap[(theChar) >> kCFUniCharBitShiftForByte] |= (((uint32_t)1) << (theChar & kCFUniCharBitShiftForMask));
65}
66
67CF_INLINE void CFUniCharRemoveCharacterFromBitmap(UTF16Char theChar, uint8_t *bitmap) {
68    bitmap[(theChar) >> kCFUniCharBitShiftForByte] &= ~(((uint32_t)1) << (theChar & kCFUniCharBitShiftForMask));
69}
70
// Character set identifiers.
// Values are spelled out explicitly; they are identical to the implicit
// numbering (public sets start at 1, internal sets start at 100).
// NOTE(review): presumably these are the `charset` arguments accepted by the
// CFUniChar* functions declared in this header — confirm against the implementation.
enum {
    kCFUniCharControlCharacterSet                   = 1,
    kCFUniCharWhitespaceCharacterSet                = 2,
    kCFUniCharWhitespaceAndNewlineCharacterSet      = 3,
    kCFUniCharDecimalDigitCharacterSet              = 4,
    kCFUniCharLetterCharacterSet                    = 5,
    kCFUniCharLowercaseLetterCharacterSet           = 6,
    kCFUniCharUppercaseLetterCharacterSet           = 7,
    kCFUniCharNonBaseCharacterSet                   = 8,
    kCFUniCharCanonicalDecomposableCharacterSet     = 9,
    // Alias: shares the value of the canonical-decomposable set.
    kCFUniCharDecomposableCharacterSet              = kCFUniCharCanonicalDecomposableCharacterSet,
    kCFUniCharAlphaNumericCharacterSet              = 10,
    kCFUniCharPunctuationCharacterSet               = 11,
    kCFUniCharIllegalCharacterSet                   = 12,
    kCFUniCharTitlecaseLetterCharacterSet           = 13,
    kCFUniCharSymbolAndOperatorCharacterSet         = 14,
    kCFUniCharNewlineCharacterSet                   = 15,

    // Internal character sets begin here.
    kCFUniCharCompatibilityDecomposableCharacterSet = 100,
    kCFUniCharHFSPlusDecomposableCharacterSet       = 101,
    kCFUniCharStrongRightToLeftCharacterSet         = 102,
    kCFUniCharHasNonSelfLowercaseCharacterSet       = 103,
    kCFUniCharHasNonSelfUppercaseCharacterSet       = 104,
    kCFUniCharHasNonSelfTitlecaseCharacterSet       = 105,
    kCFUniCharHasNonSelfCaseFoldingCharacterSet     = 106,
    kCFUniCharHasNonSelfMirrorMappingCharacterSet   = 107,
    kCFUniCharControlAndFormatterCharacterSet       = 108,
    kCFUniCharCaseIgnorableCharacterSet             = 109,
    kCFUniCharGraphemeExtendCharacterSet            = 110
};
101
102CF_EXPORT bool CFUniCharIsMemberOf(UTF32Char theChar, uint32_t charset);
103
104// This function returns NULL for kCFUniCharControlCharacterSet, kCFUniCharWhitespaceCharacterSet, kCFUniCharWhitespaceAndNewlineCharacterSet, & kCFUniCharIllegalCharacterSet
105CF_EXPORT const uint8_t *CFUniCharGetBitmapPtrForPlane(uint32_t charset, uint32_t plane);
106
// Bitmap status codes, each cast to uint8_t.
// NOTE(review): presumably the return values of CFUniCharGetBitmapForPlane
// (which returns uint8_t) — confirm. The precise meaning of each code is not
// visible in this header; check the implementation before relying on the names.
enum {
    kCFUniCharBitmapFilled = (uint8_t)0x00,
    kCFUniCharBitmapEmpty  = (uint8_t)0xFF,
    kCFUniCharBitmapAll    = (uint8_t)0x01
};
112
113CF_EXPORT uint8_t CFUniCharGetBitmapForPlane(uint32_t charset, uint32_t plane, void *bitmap, bool isInverted);
114
115CF_EXPORT uint32_t CFUniCharGetNumberOfPlanes(uint32_t charset);
116
// Case-mapping kinds, numbered explicitly (same values as the implicit numbering).
// NOTE(review): presumably passed as the `ctype` / `type` argument of the
// case-mapping functions declared in this header — confirm.
enum {
    kCFUniCharToLowercase = 0,
    kCFUniCharToUppercase = 1,
    kCFUniCharToTitlecase = 2,
    kCFUniCharCaseFold    = 3
};
123
// Case-mapping condition flags. Each occupies a distinct bit (bits 0-4), so
// they can be OR-ed together.
// NOTE(review): presumably these are the `flags` / `lastFlags` values used by the
// case-mapping functions declared in this header — confirm.
enum {
    kCFUniCharCaseMapFinalSigma   = 0x01UL,  // bit 0
    kCFUniCharCaseMapAfter_i      = 0x02UL,  // bit 1
    kCFUniCharCaseMapMoreAbove    = 0x04UL,  // bit 2
    kCFUniCharCaseMapDutchDigraph = 0x08UL,  // bit 3
    kCFUniCharCaseMapGreekTonos   = 0x10UL   // bit 4
};
131
132CF_EXPORT CFIndex CFUniCharMapCaseTo(UTF32Char theChar, UTF16Char *convertedChar, CFIndex maxLength, uint32_t ctype, uint32_t flags, const uint8_t *langCode);
133
134CF_EXPORT uint32_t CFUniCharGetConditionalCaseMappingFlags(UTF32Char theChar, UTF16Char *buffer, CFIndex currentIndex, CFIndex length, uint32_t type, const uint8_t *langCode, uint32_t lastFlags);
135
// Bidirectional property values, numbered explicitly (same values as the
// implicit numbering). kCFUniCharBiDiPropertyPDF is the largest value;
// CFUniCharGetBidiPropertyForCharacter relies on that to tell a direct property
// value apart from a sub-table selector.
// NOTE(review): the suffixes appear to follow the Unicode Bidi_Class
// abbreviations (ON, L, R, AN, ...) — confirm.
enum {
    kCFUniCharBiDiPropertyON  = 0,
    kCFUniCharBiDiPropertyL   = 1,
    kCFUniCharBiDiPropertyR   = 2,
    kCFUniCharBiDiPropertyAN  = 3,
    kCFUniCharBiDiPropertyEN  = 4,
    kCFUniCharBiDiPropertyAL  = 5,
    kCFUniCharBiDiPropertyNSM = 6,
    kCFUniCharBiDiPropertyCS  = 7,
    kCFUniCharBiDiPropertyES  = 8,
    kCFUniCharBiDiPropertyET  = 9,
    kCFUniCharBiDiPropertyBN  = 10,
    kCFUniCharBiDiPropertyS   = 11,
    kCFUniCharBiDiPropertyWS  = 12,
    kCFUniCharBiDiPropertyB   = 13,
    kCFUniCharBiDiPropertyRLO = 14,
    kCFUniCharBiDiPropertyRLE = 15,
    kCFUniCharBiDiPropertyLRO = 16,
    kCFUniCharBiDiPropertyLRE = 17,
    kCFUniCharBiDiPropertyPDF = 18
};
157
// Unicode property table selectors, numbered explicitly (same values as the
// implicit numbering).
// NOTE(review): presumably the `propertyType` argument of the
// CFUniCharGetUnicodeProperty* functions — confirm.
enum {
    kCFUniCharCombiningProperty = 0,
    kCFUniCharBidiProperty      = 1
};
162
163// The second arg 'bitmap' has to be the pointer to a specific plane
164CF_INLINE uint8_t CFUniCharGetBidiPropertyForCharacter(UTF16Char character, const uint8_t *bitmap) {
165    if (bitmap) {
166        uint8_t value = bitmap[(character >> 8)];
167
168        if (value > kCFUniCharBiDiPropertyPDF) {
169            bitmap = bitmap + 256 + ((value - kCFUniCharBiDiPropertyPDF - 1) * 256);
170            return bitmap[character % 256];
171        } else {
172            return value;
173        }
174    }
175    return kCFUniCharBiDiPropertyL;
176}
177
178CF_INLINE uint8_t CFUniCharGetCombiningPropertyForCharacter(UTF16Char character, const uint8_t *bitmap) {
179    if (bitmap) {
180        uint8_t value = bitmap[(character >> 8)];
181
182        if (value) {
183            bitmap = bitmap + 256 + ((value - 1) * 256);
184            return bitmap[character % 256];
185        }
186    }
187    return 0;
188}
189
190CF_EXPORT const void *CFUniCharGetUnicodePropertyDataForPlane(uint32_t propertyType, uint32_t plane);
191CF_EXPORT uint32_t CFUniCharGetNumberOfPlanesForUnicodePropertyData(uint32_t propertyType);
192CF_EXPORT uint32_t CFUniCharGetUnicodeProperty(UTF32Char character, uint32_t propertyType);
193
194CF_EXPORT bool CFUniCharFillDestinationBuffer(const UTF32Char *src, CFIndex srcLength, void **dst, CFIndex dstLength, CFIndex *filledLength, uint32_t dstFormat);
195
196// UTF32 support
197
198CF_INLINE bool CFUniCharToUTF32(const UTF16Char *src, CFIndex length, UTF32Char *dst, bool allowLossy, bool isBigEndien) {
199    const UTF16Char *limit = src + length;
200    UTF32Char character;
201
202    while (src < limit) {
203        character = *(src++);
204
205        if (CFUniCharIsSurrogateHighCharacter(character)) {
206            if ((src < limit) && CFUniCharIsSurrogateLowCharacter(*src)) {
207                character = CFUniCharGetLongCharacterForSurrogatePair(character, *(src++));
208            } else {
209                if (!allowLossy) return false;
210                character = 0xFFFD; // replacement character
211            }
212        } else if (CFUniCharIsSurrogateLowCharacter(character)) {
213            if (!allowLossy) return false;
214            character = 0xFFFD; // replacement character
215        }
216
217        *(dst++) = (isBigEndien ? CFSwapInt32HostToBig(character) : CFSwapInt32HostToLittle(character));
218    }
219
220    return true;
221}
222
223CF_INLINE bool CFUniCharFromUTF32(const UTF32Char *src, CFIndex length, UTF16Char *dst, bool allowLossy, bool isBigEndien) {
224    const UTF32Char *limit = src + length;
225    UTF32Char character;
226
227    while (src < limit) {
228        character = (isBigEndien ? CFSwapInt32BigToHost(*(src++)) : CFSwapInt32LittleToHost(*(src++)));
229
230        if (character < 0x10000) { // BMP
231            if (allowLossy) {
232                if (CFUniCharIsSurrogateHighCharacter(character)) {
233                    UTF32Char otherCharacter = 0xFFFD; // replacement character
234
235                    if (src < limit) {
236                        otherCharacter = (isBigEndien ? CFSwapInt32BigToHost(*src) : CFSwapInt32LittleToHost(*src));
237
238
239                        if ((otherCharacter < 0x10000) && CFUniCharIsSurrogateLowCharacter(otherCharacter)) {
240                            *(dst++) = character; ++src;
241                        } else {
242                            otherCharacter = 0xFFFD; // replacement character
243                        }
244                    }
245
246                    character = otherCharacter;
247                } else if (CFUniCharIsSurrogateLowCharacter(character)) {
248                    character = 0xFFFD; // replacement character
249                }
250            } else {
251                if (CFUniCharIsSurrogateHighCharacter(character) || CFUniCharIsSurrogateLowCharacter(character)) return false;
252            }
253        } else if (character < 0x110000) { // non-BMP
254            character -= 0x10000;
255            *(dst++) = (UTF16Char)((character >> 10) + 0xD800UL);
256            character = (UTF16Char)((character & 0x3FF) + 0xDC00UL);
257        } else {
258            if (!allowLossy) return false;
259            character = 0xFFFD; // replacement character
260        }
261
262        *(dst++) = character;
263    }
264    return true;
265}
266
267CF_EXTERN_C_END
268
269#endif /* ! __COREFOUNDATION_CFUNICHAR__ */
270
271