1/* Unicode character classification and properties.
2   Copyright (C) 2002, 2005-2010 Free Software Foundation, Inc.
3
4   This program is free software: you can redistribute it and/or modify it
5   under the terms of the GNU Lesser General Public License as published
6   by the Free Software Foundation; either version 3 of the License, or
7   (at your option) any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   Lesser General Public License for more details.
13
14   You should have received a copy of the GNU Lesser General Public License
15   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
16
17#ifndef _UNICTYPE_H
18#define _UNICTYPE_H
19
20#include "unitypes.h"
21
22/* Get LIBUNISTRING_DLL_VARIABLE.  */
23#include <unistring/woe32dll.h>
24
25/* Get bool.  */
26#include <unistring/stdbool.h>
27
28/* Get size_t.  */
29#include <stddef.h>
30
31#ifdef __cplusplus
32extern "C" {
33#endif
34
35/* ========================================================================= */
36
37/* Field 1 of Unicode Character Database: Character name.
38   See "uniname.h".  */
39
40/* ========================================================================= */
41
42/* Field 2 of Unicode Character Database: General category.  */
43
/* Data type denoting a General category value.  This is not just a bitmask,
   but rather a bitmask and a pointer to the lookup table, so that programs
   that use only the predefined bitmasks (i.e. don't combine bitmasks with &
   and |) don't have a link-time dependency towards the big general table.

   Members:
     bitmask - 31-bit field; presumably a combination of UC_CATEGORY_MASK_*
               bits (confirm against the implementation).
     generic - 1-bit discriminator for the 'lookup' union; declared as an
               unsigned int bit-field rather than as 'bool'.
     lookup  - either an opaque table pointer or a lookup function that
               takes the character and a bitmask.  */
typedef struct
{
  uint32_t bitmask : 31;
  /*bool*/ unsigned int generic : 1;
  union
  {
    const void *table;                               /* when generic is 0 */
    bool (*lookup_fn) (ucs4_t uc, uint32_t bitmask); /* when generic is 1 */
  } lookup;
}
uc_general_category_t;
59
/* Bits and bit masks denoting General category values.  UnicodeData-3.2.0.html
   says a 32-bit integer will always suffice to represent them.
   These bit masks can only be used with the uc_is_general_category_withtable
   function.

   Each two-letter constant is a single distinct bit.  Each one-letter
   constant equals the bitwise OR of the two-letter constants that share its
   first letter (e.g. UC_CATEGORY_MASK_N == Nd | Nl | No == 0x00000700).
   The highest bit in use is 0x20000000, so every value fits in 30 bits.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f, /* Lu | Ll | Lt | Lm | Lo */
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0, /* Mn | Mc | Me */
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700, /* Nd | Nl | No */
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800, /* Pc | Pd | Ps | Pe | Pi | Pf | Po */
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000, /* Sm | Sc | Sk | So */
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000, /* Zs | Zl | Zp */
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000, /* Cc | Cf | Cs | Co | Cn */
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};
104
/* Predefined General category values.
   There is one const object per UC_CATEGORY_MASK_* constant, with the same
   suffix: one-letter names for the major classes, two-letter names for the
   individual categories.  The objects are only declared here; they are
   defined elsewhere.  */
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_L;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Lu;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Ll;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Lt;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Lm;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Lo;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_M;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Mn;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Mc;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Me;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_N;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Nd;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Nl;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_No;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_P;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pc;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pd;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Ps;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pe;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pi;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pf;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Po;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_S;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Sm;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Sc;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Sk;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_So;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Z;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Zs;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Zl;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Zp;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_C;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Cc;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Cf;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Cs;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Co;
extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Cn;
/* Non-public.  Unlike the objects above, this one is declared without
   LIBUNISTRING_DLL_VARIABLE.
   NOTE(review): presumably the "no category" value used inside the
   library - confirm before relying on it from client code.  */
extern const uc_general_category_t _UC_CATEGORY_NONE;
145
/* Alias names for predefined General category values.
   Each macro expands textually to the corresponding UC_CATEGORY_* object
   declared in this header, so the aliases can be used wherever those
   objects can.  */
#define UC_LETTER                    UC_CATEGORY_L
#define UC_UPPERCASE_LETTER          UC_CATEGORY_Lu
#define UC_LOWERCASE_LETTER          UC_CATEGORY_Ll
#define UC_TITLECASE_LETTER          UC_CATEGORY_Lt
#define UC_MODIFIER_LETTER           UC_CATEGORY_Lm
#define UC_OTHER_LETTER              UC_CATEGORY_Lo
#define UC_MARK                      UC_CATEGORY_M
#define UC_NON_SPACING_MARK          UC_CATEGORY_Mn
#define UC_COMBINING_SPACING_MARK    UC_CATEGORY_Mc
#define UC_ENCLOSING_MARK            UC_CATEGORY_Me
#define UC_NUMBER                    UC_CATEGORY_N
#define UC_DECIMAL_DIGIT_NUMBER      UC_CATEGORY_Nd
#define UC_LETTER_NUMBER             UC_CATEGORY_Nl
#define UC_OTHER_NUMBER              UC_CATEGORY_No
#define UC_PUNCTUATION               UC_CATEGORY_P
#define UC_CONNECTOR_PUNCTUATION     UC_CATEGORY_Pc
#define UC_DASH_PUNCTUATION          UC_CATEGORY_Pd
#define UC_OPEN_PUNCTUATION          UC_CATEGORY_Ps /* a.k.a. UC_START_PUNCTUATION */
#define UC_CLOSE_PUNCTUATION         UC_CATEGORY_Pe /* a.k.a. UC_END_PUNCTUATION */
#define UC_INITIAL_QUOTE_PUNCTUATION UC_CATEGORY_Pi
#define UC_FINAL_QUOTE_PUNCTUATION   UC_CATEGORY_Pf
#define UC_OTHER_PUNCTUATION         UC_CATEGORY_Po
#define UC_SYMBOL                    UC_CATEGORY_S
#define UC_MATH_SYMBOL               UC_CATEGORY_Sm
#define UC_CURRENCY_SYMBOL           UC_CATEGORY_Sc
#define UC_MODIFIER_SYMBOL           UC_CATEGORY_Sk
#define UC_OTHER_SYMBOL              UC_CATEGORY_So
#define UC_SEPARATOR                 UC_CATEGORY_Z
#define UC_SPACE_SEPARATOR           UC_CATEGORY_Zs
#define UC_LINE_SEPARATOR            UC_CATEGORY_Zl
#define UC_PARAGRAPH_SEPARATOR       UC_CATEGORY_Zp
#define UC_OTHER                     UC_CATEGORY_C
#define UC_CONTROL                   UC_CATEGORY_Cc
#define UC_FORMAT                    UC_CATEGORY_Cf
#define UC_SURROGATE                 UC_CATEGORY_Cs /* all of them are invalid characters */
#define UC_PRIVATE_USE               UC_CATEGORY_Co
#define UC_UNASSIGNED                UC_CATEGORY_Cn /* some of them are invalid characters */
184
/* Return the union of two general categories.
   This corresponds to the union of the two sets of characters.
   Both arguments and the result are passed by value.  */
extern uc_general_category_t
       uc_general_category_or (uc_general_category_t category1,
                               uc_general_category_t category2);
190
/* Return the intersection of two general categories as bit masks.
   This *does*not* correspond to the intersection of the two sets of
   characters.
   Both arguments and the result are passed by value.
   NOTE(review): the result when the two masks have no bit in common is
   not stated here - possibly _UC_CATEGORY_NONE; confirm against the
   implementation.  */
extern uc_general_category_t
       uc_general_category_and (uc_general_category_t category1,
                                uc_general_category_t category2);
197
/* Return the intersection of a general category with the complement of a
   second general category, as bit masks.
   This *does*not* correspond to the intersection with complement, when
   viewing the categories as sets of characters.
   CATEGORY1 is the category to start from; CATEGORY2 is the one whose
   complement is taken.  Both are passed by value, as is the result.  */
extern uc_general_category_t
       uc_general_category_and_not (uc_general_category_t category1,
                                    uc_general_category_t category2);
205
/* Return the name of a general category.
   The result is a pointer to const char.
   NOTE(review): the result for a combined or otherwise unnamed category
   (possibly NULL) and the lifetime of the returned string are not visible
   in this header - confirm before dereferencing unconditionally.  */
extern const char *
       uc_general_category_name (uc_general_category_t category);
209
/* Return the general category given by name, e.g. "Lu".
   CATEGORY_NAME is a NUL-terminated C string that is not modified.
   NOTE(review): the result for an unrecognized name is not stated here -
   presumably _UC_CATEGORY_NONE; confirm against the implementation.  */
extern uc_general_category_t
       uc_general_category_byname (const char *category_name);
213
/* Return the general category of a Unicode character.
   UC is the character to look up; the category is returned by value as a
   uc_general_category_t.  */
extern uc_general_category_t
       uc_general_category (ucs4_t uc);
217
/* Test whether a Unicode character belongs to a given category.
   The CATEGORY argument can be the combination of several predefined
   general categories.  */
extern bool
       uc_is_general_category (ucs4_t uc, uc_general_category_t category);
/* Likewise, but the categories are given as a combination of
   UC_CATEGORY_MASK_* bits in BITMASK.  This function uses a big table
   comprising all categories.  */
extern bool
       uc_is_general_category_withtable (ucs4_t uc, uint32_t bitmask);
226
227/* ========================================================================= */
228
229/* Field 3 of Unicode Character Database: Canonical combining class.  */
230
/* The possible results of uc_combining_class (0..255) are described in
   UCD.html.  The list here is not definitive; more values can be added
   in future versions.
   Every enumerator has an explicit value; the values are not contiguous,
   so class numbers without a name here can still occur.  */
enum
{
  UC_CCC_NR   =   0, /* Not Reordered */
  UC_CCC_OV   =   1, /* Overlay */
  UC_CCC_NK   =   7, /* Nukta */
  UC_CCC_KV   =   8, /* Kana Voicing */
  UC_CCC_VR   =   9, /* Virama */
  UC_CCC_ATBL = 200, /* Attached Below Left */
  UC_CCC_ATB  = 202, /* Attached Below */
  UC_CCC_ATAR = 216, /* Attached Above Right */
  UC_CCC_BL   = 218, /* Below Left */
  UC_CCC_B    = 220, /* Below */
  UC_CCC_BR   = 222, /* Below Right */
  UC_CCC_L    = 224, /* Left */
  UC_CCC_R    = 226, /* Right */
  UC_CCC_AL   = 228, /* Above Left */
  UC_CCC_A    = 230, /* Above */
  UC_CCC_AR   = 232, /* Above Right */
  UC_CCC_DB   = 233, /* Double Below */
  UC_CCC_DA   = 234, /* Double Above */
  UC_CCC_IS   = 240  /* Iota Subscript */
};
256
/* Return the canonical combining class of a Unicode character.
   The result is a plain int; the UC_CCC_* enumeration in this header names
   some, but not all, of the possible values.  */
extern int
       uc_combining_class (ucs4_t uc);
260
261/* ========================================================================= */
262
263/* Field 4 of Unicode Character Database: Bidirectional category.  */
264
/* Bidirectional category values.
   No enumerator has an explicit initializer, so they number consecutively
   from UC_BIDI_L == 0 to UC_BIDI_ON == 18.  */
enum
{
  UC_BIDI_L,   /* Left-to-Right */
  UC_BIDI_LRE, /* Left-to-Right Embedding */
  UC_BIDI_LRO, /* Left-to-Right Override */
  UC_BIDI_R,   /* Right-to-Left */
  UC_BIDI_AL,  /* Right-to-Left Arabic */
  UC_BIDI_RLE, /* Right-to-Left Embedding */
  UC_BIDI_RLO, /* Right-to-Left Override */
  UC_BIDI_PDF, /* Pop Directional Format */
  UC_BIDI_EN,  /* European Number */
  UC_BIDI_ES,  /* European Number Separator */
  UC_BIDI_ET,  /* European Number Terminator */
  UC_BIDI_AN,  /* Arabic Number */
  UC_BIDI_CS,  /* Common Number Separator */
  UC_BIDI_NSM, /* Non-Spacing Mark */
  UC_BIDI_BN,  /* Boundary Neutral */
  UC_BIDI_B,   /* Paragraph Separator */
  UC_BIDI_S,   /* Segment Separator */
  UC_BIDI_WS,  /* Whitespace */
  UC_BIDI_ON   /* Other Neutral */
};
287
/* Return the name of a bidirectional category.
   CATEGORY is passed as a plain int; presumably one of the UC_BIDI_*
   values.
   NOTE(review): the result for an out-of-range CATEGORY (possibly NULL) is
   not stated here - confirm against the implementation.  */
extern const char *
       uc_bidi_category_name (int category);
291
/* Return the bidirectional category given by name, e.g. "LRE".
   CATEGORY_NAME is a NUL-terminated C string that is not modified.
   NOTE(review): the int returned for an unrecognized name (presumably a
   negative value) is not stated here - confirm against the
   implementation.  */
extern int
       uc_bidi_category_byname (const char *category_name);
295
/* Return the bidirectional category of a Unicode character.
   The result is a plain int; presumably one of the UC_BIDI_* values.  */
extern int
       uc_bidi_category (ucs4_t uc);
299
/* Test whether a Unicode character belongs to a given bidirectional
   category.
   CATEGORY is passed as a plain int; presumably one of the UC_BIDI_*
   values.  */
extern bool
       uc_is_bidi_category (ucs4_t uc, int category);
304
305/* ========================================================================= */
306
307/* Field 5 of Unicode Character Database: Character decomposition mapping.
308   See "uninorm.h".  */
309
310/* ========================================================================= */
311
312/* Field 6 of Unicode Character Database: Decimal digit value.  */
313
/* Return the decimal digit value of a Unicode character.
   NOTE(review): the int returned for a character that has no decimal
   digit value (presumably a negative value) is not stated here - confirm
   against the implementation.  */
extern int
       uc_decimal_value (ucs4_t uc);
317
318/* ========================================================================= */
319
320/* Field 7 of Unicode Character Database: Digit value.  */
321
/* Return the digit value of a Unicode character.
   NOTE(review): the int returned for a character that has no digit value
   (presumably a negative value) is not stated here - confirm against the
   implementation.  */
extern int
       uc_digit_value (ucs4_t uc);
325
326/* ========================================================================= */
327
328/* Field 8 of Unicode Character Database: Numeric value.  */
329
/* A numeric value expressed as a fraction: a pair of int members named
   numerator and denominator.  */
typedef struct
{
  int numerator;
  int denominator;
}
uc_fraction_t;
/* Return the numeric value of a Unicode character.
   The value is returned by value as a uc_fraction_t.
   NOTE(review): how "no numeric value" is encoded (possibly a zero
   denominator) is not stated here - check the denominator before dividing
   and confirm against the implementation.  */
extern uc_fraction_t
       uc_numeric_value (ucs4_t uc);
339
340/* ========================================================================= */
341
342/* Field 9 of Unicode Character Database: Mirrored.  */
343
/* Return the mirrored character of a Unicode character UC in *PUC.
   PUC is a non-const pointer through which the result is stored.
   NOTE(review): the meaning of the bool result (presumably whether UC has
   a mirrored counterpart) and the content of *PUC when it is false are not
   stated here - confirm against the implementation.  */
extern bool
       uc_mirror_char (ucs4_t uc, ucs4_t *puc);
347
348/* ========================================================================= */
349
350/* Field 10 of Unicode Character Database: Unicode 1.0 Name.
351   Not available in this library.  */
352
353/* ========================================================================= */
354
355/* Field 11 of Unicode Character Database: ISO 10646 comment.
356   Not available in this library.  */
357
358/* ========================================================================= */
359
/* Fields 12, 13, 14 of Unicode Character Database: Uppercase mapping,
   lowercase mapping, titlecase mapping.  See "unicase.h".  */
362
363/* ========================================================================= */
364
365/* Common API for properties.  */
366
/* Data type denoting a property.  This is not just a number, but rather a
   pointer to the test functions, so that programs that use only few of the
   properties don't have a link-time dependency towards all the tables.
   The only member, test_fn, is a predicate taking a character and
   returning bool.  */
typedef struct
{
  bool (*test_fn) (ucs4_t uc);
}
uc_property_t;
375
/* Predefined properties.
   Each is a const uc_property_t object that is only declared here.  For
   every UC_PROPERTY_<NAME> there is a uc_is_property_<name> function
   declared later in this header, in the same order.  The group headings
   below are purely organizational.  */
/* General.  */
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_WHITE_SPACE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ALPHABETIC;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_ALPHABETIC;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_NOT_A_CHARACTER;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DEFAULT_IGNORABLE_CODE_POINT;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DEPRECATED;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_LOGICAL_ORDER_EXCEPTION;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_VARIATION_SELECTOR;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PRIVATE_USE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_UNASSIGNED_CODE_VALUE;
/* Case.  */
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_UPPERCASE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_UPPERCASE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_LOWERCASE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_LOWERCASE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_TITLECASE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_SOFT_DOTTED;
/* Identifiers.  */
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ID_START;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_ID_START;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ID_CONTINUE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_ID_CONTINUE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_XID_START;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_XID_CONTINUE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PATTERN_WHITE_SPACE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PATTERN_SYNTAX;
/* Shaping and rendering.  */
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_JOIN_CONTROL;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_GRAPHEME_BASE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_GRAPHEME_EXTEND;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_GRAPHEME_EXTEND;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_GRAPHEME_LINK;
/* Bidi.  */
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_CONTROL;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_LEFT_TO_RIGHT;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_HEBREW_RIGHT_TO_LEFT;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_ARABIC_RIGHT_TO_LEFT;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_EUROPEAN_DIGIT;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_EUR_NUM_SEPARATOR;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_EUR_NUM_TERMINATOR;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_ARABIC_DIGIT;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_COMMON_SEPARATOR;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_BLOCK_SEPARATOR;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_SEGMENT_SEPARATOR;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_WHITESPACE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_NON_SPACING_MARK;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_BOUNDARY_NEUTRAL;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_PDF;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_EMBEDDING_OR_OVERRIDE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_OTHER_NEUTRAL;
/* Numeric.  */
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_HEX_DIGIT;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ASCII_HEX_DIGIT;
/* CJK.  */
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_IDEOGRAPHIC;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_UNIFIED_IDEOGRAPH;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_RADICAL;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_IDS_BINARY_OPERATOR;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_IDS_TRINARY_OPERATOR;
/* Misc.  */
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ZERO_WIDTH;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_SPACE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_NON_BREAK;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ISO_CONTROL;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_FORMAT_CONTROL;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DASH;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_HYPHEN;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PUNCTUATION;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_LINE_SEPARATOR;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PARAGRAPH_SEPARATOR;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_QUOTATION_MARK;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_SENTENCE_TERMINAL;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_TERMINAL_PUNCTUATION;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_CURRENCY_SYMBOL;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_MATH;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_MATH;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PAIRED_PUNCTUATION;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_LEFT_OF_PAIR;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_COMBINING;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_COMPOSITE;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DECIMAL_DIGIT;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_NUMERIC;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DIACRITIC;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_EXTENDER;
extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_IGNORABLE_CONTROL;
464
/* Return the property given by name, e.g. "White space".
   PROPERTY_NAME is a NUL-terminated C string that is not modified.
   NOTE(review): presumably an unrecognized name yields a property whose
   test_fn is NULL, which uc_property_is_valid detects - confirm against
   the implementation.  */
extern uc_property_t
       uc_property_byname (const char *property_name);
468
/* Test whether a property is valid.
   PROPERTY is a uc_property_t value (not a pointer); it is valid when its
   test_fn member is non-NULL.  The argument is evaluated exactly once.  */
#define uc_property_is_valid(property) ((property).test_fn != NULL)
471
/* Test whether a Unicode character has a given property.
   PROPERTY is passed by value.
   NOTE(review): whether a property with a NULL test_fn is tolerated is not
   visible here - check it with uc_property_is_valid first.  */
extern bool
       uc_is_property (ucs4_t uc, uc_property_t property);
/* Per-property test functions.  There is one for each UC_PROPERTY_* object
   declared in this header, in the same order, and each has the signature
   of the test_fn member of uc_property_t.
   NOTE(review): presumably these are the functions the UC_PROPERTY_*
   objects point to - confirm against the implementation.  */
extern bool uc_is_property_white_space (ucs4_t uc);
extern bool uc_is_property_alphabetic (ucs4_t uc);
extern bool uc_is_property_other_alphabetic (ucs4_t uc);
extern bool uc_is_property_not_a_character (ucs4_t uc);
extern bool uc_is_property_default_ignorable_code_point (ucs4_t uc);
extern bool uc_is_property_other_default_ignorable_code_point (ucs4_t uc);
extern bool uc_is_property_deprecated (ucs4_t uc);
extern bool uc_is_property_logical_order_exception (ucs4_t uc);
extern bool uc_is_property_variation_selector (ucs4_t uc);
extern bool uc_is_property_private_use (ucs4_t uc);
extern bool uc_is_property_unassigned_code_value (ucs4_t uc);
extern bool uc_is_property_uppercase (ucs4_t uc);
extern bool uc_is_property_other_uppercase (ucs4_t uc);
extern bool uc_is_property_lowercase (ucs4_t uc);
extern bool uc_is_property_other_lowercase (ucs4_t uc);
extern bool uc_is_property_titlecase (ucs4_t uc);
extern bool uc_is_property_soft_dotted (ucs4_t uc);
extern bool uc_is_property_id_start (ucs4_t uc);
extern bool uc_is_property_other_id_start (ucs4_t uc);
extern bool uc_is_property_id_continue (ucs4_t uc);
extern bool uc_is_property_other_id_continue (ucs4_t uc);
extern bool uc_is_property_xid_start (ucs4_t uc);
extern bool uc_is_property_xid_continue (ucs4_t uc);
extern bool uc_is_property_pattern_white_space (ucs4_t uc);
extern bool uc_is_property_pattern_syntax (ucs4_t uc);
extern bool uc_is_property_join_control (ucs4_t uc);
extern bool uc_is_property_grapheme_base (ucs4_t uc);
extern bool uc_is_property_grapheme_extend (ucs4_t uc);
extern bool uc_is_property_other_grapheme_extend (ucs4_t uc);
extern bool uc_is_property_grapheme_link (ucs4_t uc);
extern bool uc_is_property_bidi_control (ucs4_t uc);
extern bool uc_is_property_bidi_left_to_right (ucs4_t uc);
extern bool uc_is_property_bidi_hebrew_right_to_left (ucs4_t uc);
extern bool uc_is_property_bidi_arabic_right_to_left (ucs4_t uc);
extern bool uc_is_property_bidi_european_digit (ucs4_t uc);
extern bool uc_is_property_bidi_eur_num_separator (ucs4_t uc);
extern bool uc_is_property_bidi_eur_num_terminator (ucs4_t uc);
extern bool uc_is_property_bidi_arabic_digit (ucs4_t uc);
extern bool uc_is_property_bidi_common_separator (ucs4_t uc);
extern bool uc_is_property_bidi_block_separator (ucs4_t uc);
extern bool uc_is_property_bidi_segment_separator (ucs4_t uc);
extern bool uc_is_property_bidi_whitespace (ucs4_t uc);
extern bool uc_is_property_bidi_non_spacing_mark (ucs4_t uc);
extern bool uc_is_property_bidi_boundary_neutral (ucs4_t uc);
extern bool uc_is_property_bidi_pdf (ucs4_t uc);
extern bool uc_is_property_bidi_embedding_or_override (ucs4_t uc);
extern bool uc_is_property_bidi_other_neutral (ucs4_t uc);
extern bool uc_is_property_hex_digit (ucs4_t uc);
extern bool uc_is_property_ascii_hex_digit (ucs4_t uc);
extern bool uc_is_property_ideographic (ucs4_t uc);
extern bool uc_is_property_unified_ideograph (ucs4_t uc);
extern bool uc_is_property_radical (ucs4_t uc);
extern bool uc_is_property_ids_binary_operator (ucs4_t uc);
extern bool uc_is_property_ids_trinary_operator (ucs4_t uc);
extern bool uc_is_property_zero_width (ucs4_t uc);
extern bool uc_is_property_space (ucs4_t uc);
extern bool uc_is_property_non_break (ucs4_t uc);
extern bool uc_is_property_iso_control (ucs4_t uc);
extern bool uc_is_property_format_control (ucs4_t uc);
extern bool uc_is_property_dash (ucs4_t uc);
extern bool uc_is_property_hyphen (ucs4_t uc);
extern bool uc_is_property_punctuation (ucs4_t uc);
extern bool uc_is_property_line_separator (ucs4_t uc);
extern bool uc_is_property_paragraph_separator (ucs4_t uc);
extern bool uc_is_property_quotation_mark (ucs4_t uc);
extern bool uc_is_property_sentence_terminal (ucs4_t uc);
extern bool uc_is_property_terminal_punctuation (ucs4_t uc);
extern bool uc_is_property_currency_symbol (ucs4_t uc);
extern bool uc_is_property_math (ucs4_t uc);
extern bool uc_is_property_other_math (ucs4_t uc);
extern bool uc_is_property_paired_punctuation (ucs4_t uc);
extern bool uc_is_property_left_of_pair (ucs4_t uc);
extern bool uc_is_property_combining (ucs4_t uc);
extern bool uc_is_property_composite (ucs4_t uc);
extern bool uc_is_property_decimal_digit (ucs4_t uc);
extern bool uc_is_property_numeric (ucs4_t uc);
extern bool uc_is_property_diacritic (ucs4_t uc);
extern bool uc_is_property_extender (ucs4_t uc);
extern bool uc_is_property_ignorable_control (ucs4_t uc);
554
555/* ========================================================================= */
556
557/* Subdivision of the Unicode characters into scripts.  */
558
/* One element of the interval list of a script (see uc_script_t).
   'code' is a 21-bit field, wide enough for any value up to 0x1FFFFF;
   'start' and 'end' are 1-bit flags.
   NOTE(review): presumably 'start' / 'end' mark whether 'code' is the
   first and/or last code point of an interval - confirm against the
   implementation.  */
typedef struct
{
  unsigned int code : 21;
  unsigned int start : 1;
  unsigned int end : 1;
}
uc_interval_t;
/* Description of a script: a read-only array of intervals, its element
   count, and a name string.
   NOTE(review): presumably 'nintervals' is the number of elements that
   'intervals' points to - confirm against the implementation.  */
typedef struct
{
  unsigned int nintervals;
  const uc_interval_t *intervals;
  const char *name;
}
uc_script_t;
573
/* Return the script of a Unicode character.
   The result points to a const uc_script_t.
   NOTE(review): the result for a character that belongs to no script
   (possibly NULL) is not stated here - confirm before dereferencing.  */
extern const uc_script_t *
       uc_script (ucs4_t uc);
577
/* Return the script given by name, e.g. "HAN".
   SCRIPT_NAME is a NUL-terminated C string that is not modified.
   NOTE(review): the result for an unrecognized name (possibly NULL) is not
   stated here - confirm before dereferencing.  */
extern const uc_script_t *
       uc_script_byname (const char *script_name);
581
/* Test whether a Unicode character belongs to a given script.
   SCRIPT points to a script description that is not modified.
   NOTE(review): whether a NULL SCRIPT is accepted is not stated here -
   confirm against the implementation.  */
extern bool
       uc_is_script (ucs4_t uc, const uc_script_t *script);
585
/* Get the list of all scripts.
   The function returns void; SCRIPTS and COUNT are presumably output
   parameters that receive a pointer to a read-only array of uc_script_t
   and its element count (confirm against the implementation).  */
extern void
       uc_all_scripts (const uc_script_t **scripts, size_t *count);
589
590/* ========================================================================= */
591
592/* Subdivision of the Unicode character range into blocks.  */
593
/* Description of a block: a pair of ucs4_t bounds and a name string.
   NOTE(review): presumably 'start' and 'end' are the first and last code
   point of the block, both inclusive - confirm against the
   implementation.  */
typedef struct
{
  ucs4_t start;
  ucs4_t end;
  const char *name;
}
uc_block_t;
601
/* Return the block a character belongs to.
   The result points to a const uc_block_t.
   NOTE(review): the result for a character outside every block (possibly
   NULL) is not stated here - confirm before dereferencing.  */
extern const uc_block_t *
       uc_block (ucs4_t uc);
605
/* Test whether a Unicode character belongs to a given block.
   BLOCK points to a block description that is not modified.
   NOTE(review): whether a NULL BLOCK is accepted is not stated here -
   confirm against the implementation.  */
extern bool
       uc_is_block (ucs4_t uc, const uc_block_t *block);
609
/* Get the list of all blocks.
   The function returns void; BLOCKS and COUNT are presumably output
   parameters that receive a pointer to a read-only array of uc_block_t and
   its element count (confirm against the implementation).  */
extern void
       uc_all_blocks (const uc_block_t **blocks, size_t *count);
613
614/* ========================================================================= */
615
616/* Properties taken from language standards.  */
617
/* Test whether a Unicode character is considered whitespace in ISO C 99.
   Takes the character by value and returns a bool.  */
extern bool
       uc_is_c_whitespace (ucs4_t uc);
621
/* Test whether a Unicode character is considered whitespace in Java.
   Takes the character by value and returns a bool.  */
extern bool
       uc_is_java_whitespace (ucs4_t uc);
625
/* Categories of a character with respect to an identifier syntax.
   No enumerator has an explicit initializer, so they number consecutively
   from UC_IDENTIFIER_START == 0 to UC_IDENTIFIER_IGNORABLE == 3.
   NOTE(review): presumably these are the values returned by
   uc_c_ident_category and uc_java_ident_category, whose prototypes return
   plain int - confirm against the implementation.  */
enum
{
  UC_IDENTIFIER_START,    /* valid as first or subsequent character */
  UC_IDENTIFIER_VALID,    /* valid as subsequent character only */
  UC_IDENTIFIER_INVALID,  /* not valid */
  UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
};
633
/* Return the categorization of a Unicode character w.r.t. the ISO C 99
   identifier syntax.
   The result is a plain int; presumably one of the UC_IDENTIFIER_*
   values.  */
extern int
       uc_c_ident_category (ucs4_t uc);
638
/* Return the categorization of a Unicode character w.r.t. the Java
   identifier syntax.
   The result is a plain int; presumably one of the UC_IDENTIFIER_* values,
   of which UC_IDENTIFIER_IGNORABLE is marked "Java only".  */
extern int
       uc_java_ident_category (ucs4_t uc);
643
644/* ========================================================================= */
645
646/* Like ISO C <ctype.h> and <wctype.h>.  These functions are deprecated,
647   because this set of functions was designed with ASCII in mind and cannot
648   reflect the more diverse reality of the Unicode character set.  But they
649   can be a quick-and-dirty porting aid when migrating from wchar_t APIs
650   to Unicode strings.  */
651
/* All of the following predicates take the character by value and return
   a bool.  */

/* Test for any character for which 'uc_is_alpha' or 'uc_is_digit' is true.  */
extern bool
       uc_is_alnum (ucs4_t uc);

/* Test for any character for which 'uc_is_upper' or 'uc_is_lower' is true,
   or any character that is one of a locale-specific set of characters for
   which none of 'uc_is_cntrl', 'uc_is_digit', 'uc_is_punct', or 'uc_is_space'
   is true.  */
extern bool
       uc_is_alpha (ucs4_t uc);

/* Test for any control character.  */
extern bool
       uc_is_cntrl (ucs4_t uc);

/* Test for any character that corresponds to a decimal-digit character.  */
extern bool
       uc_is_digit (ucs4_t uc);

/* Test for any character for which 'uc_is_print' is true and 'uc_is_space'
   is false.  */
extern bool
       uc_is_graph (ucs4_t uc);

/* Test for any character that corresponds to a lowercase letter or is one
   of a locale-specific set of characters for which none of 'uc_is_cntrl',
   'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true.  */
extern bool
       uc_is_lower (ucs4_t uc);

/* Test for any printing character.  */
extern bool
       uc_is_print (ucs4_t uc);

/* Test for any printing character that is one of a locale-specific set of
   characters for which neither 'uc_is_space' nor 'uc_is_alnum' is true.  */
extern bool
       uc_is_punct (ucs4_t uc);

/* Test for any character that corresponds to a locale-specific set of
   characters for which none of 'uc_is_alnum', 'uc_is_graph', or 'uc_is_punct'
   is true.  */
extern bool
       uc_is_space (ucs4_t uc);

/* Test for any character that corresponds to an uppercase letter or is one
   of a locale-specific set of characters for which none of 'uc_is_cntrl',
   'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true.  */
extern bool
       uc_is_upper (ucs4_t uc);

/* Test for any character that corresponds to a hexadecimal-digit
   character.  */
extern bool
       uc_is_xdigit (ucs4_t uc);

/* GNU extension.  */
/* Test for any character that corresponds to a standard blank character or
   a locale-specific set of characters for which 'uc_is_alnum' is false.  */
extern bool
       uc_is_blank (ucs4_t uc);
713
714/* ========================================================================= */
715
716#ifdef __cplusplus
717}
718#endif
719
720#endif /* _UNICTYPE_H */
721