1/* Unicode character classification and properties. 2 Copyright (C) 2002, 2005-2010 Free Software Foundation, Inc. 3 4 This program is free software: you can redistribute it and/or modify it 5 under the terms of the GNU Lesser General Public License as published 6 by the Free Software Foundation; either version 3 of the License, or 7 (at your option) any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 Lesser General Public License for more details. 13 14 You should have received a copy of the GNU Lesser General Public License 15 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 16 17#ifndef _UNICTYPE_H 18#define _UNICTYPE_H 19 20#include "unitypes.h" 21 22/* Get LIBUNISTRING_DLL_VARIABLE. */ 23#include <unistring/woe32dll.h> 24 25/* Get bool. */ 26#include <unistring/stdbool.h> 27 28/* Get size_t. */ 29#include <stddef.h> 30 31#ifdef __cplusplus 32extern "C" { 33#endif 34 35/* ========================================================================= */ 36 37/* Field 1 of Unicode Character Database: Character name. 38 See "uniname.h". */ 39 40/* ========================================================================= */ 41 42/* Field 2 of Unicode Character Database: General category. */ 43 44/* Data type denoting a General category value. This is not just a bitmask, 45 but rather a bitmask and a pointer to the lookup table, so that programs 46 that use only the predefined bitmasks (i.e. don't combine bitmasks with & 47 and |) don't have a link-time dependency towards the big general table. */ 48typedef struct 49{ 50 uint32_t bitmask : 31; 51 /*bool*/ unsigned int generic : 1; 52 union 53 { 54 const void *table; /* when generic is 0 */ 55 bool (*lookup_fn) (ucs4_t uc, uint32_t bitmask); /* when generic is 1 */ 56 } lookup; 57} 58uc_general_category_t; 59 60/* Bits and bit masks denoting General category values. UnicodeData-3.2.0.html 61 says a 32-bit integer will always suffice to represent them. 62 These bit masks can only be used with the uc_is_general_category_withtable 63 function. */ 64enum 65{ 66 UC_CATEGORY_MASK_L = 0x0000001f, 67 UC_CATEGORY_MASK_Lu = 0x00000001, 68 UC_CATEGORY_MASK_Ll = 0x00000002, 69 UC_CATEGORY_MASK_Lt = 0x00000004, 70 UC_CATEGORY_MASK_Lm = 0x00000008, 71 UC_CATEGORY_MASK_Lo = 0x00000010, 72 UC_CATEGORY_MASK_M = 0x000000e0, 73 UC_CATEGORY_MASK_Mn = 0x00000020, 74 UC_CATEGORY_MASK_Mc = 0x00000040, 75 UC_CATEGORY_MASK_Me = 0x00000080, 76 UC_CATEGORY_MASK_N = 0x00000700, 77 UC_CATEGORY_MASK_Nd = 0x00000100, 78 UC_CATEGORY_MASK_Nl = 0x00000200, 79 UC_CATEGORY_MASK_No = 0x00000400, 80 UC_CATEGORY_MASK_P = 0x0003f800, 81 UC_CATEGORY_MASK_Pc = 0x00000800, 82 UC_CATEGORY_MASK_Pd = 0x00001000, 83 UC_CATEGORY_MASK_Ps = 0x00002000, 84 UC_CATEGORY_MASK_Pe = 0x00004000, 85 UC_CATEGORY_MASK_Pi = 0x00008000, 86 UC_CATEGORY_MASK_Pf = 0x00010000, 87 UC_CATEGORY_MASK_Po = 0x00020000, 88 UC_CATEGORY_MASK_S = 0x003c0000, 89 UC_CATEGORY_MASK_Sm = 0x00040000, 90 UC_CATEGORY_MASK_Sc = 0x00080000, 91 UC_CATEGORY_MASK_Sk = 0x00100000, 92 UC_CATEGORY_MASK_So = 0x00200000, 93 UC_CATEGORY_MASK_Z = 0x01c00000, 94 UC_CATEGORY_MASK_Zs = 0x00400000, 95 UC_CATEGORY_MASK_Zl = 0x00800000, 96 UC_CATEGORY_MASK_Zp = 0x01000000, 97 UC_CATEGORY_MASK_C = 0x3e000000, 98 UC_CATEGORY_MASK_Cc = 0x02000000, 99 UC_CATEGORY_MASK_Cf = 0x04000000, 100 UC_CATEGORY_MASK_Cs = 0x08000000, 101 UC_CATEGORY_MASK_Co = 0x10000000, 102 UC_CATEGORY_MASK_Cn = 0x20000000 103}; 104 105/* Predefined General category values. */ 106extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_L; 107extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Lu; 108extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Ll; 109extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Lt; 110extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Lm; 111extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Lo; 112extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_M; 113extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Mn; 114extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Mc; 115extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Me; 116extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_N; 117extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Nd; 118extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Nl; 119extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_No; 120extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_P; 121extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pc; 122extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pd; 123extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Ps; 124extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pe; 125extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pi; 126extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pf; 127extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Po; 128extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_S; 129extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Sm; 130extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Sc; 131extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Sk; 132extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_So; 133extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Z; 134extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Zs; 135extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Zl; 136extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Zp; 137extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_C; 138extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Cc; 139extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Cf; 140extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Cs; 141extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Co; 142extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Cn; 143/* Non-public. */ 144extern const uc_general_category_t _UC_CATEGORY_NONE; 145 146/* Alias names for predefined General category values. */ 147#define UC_LETTER UC_CATEGORY_L 148#define UC_UPPERCASE_LETTER UC_CATEGORY_Lu 149#define UC_LOWERCASE_LETTER UC_CATEGORY_Ll 150#define UC_TITLECASE_LETTER UC_CATEGORY_Lt 151#define UC_MODIFIER_LETTER UC_CATEGORY_Lm 152#define UC_OTHER_LETTER UC_CATEGORY_Lo 153#define UC_MARK UC_CATEGORY_M 154#define UC_NON_SPACING_MARK UC_CATEGORY_Mn 155#define UC_COMBINING_SPACING_MARK UC_CATEGORY_Mc 156#define UC_ENCLOSING_MARK UC_CATEGORY_Me 157#define UC_NUMBER UC_CATEGORY_N 158#define UC_DECIMAL_DIGIT_NUMBER UC_CATEGORY_Nd 159#define UC_LETTER_NUMBER UC_CATEGORY_Nl 160#define UC_OTHER_NUMBER UC_CATEGORY_No 161#define UC_PUNCTUATION UC_CATEGORY_P 162#define UC_CONNECTOR_PUNCTUATION UC_CATEGORY_Pc 163#define UC_DASH_PUNCTUATION UC_CATEGORY_Pd 164#define UC_OPEN_PUNCTUATION UC_CATEGORY_Ps /* a.k.a. UC_START_PUNCTUATION */ 165#define UC_CLOSE_PUNCTUATION UC_CATEGORY_Pe /* a.k.a. UC_END_PUNCTUATION */ 166#define UC_INITIAL_QUOTE_PUNCTUATION UC_CATEGORY_Pi 167#define UC_FINAL_QUOTE_PUNCTUATION UC_CATEGORY_Pf 168#define UC_OTHER_PUNCTUATION UC_CATEGORY_Po 169#define UC_SYMBOL UC_CATEGORY_S 170#define UC_MATH_SYMBOL UC_CATEGORY_Sm 171#define UC_CURRENCY_SYMBOL UC_CATEGORY_Sc 172#define UC_MODIFIER_SYMBOL UC_CATEGORY_Sk 173#define UC_OTHER_SYMBOL UC_CATEGORY_So 174#define UC_SEPARATOR UC_CATEGORY_Z 175#define UC_SPACE_SEPARATOR UC_CATEGORY_Zs 176#define UC_LINE_SEPARATOR UC_CATEGORY_Zl 177#define UC_PARAGRAPH_SEPARATOR UC_CATEGORY_Zp 178#define UC_OTHER UC_CATEGORY_C 179#define UC_CONTROL UC_CATEGORY_Cc 180#define UC_FORMAT UC_CATEGORY_Cf 181#define UC_SURROGATE UC_CATEGORY_Cs /* all of them are invalid characters */ 182#define UC_PRIVATE_USE UC_CATEGORY_Co 183#define UC_UNASSIGNED UC_CATEGORY_Cn /* some of them are invalid characters */ 184 185/* Return the union of two general categories. 186 This corresponds to the unions of the two sets of characters. */ 187extern uc_general_category_t 188 uc_general_category_or (uc_general_category_t category1, 189 uc_general_category_t category2); 190 191/* Return the intersection of two general categories as bit masks. 192 This *does*not* correspond to the intersection of the two sets of 193 characters. */ 194extern uc_general_category_t 195 uc_general_category_and (uc_general_category_t category1, 196 uc_general_category_t category2); 197 198/* Return the intersection of a general category with the complement of a 199 second general category, as bit masks. 200 This *does*not* correspond to the intersection with complement, when 201 viewing the categories as sets of characters. */ 202extern uc_general_category_t 203 uc_general_category_and_not (uc_general_category_t category1, 204 uc_general_category_t category2); 205 206/* Return the name of a general category. */ 207extern const char * 208 uc_general_category_name (uc_general_category_t category); 209 210/* Return the general category given by name, e.g. "Lu". */ 211extern uc_general_category_t 212 uc_general_category_byname (const char *category_name); 213 214/* Return the general category of a Unicode character. */ 215extern uc_general_category_t 216 uc_general_category (ucs4_t uc); 217 218/* Test whether a Unicode character belongs to a given category. 219 The CATEGORY argument can be the combination of several predefined 220 general categories. */ 221extern bool 222 uc_is_general_category (ucs4_t uc, uc_general_category_t category); 223/* Likewise. This function uses a big table comprising all categories. */ 224extern bool 225 uc_is_general_category_withtable (ucs4_t uc, uint32_t bitmask); 226 227/* ========================================================================= */ 228 229/* Field 3 of Unicode Character Database: Canonical combining class. */ 230 231/* The possible results of uc_combining_class (0..255) are described in 232 UCD.html. The list here is not definitive; more values can be added 233 in future versions. */ 234enum 235{ 236 UC_CCC_NR = 0, /* Not Reordered */ 237 UC_CCC_OV = 1, /* Overlay */ 238 UC_CCC_NK = 7, /* Nukta */ 239 UC_CCC_KV = 8, /* Kana Voicing */ 240 UC_CCC_VR = 9, /* Virama */ 241 UC_CCC_ATBL = 200, /* Attached Below Left */ 242 UC_CCC_ATB = 202, /* Attached Below */ 243 UC_CCC_ATAR = 216, /* Attached Above Right */ 244 UC_CCC_BL = 218, /* Below Left */ 245 UC_CCC_B = 220, /* Below */ 246 UC_CCC_BR = 222, /* Below Right */ 247 UC_CCC_L = 224, /* Left */ 248 UC_CCC_R = 226, /* Right */ 249 UC_CCC_AL = 228, /* Above Left */ 250 UC_CCC_A = 230, /* Above */ 251 UC_CCC_AR = 232, /* Above Right */ 252 UC_CCC_DB = 233, /* Double Below */ 253 UC_CCC_DA = 234, /* Double Above */ 254 UC_CCC_IS = 240 /* Iota Subscript */ 255}; 256 257/* Return the canonical combining class of a Unicode character. */ 258extern int 259 uc_combining_class (ucs4_t uc); 260 261/* ========================================================================= */ 262 263/* Field 4 of Unicode Character Database: Bidirectional category. */ 264 265enum 266{ 267 UC_BIDI_L, /* Left-to-Right */ 268 UC_BIDI_LRE, /* Left-to-Right Embedding */ 269 UC_BIDI_LRO, /* Left-to-Right Override */ 270 UC_BIDI_R, /* Right-to-Left */ 271 UC_BIDI_AL, /* Right-to-Left Arabic */ 272 UC_BIDI_RLE, /* Right-to-Left Embedding */ 273 UC_BIDI_RLO, /* Right-to-Left Override */ 274 UC_BIDI_PDF, /* Pop Directional Format */ 275 UC_BIDI_EN, /* European Number */ 276 UC_BIDI_ES, /* European Number Separator */ 277 UC_BIDI_ET, /* European Number Terminator */ 278 UC_BIDI_AN, /* Arabic Number */ 279 UC_BIDI_CS, /* Common Number Separator */ 280 UC_BIDI_NSM, /* Non-Spacing Mark */ 281 UC_BIDI_BN, /* Boundary Neutral */ 282 UC_BIDI_B, /* Paragraph Separator */ 283 UC_BIDI_S, /* Segment Separator */ 284 UC_BIDI_WS, /* Whitespace */ 285 UC_BIDI_ON /* Other Neutral */ 286}; 287 288/* Return the name of a bidirectional category. */ 289extern const char * 290 uc_bidi_category_name (int category); 291 292/* Return the bidirectional category given by name, e.g. "LRE". */ 293extern int 294 uc_bidi_category_byname (const char *category_name); 295 296/* Return the bidirectional category of a Unicode character. */ 297extern int 298 uc_bidi_category (ucs4_t uc); 299 300/* Test whether a Unicode character belongs to a given bidirectional 301 category. */ 302extern bool 303 uc_is_bidi_category (ucs4_t uc, int category); 304 305/* ========================================================================= */ 306 307/* Field 5 of Unicode Character Database: Character decomposition mapping. 308 See "uninorm.h". */ 309 310/* ========================================================================= */ 311 312/* Field 6 of Unicode Character Database: Decimal digit value. */ 313 314/* Return the decimal digit value of a Unicode character. */ 315extern int 316 uc_decimal_value (ucs4_t uc); 317 318/* ========================================================================= */ 319 320/* Field 7 of Unicode Character Database: Digit value. */ 321 322/* Return the digit value of a Unicode character. */ 323extern int 324 uc_digit_value (ucs4_t uc); 325 326/* ========================================================================= */ 327 328/* Field 8 of Unicode Character Database: Numeric value. */ 329 330/* Return the numeric value of a Unicode character. */ 331typedef struct 332{ 333 int numerator; 334 int denominator; 335} 336uc_fraction_t; 337extern uc_fraction_t 338 uc_numeric_value (ucs4_t uc); 339 340/* ========================================================================= */ 341 342/* Field 9 of Unicode Character Database: Mirrored. */ 343 344/* Return the mirrored character of a Unicode character UC in *PUC. */ 345extern bool 346 uc_mirror_char (ucs4_t uc, ucs4_t *puc); 347 348/* ========================================================================= */ 349 350/* Field 10 of Unicode Character Database: Unicode 1.0 Name. 351 Not available in this library. */ 352 353/* ========================================================================= */ 354 355/* Field 11 of Unicode Character Database: ISO 10646 comment. 356 Not available in this library. */ 357 358/* ========================================================================= */ 359 360/* Field 12, 13, 14 of Unicode Character Database: Uppercase mapping, 361 lowercase mapping, titlecase mapping. See "unicase.h". */ 362 363/* ========================================================================= */ 364 365/* Common API for properties. */ 366 367/* Data type denoting a property. This is not just a number, but rather a 368 pointer to the test functions, so that programs that use only few of the 369 properties don't have a link-time dependency towards all the tables. */ 370typedef struct 371{ 372 bool (*test_fn) (ucs4_t uc); 373} 374uc_property_t; 375 376/* Predefined properties. */ 377/* General. */ 378extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_WHITE_SPACE; 379extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ALPHABETIC; 380extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_ALPHABETIC; 381extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_NOT_A_CHARACTER; 382extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DEFAULT_IGNORABLE_CODE_POINT; 383extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT; 384extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DEPRECATED; 385extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_LOGICAL_ORDER_EXCEPTION; 386extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_VARIATION_SELECTOR; 387extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PRIVATE_USE; 388extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_UNASSIGNED_CODE_VALUE; 389/* Case. */ 390extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_UPPERCASE; 391extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_UPPERCASE; 392extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_LOWERCASE; 393extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_LOWERCASE; 394extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_TITLECASE; 395extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_SOFT_DOTTED; 396/* Identifiers. */ 397extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ID_START; 398extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_ID_START; 399extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ID_CONTINUE; 400extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_ID_CONTINUE; 401extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_XID_START; 402extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_XID_CONTINUE; 403extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PATTERN_WHITE_SPACE; 404extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PATTERN_SYNTAX; 405/* Shaping and rendering. */ 406extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_JOIN_CONTROL; 407extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_GRAPHEME_BASE; 408extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_GRAPHEME_EXTEND; 409extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_GRAPHEME_EXTEND; 410extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_GRAPHEME_LINK; 411/* Bidi. */ 412extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_CONTROL; 413extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_LEFT_TO_RIGHT; 414extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_HEBREW_RIGHT_TO_LEFT; 415extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_ARABIC_RIGHT_TO_LEFT; 416extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_EUROPEAN_DIGIT; 417extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_EUR_NUM_SEPARATOR; 418extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_EUR_NUM_TERMINATOR; 419extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_ARABIC_DIGIT; 420extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_COMMON_SEPARATOR; 421extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_BLOCK_SEPARATOR; 422extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_SEGMENT_SEPARATOR; 423extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_WHITESPACE; 424extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_NON_SPACING_MARK; 425extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_BOUNDARY_NEUTRAL; 426extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_PDF; 427extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_EMBEDDING_OR_OVERRIDE; 428extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_OTHER_NEUTRAL; 429/* Numeric. */ 430extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_HEX_DIGIT; 431extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ASCII_HEX_DIGIT; 432/* CJK. */ 433extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_IDEOGRAPHIC; 434extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_UNIFIED_IDEOGRAPH; 435extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_RADICAL; 436extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_IDS_BINARY_OPERATOR; 437extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_IDS_TRINARY_OPERATOR; 438/* Misc. */ 439extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ZERO_WIDTH; 440extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_SPACE; 441extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_NON_BREAK; 442extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ISO_CONTROL; 443extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_FORMAT_CONTROL; 444extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DASH; 445extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_HYPHEN; 446extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PUNCTUATION; 447extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_LINE_SEPARATOR; 448extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PARAGRAPH_SEPARATOR; 449extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_QUOTATION_MARK; 450extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_SENTENCE_TERMINAL; 451extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_TERMINAL_PUNCTUATION; 452extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_CURRENCY_SYMBOL; 453extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_MATH; 454extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_MATH; 455extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PAIRED_PUNCTUATION; 456extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_LEFT_OF_PAIR; 457extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_COMBINING; 458extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_COMPOSITE; 459extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DECIMAL_DIGIT; 460extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_NUMERIC; 461extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DIACRITIC; 462extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_EXTENDER; 463extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_IGNORABLE_CONTROL; 464 465/* Return the property given by name, e.g. "White space". */ 466extern uc_property_t 467 uc_property_byname (const char *property_name); 468 469/* Test whether a property is valid. */ 470#define uc_property_is_valid(property) ((property).test_fn != NULL) 471 472/* Test whether a Unicode character has a given property. */ 473extern bool 474 uc_is_property (ucs4_t uc, uc_property_t property); 475extern bool uc_is_property_white_space (ucs4_t uc); 476extern bool uc_is_property_alphabetic (ucs4_t uc); 477extern bool uc_is_property_other_alphabetic (ucs4_t uc); 478extern bool uc_is_property_not_a_character (ucs4_t uc); 479extern bool uc_is_property_default_ignorable_code_point (ucs4_t uc); 480extern bool uc_is_property_other_default_ignorable_code_point (ucs4_t uc); 481extern bool uc_is_property_deprecated (ucs4_t uc); 482extern bool uc_is_property_logical_order_exception (ucs4_t uc); 483extern bool uc_is_property_variation_selector (ucs4_t uc); 484extern bool uc_is_property_private_use (ucs4_t uc); 485extern bool uc_is_property_unassigned_code_value (ucs4_t uc); 486extern bool uc_is_property_uppercase (ucs4_t uc); 487extern bool uc_is_property_other_uppercase (ucs4_t uc); 488extern bool uc_is_property_lowercase (ucs4_t uc); 489extern bool uc_is_property_other_lowercase (ucs4_t uc); 490extern bool uc_is_property_titlecase (ucs4_t uc); 491extern bool uc_is_property_soft_dotted (ucs4_t uc); 492extern bool uc_is_property_id_start (ucs4_t uc); 493extern bool uc_is_property_other_id_start (ucs4_t uc); 494extern bool uc_is_property_id_continue (ucs4_t uc); 495extern bool uc_is_property_other_id_continue (ucs4_t uc); 496extern bool uc_is_property_xid_start (ucs4_t uc); 497extern bool uc_is_property_xid_continue (ucs4_t uc); 498extern bool uc_is_property_pattern_white_space (ucs4_t uc); 499extern bool uc_is_property_pattern_syntax (ucs4_t uc); 500extern bool uc_is_property_join_control (ucs4_t uc); 501extern bool uc_is_property_grapheme_base (ucs4_t uc); 502extern bool uc_is_property_grapheme_extend (ucs4_t uc); 503extern bool uc_is_property_other_grapheme_extend (ucs4_t uc); 504extern bool uc_is_property_grapheme_link (ucs4_t uc); 505extern bool uc_is_property_bidi_control (ucs4_t uc); 506extern bool uc_is_property_bidi_left_to_right (ucs4_t uc); 507extern bool uc_is_property_bidi_hebrew_right_to_left (ucs4_t uc); 508extern bool uc_is_property_bidi_arabic_right_to_left (ucs4_t uc); 509extern bool uc_is_property_bidi_european_digit (ucs4_t uc); 510extern bool uc_is_property_bidi_eur_num_separator (ucs4_t uc); 511extern bool uc_is_property_bidi_eur_num_terminator (ucs4_t uc); 512extern bool uc_is_property_bidi_arabic_digit (ucs4_t uc); 513extern bool uc_is_property_bidi_common_separator (ucs4_t uc); 514extern bool uc_is_property_bidi_block_separator (ucs4_t uc); 515extern bool uc_is_property_bidi_segment_separator (ucs4_t uc); 516extern bool uc_is_property_bidi_whitespace (ucs4_t uc); 517extern bool uc_is_property_bidi_non_spacing_mark (ucs4_t uc); 518extern bool uc_is_property_bidi_boundary_neutral (ucs4_t uc); 519extern bool uc_is_property_bidi_pdf (ucs4_t uc); 520extern bool uc_is_property_bidi_embedding_or_override (ucs4_t uc); 521extern bool uc_is_property_bidi_other_neutral (ucs4_t uc); 522extern bool uc_is_property_hex_digit (ucs4_t uc); 523extern bool uc_is_property_ascii_hex_digit (ucs4_t uc); 524extern bool uc_is_property_ideographic (ucs4_t uc); 525extern bool uc_is_property_unified_ideograph (ucs4_t uc); 526extern bool uc_is_property_radical (ucs4_t uc); 527extern bool uc_is_property_ids_binary_operator (ucs4_t uc); 528extern bool uc_is_property_ids_trinary_operator (ucs4_t uc); 529extern bool uc_is_property_zero_width (ucs4_t uc); 530extern bool uc_is_property_space (ucs4_t uc); 531extern bool uc_is_property_non_break (ucs4_t uc); 532extern bool uc_is_property_iso_control (ucs4_t uc); 533extern bool uc_is_property_format_control (ucs4_t uc); 534extern bool uc_is_property_dash (ucs4_t uc); 535extern bool uc_is_property_hyphen (ucs4_t uc); 536extern bool uc_is_property_punctuation (ucs4_t uc); 537extern bool uc_is_property_line_separator (ucs4_t uc); 538extern bool uc_is_property_paragraph_separator (ucs4_t uc); 539extern bool uc_is_property_quotation_mark (ucs4_t uc); 540extern bool uc_is_property_sentence_terminal (ucs4_t uc); 541extern bool uc_is_property_terminal_punctuation (ucs4_t uc); 542extern bool uc_is_property_currency_symbol (ucs4_t uc); 543extern bool uc_is_property_math (ucs4_t uc); 544extern bool uc_is_property_other_math (ucs4_t uc); 545extern bool uc_is_property_paired_punctuation (ucs4_t uc); 546extern bool uc_is_property_left_of_pair (ucs4_t uc); 547extern bool uc_is_property_combining (ucs4_t uc); 548extern bool uc_is_property_composite (ucs4_t uc); 549extern bool uc_is_property_decimal_digit (ucs4_t uc); 550extern bool uc_is_property_numeric (ucs4_t uc); 551extern bool uc_is_property_diacritic (ucs4_t uc); 552extern bool uc_is_property_extender (ucs4_t uc); 553extern bool uc_is_property_ignorable_control (ucs4_t uc); 554 555/* ========================================================================= */ 556 557/* Subdivision of the Unicode characters into scripts. */ 558 559typedef struct 560{ 561 unsigned int code : 21; 562 unsigned int start : 1; 563 unsigned int end : 1; 564} 565uc_interval_t; 566typedef struct 567{ 568 unsigned int nintervals; 569 const uc_interval_t *intervals; 570 const char *name; 571} 572uc_script_t; 573 574/* Return the script of a Unicode character. */ 575extern const uc_script_t * 576 uc_script (ucs4_t uc); 577 578/* Return the script given by name, e.g. "HAN". */ 579extern const uc_script_t * 580 uc_script_byname (const char *script_name); 581 582/* Test whether a Unicode character belongs to a given script. */ 583extern bool 584 uc_is_script (ucs4_t uc, const uc_script_t *script); 585 586/* Get the list of all scripts. */ 587extern void 588 uc_all_scripts (const uc_script_t **scripts, size_t *count); 589 590/* ========================================================================= */ 591 592/* Subdivision of the Unicode character range into blocks. */ 593 594typedef struct 595{ 596 ucs4_t start; 597 ucs4_t end; 598 const char *name; 599} 600uc_block_t; 601 602/* Return the block a character belongs to. */ 603extern const uc_block_t * 604 uc_block (ucs4_t uc); 605 606/* Test whether a Unicode character belongs to a given block. */ 607extern bool 608 uc_is_block (ucs4_t uc, const uc_block_t *block); 609 610/* Get the list of all blocks. */ 611extern void 612 uc_all_blocks (const uc_block_t **blocks, size_t *count); 613 614/* ========================================================================= */ 615 616/* Properties taken from language standards. */ 617 618/* Test whether a Unicode character is considered whitespace in ISO C 99. */ 619extern bool 620 uc_is_c_whitespace (ucs4_t uc); 621 622/* Test whether a Unicode character is considered whitespace in Java. */ 623extern bool 624 uc_is_java_whitespace (ucs4_t uc); 625 626enum 627{ 628 UC_IDENTIFIER_START, /* valid as first or subsequent character */ 629 UC_IDENTIFIER_VALID, /* valid as subsequent character only */ 630 UC_IDENTIFIER_INVALID, /* not valid */ 631 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */ 632}; 633 634/* Return the categorization of a Unicode character w.r.t. the ISO C 99 635 identifier syntax. */ 636extern int 637 uc_c_ident_category (ucs4_t uc); 638 639/* Return the categorization of a Unicode character w.r.t. the Java 640 identifier syntax. */ 641extern int 642 uc_java_ident_category (ucs4_t uc); 643 644/* ========================================================================= */ 645 646/* Like ISO C <ctype.h> and <wctype.h>. These functions are deprecated, 647 because this set of functions was designed with ASCII in mind and cannot 648 reflect the more diverse reality of the Unicode character set. But they 649 can be a quick-and-dirty porting aid when migrating from wchar_t APIs 650 to Unicode strings. */ 651 652/* Test for any character for which 'uc_is_alpha' or 'uc_is_digit' is true. */ 653extern bool 654 uc_is_alnum (ucs4_t uc); 655 656/* Test for any character for which 'uc_is_upper' or 'uc_is_lower' is true, 657 or any character that is one of a locale-specific set of characters for 658 which none of 'uc_is_cntrl', 'uc_is_digit', 'uc_is_punct', or 'uc_is_space' 659 is true. */ 660extern bool 661 uc_is_alpha (ucs4_t uc); 662 663/* Test for any control character. */ 664extern bool 665 uc_is_cntrl (ucs4_t uc); 666 667/* Test for any character that corresponds to a decimal-digit character. */ 668extern bool 669 uc_is_digit (ucs4_t uc); 670 671/* Test for any character for which 'uc_is_print' is true and 'uc_is_space' 672 is false. */ 673extern bool 674 uc_is_graph (ucs4_t uc); 675 676/* Test for any character that corresponds to a lowercase letter or is one 677 of a locale-specific set of characters for which none of 'uc_is_cntrl', 678 'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true. */ 679extern bool 680 uc_is_lower (ucs4_t uc); 681 682/* Test for any printing character. */ 683extern bool 684 uc_is_print (ucs4_t uc); 685 686/* Test for any printing character that is one of a locale-specific set of 687 characters for which neither 'uc_is_space' nor 'uc_is_alnum' is true. */ 688extern bool 689 uc_is_punct (ucs4_t uc); 690 691/* Test for any character that corresponds to a locale-specific set of 692 characters for which none of 'uc_is_alnum', 'uc_is_graph', or 'uc_is_punct' 693 is true. */ 694extern bool 695 uc_is_space (ucs4_t uc); 696 697/* Test for any character that corresponds to an uppercase letter or is one 698 of a locale-specific set of character for which none of 'uc_is_cntrl', 699 'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true. */ 700extern bool 701 uc_is_upper (ucs4_t uc); 702 703/* Test for any character that corresponds to a hexadecimal-digit 704 character. */ 705extern bool 706 uc_is_xdigit (ucs4_t uc); 707 708/* GNU extension. */ 709/* Test for any character that corresponds to a standard blank character or 710 a locale-specific set of characters for which 'uc_is_alnum' is false. */ 711extern bool 712 uc_is_blank (ucs4_t uc); 713 714/* ========================================================================= */ 715 716#ifdef __cplusplus 717} 718#endif 719 720#endif /* _UNICTYPE_H */ 721