1#ifndef _UNICODE_CHAR_H_
2#define _UNICODE_CHAR_H_
3
4#include <SupportDefs.h>
5
enum unicode_char_category {
	// Placeholder (not a real category) used for unassigned and
	// non-character code points.
	B_UNICODE_UNASSIGNED = 0,

	// Letters (L*)
	B_UNICODE_UPPERCASE_LETTER = 1,			// Lu
	B_UNICODE_LOWERCASE_LETTER = 2,			// Ll
	B_UNICODE_TITLECASE_LETTER = 3,			// Lt
	B_UNICODE_MODIFIER_LETTER = 4,			// Lm
	B_UNICODE_OTHER_LETTER = 5,				// Lo

	// Marks (M*)
	B_UNICODE_NON_SPACING_MARK = 6,			// Mn
	B_UNICODE_ENCLOSING_MARK = 7,			// Me
	B_UNICODE_COMBINING_SPACING_MARK = 8,	// Mc

	// Numbers (N*)
	B_UNICODE_DECIMAL_DIGIT_NUMBER = 9,		// Nd
	B_UNICODE_LETTER_NUMBER = 10,			// Nl
	B_UNICODE_OTHER_NUMBER = 11,			// No

	// Separators (Z*)
	B_UNICODE_SPACE_SEPARATOR = 12,			// Zs
	B_UNICODE_LINE_SEPARATOR = 13,			// Zl
	B_UNICODE_PARAGRAPH_SEPARATOR = 14,		// Zp

	// Control, format, private use and surrogates (C*)
	B_UNICODE_CONTROL_CHAR = 15,			// Cc
	B_UNICODE_FORMAT_CHAR = 16,				// Cf
	B_UNICODE_PRIVATE_USE_CHAR = 17,		// Co
	B_UNICODE_SURROGATE = 18,				// Cs

	// Punctuation (P*)
	B_UNICODE_DASH_PUNCTUATION = 19,		// Pd
	B_UNICODE_START_PUNCTUATION = 20,		// Ps
	B_UNICODE_END_PUNCTUATION = 21,			// Pe
	B_UNICODE_CONNECTOR_PUNCTUATION = 22,	// Pc
	B_UNICODE_OTHER_PUNCTUATION = 23,		// Po

	// Symbols (S*)
	B_UNICODE_MATH_SYMBOL = 24,				// Sm
	B_UNICODE_CURRENCY_SYMBOL = 25,			// Sc
	B_UNICODE_MODIFIER_SYMBOL = 26,			// Sk
	B_UNICODE_OTHER_SYMBOL = 27,			// So

	// Punctuation, continued (P*)
	B_UNICODE_INITIAL_PUNCTUATION = 28,		// Pi
	B_UNICODE_FINAL_PUNCTUATION = 29,		// Pf

	B_UNICODE_GENERAL_OTHER_TYPES = 30,		// Cn

	// One past the highest value above; keep this entry last.
	B_UNICODE_CATEGORY_COUNT
};
44
45
/**
 * This specifies the directional (bidirectional text) property of a
 * character.
 */
49
enum unicode_char_direction {
	// Strong types
	B_UNICODE_LEFT_TO_RIGHT = 0,				// L
	B_UNICODE_RIGHT_TO_LEFT = 1,				// R

	// Numbers, separators and neutrals
	B_UNICODE_EUROPEAN_NUMBER = 2,				// EN
	B_UNICODE_EUROPEAN_NUMBER_SEPARATOR = 3,	// ES
	B_UNICODE_EUROPEAN_NUMBER_TERMINATOR = 4,	// ET
	B_UNICODE_ARABIC_NUMBER = 5,				// AN
	B_UNICODE_COMMON_NUMBER_SEPARATOR = 6,		// CS
	B_UNICODE_BLOCK_SEPARATOR = 7,				// B
	B_UNICODE_SEGMENT_SEPARATOR = 8,			// S
	B_UNICODE_WHITE_SPACE_NEUTRAL = 9,			// WS
	B_UNICODE_OTHER_NEUTRAL = 10,				// ON

	// Embeddings, overrides and remaining types
	B_UNICODE_LEFT_TO_RIGHT_EMBEDDING = 11,		// LRE
	B_UNICODE_LEFT_TO_RIGHT_OVERRIDE = 12,		// LRO
	B_UNICODE_RIGHT_TO_LEFT_ARABIC = 13,		// AL
	B_UNICODE_RIGHT_TO_LEFT_EMBEDDING = 14,		// RLE
	B_UNICODE_RIGHT_TO_LEFT_OVERRIDE = 15,		// RLO
	B_UNICODE_POP_DIRECTIONAL_FORMAT = 16,		// PDF
	B_UNICODE_DIR_NON_SPACING_MARK = 17,		// NSM
	B_UNICODE_BOUNDARY_NEUTRAL = 18,			// BN

	// One past the highest value above; keep this entry last.
	B_UNICODE_DIRECTION_COUNT
};
73
74
/**
 * Script ranges (character blocks) as defined in the Unicode standard.
 */
78
enum unicode_char_script {
	// Script names. The values are consecutive and start at 0; they are
	// spelled out so that the numeric value of an entry can be read off
	// directly. New entries must keep the sequence gap-free because
	// B_UNICODE_SCRIPT_COUNT is derived from the last one.
	B_UNICODE_BASIC_LATIN = 0,
	B_UNICODE_LATIN_1_SUPPLEMENT = 1,
	B_UNICODE_LATIN_EXTENDED_A = 2,
	B_UNICODE_LATIN_EXTENDED_B = 3,
	B_UNICODE_IPA_EXTENSIONS = 4,
	B_UNICODE_SPACING_MODIFIER_LETTERS = 5,
	B_UNICODE_COMBINING_DIACRITICAL_MARKS = 6,
	B_UNICODE_GREEK = 7,
	B_UNICODE_CYRILLIC = 8,
	B_UNICODE_ARMENIAN = 9,

	B_UNICODE_HEBREW = 10,
	B_UNICODE_ARABIC = 11,
	B_UNICODE_SYRIAC = 12,
	B_UNICODE_THAANA = 13,
	B_UNICODE_DEVANAGARI = 14,
	B_UNICODE_BENGALI = 15,
	B_UNICODE_GURMUKHI = 16,
	B_UNICODE_GUJARATI = 17,
	B_UNICODE_ORIYA = 18,
	B_UNICODE_TAMIL = 19,

	B_UNICODE_TELUGU = 20,
	B_UNICODE_KANNADA = 21,
	B_UNICODE_MALAYALAM = 22,
	B_UNICODE_SINHALA = 23,
	B_UNICODE_THAI = 24,
	B_UNICODE_LAO = 25,
	B_UNICODE_TIBETAN = 26,
	B_UNICODE_MYANMAR = 27,
	B_UNICODE_GEORGIAN = 28,
	B_UNICODE_HANGUL_JAMO = 29,

	B_UNICODE_ETHIOPIC = 30,
	B_UNICODE_CHEROKEE = 31,
	B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 32,
	B_UNICODE_OGHAM = 33,
	B_UNICODE_RUNIC = 34,
	B_UNICODE_KHMER = 35,
	B_UNICODE_MONGOLIAN = 36,
	B_UNICODE_LATIN_EXTENDED_ADDITIONAL = 37,
	B_UNICODE_GREEK_EXTENDED = 38,
	B_UNICODE_GENERAL_PUNCTUATION = 39,

	B_UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS = 40,
	B_UNICODE_CURRENCY_SYMBOLS = 41,
	B_UNICODE_COMBINING_MARKS_FOR_SYMBOLS = 42,
	B_UNICODE_LETTERLIKE_SYMBOLS = 43,
	B_UNICODE_NUMBER_FORMS = 44,
	B_UNICODE_ARROWS = 45,
	B_UNICODE_MATHEMATICAL_OPERATORS = 46,
	B_UNICODE_MISCELLANEOUS_TECHNICAL = 47,
	B_UNICODE_CONTROL_PICTURES = 48,
	B_UNICODE_OPTICAL_CHARACTER_RECOGNITION = 49,

	B_UNICODE_ENCLOSED_ALPHANUMERICS = 50,
	B_UNICODE_BOX_DRAWING = 51,
	B_UNICODE_BLOCK_ELEMENTS = 52,
	B_UNICODE_GEOMETRIC_SHAPES = 53,
	B_UNICODE_MISCELLANEOUS_SYMBOLS = 54,
	B_UNICODE_DINGBATS = 55,
	B_UNICODE_BRAILLE_PATTERNS = 56,
	B_UNICODE_CJK_RADICALS_SUPPLEMENT = 57,
	B_UNICODE_KANGXI_RADICALS = 58,
	B_UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 59,

	B_UNICODE_CJK_SYMBOLS_AND_PUNCTUATION = 60,
	B_UNICODE_HIRAGANA = 61,
	B_UNICODE_KATAKANA = 62,
	B_UNICODE_BOPOMOFO = 63,
	B_UNICODE_HANGUL_COMPATIBILITY_JAMO = 64,
	B_UNICODE_KANBUN = 65,
	B_UNICODE_BOPOMOFO_EXTENDED = 66,
	B_UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS = 67,
	B_UNICODE_CJK_COMPATIBILITY = 68,
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 69,

	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS = 70,
	B_UNICODE_YI_SYLLABLES = 71,
	B_UNICODE_YI_RADICALS = 72,
	B_UNICODE_HANGUL_SYLLABLES = 73,
	B_UNICODE_HIGH_SURROGATES = 74,
	B_UNICODE_HIGH_PRIVATE_USE_SURROGATES = 75,
	B_UNICODE_LOW_SURROGATES = 76,
	B_UNICODE_PRIVATE_USE_AREA = 77,
	B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS = 78,
	B_UNICODE_ALPHABETIC_PRESENTATION_FORMS = 79,

	B_UNICODE_ARABIC_PRESENTATION_FORMS_A = 80,
	B_UNICODE_COMBINING_HALF_MARKS = 81,
	B_UNICODE_CJK_COMPATIBILITY_FORMS = 82,
	B_UNICODE_SMALL_FORM_VARIANTS = 83,
	B_UNICODE_ARABIC_PRESENTATION_FORMS_B = 84,
	B_UNICODE_SPECIALS = 85,
	B_UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS = 86,

	// One past the last script value; B_UNICODE_NO_SCRIPT is an alias for
	// the same number.
	B_UNICODE_SCRIPT_COUNT,
	B_UNICODE_NO_SCRIPT = B_UNICODE_SCRIPT_COUNT
};
172
173
/**
 * Cell width classes of a character: the values returned by the
 * u_getCellWidth() function (which is not declared in this header).
 */
177
enum unicode_cell_width {
	B_UNICODE_ZERO_WIDTH = 0,		// occupies no cell
	B_UNICODE_HALF_WIDTH = 1,		// half-width cell
	B_UNICODE_FULL_WIDTH = 2,		// full-width cell
	B_UNICODE_NEUTRAL_WIDTH = 3,	// width class is neutral

	// One past the highest value above; keep this entry last.
	B_UNICODE_CELL_WIDTH_COUNT
};
187
188
189class BUnicodeChar {
190	public:
191		static bool IsAlpha(uint32 c);
192		static bool IsAlNum(uint32 c);
193		static bool IsDigit(uint32 c);
194		static bool IsHexDigit(uint32 c);
195		static bool IsUpper(uint32 c);
196		static bool IsLower(uint32 c);
197		static bool IsSpace(uint32 c);
198		static bool IsWhitespace(uint32 c);
199		static bool IsControl(uint32 c);
200		static bool IsPunctuation(uint32 c);
201		static bool IsPrintable(uint32 c);
202		static bool IsTitle(uint32 c);
203		static bool IsDefined(uint32 c);
204		static bool IsBase(uint32 c);
205
206		static int8 Type(uint32 c);
207
208		static uint32 ToLower(uint32 c);
209		static uint32 ToUpper(uint32 c);
210		static uint32 ToTitle(uint32 c);
211		static int32 DigitValue(uint32 c);
212
213		static void ToUTF8(uint32 c, char **out);
214		static uint32 FromUTF8(const char **in);
215		static uint32 FromUTF8(const char *in);
216
217		static size_t UTF8StringLength(const char *str);
218		static size_t UTF8StringLength(const char *str, size_t maxLength);
219
220	private:
221		BUnicodeChar();
222};
223
224
225inline uint32
226BUnicodeChar::FromUTF8(const char *in)
227{
228	const char *string = in;
229	return FromUTF8(&string);
230}
231
232
233#endif	/* _UNICODE_CHAR_H_ */
234