1/* 2 * Copyright 2003, Axel D��rfler, axeld@pinc-software.de. All rights reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Axel D��rfler, axeld@pinc-software.de 7 * Siarzhuk Zharski, zharik@gmx.li 8 * 9 */ 10 11 12#include <UnicodeChar.h> 13 14#include <unicode/uchar.h> 15#include <unicode/utf8.h> 16 17 18BUnicodeChar::BUnicodeChar() 19{ 20} 21 22 23// Returns the general category value for the code point. 24int8 25BUnicodeChar::Type(uint32 c) 26{ 27 return u_charType(c); 28} 29 30 31// Determines whether the specified code point is a letter character. 32// True for general categories "L" (letters). 33bool 34BUnicodeChar::IsAlpha(uint32 c) 35{ 36 return u_isalpha(c); 37} 38 39 40// Determines whether the specified code point is an alphanumeric character 41// (letter or digit). 42// True for characters with general categories 43// "L" (letters) and "Nd" (decimal digit numbers). 44bool 45BUnicodeChar::IsAlNum(uint32 c) 46{ 47 return u_isalnum(c); 48} 49 50 51// Check if a code point has the Lowercase Unicode property (UCHAR_LOWERCASE). 52bool 53BUnicodeChar::IsLower(uint32 c) 54{ 55 return u_isULowercase(c); 56} 57 58 59// Check if a code point has the Uppercase Unicode property (UCHAR_UPPERCASE). 60bool 61BUnicodeChar::IsUpper(uint32 c) 62{ 63 return u_isUUppercase(c); 64} 65 66 67// Determines whether the specified code point is a titlecase letter. 68// True for general category "Lt" (titlecase letter). 69bool 70BUnicodeChar::IsTitle(uint32 c) 71{ 72 return u_istitle(c); 73} 74 75 76// Determines whether the specified code point is a digit character. 77// True for characters with general category "Nd" (decimal digit numbers). 78// Beginning with Unicode 4, this is the same as 79// testing for the Numeric_Type of Decimal. 80bool 81BUnicodeChar::IsDigit(uint32 c) 82{ 83 return u_isdigit(c); 84} 85 86 87// Determines whether the specified code point is a hexadecimal digit. 88// This is equivalent to u_digit(c, 16)>=0. 89// True for characters with general category "Nd" (decimal digit numbers) 90// as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII. 91// (That is, for letters with code points 92// 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.) 93bool 94BUnicodeChar::IsHexDigit(uint32 c) 95{ 96 return u_isxdigit(c); 97} 98 99 100// Determines whether the specified code point is "defined", 101// which usually means that it is assigned a character. 102// True for general categories other than "Cn" (other, not assigned), 103// i.e., true for all code points mentioned in UnicodeData.txt. 104bool 105BUnicodeChar::IsDefined(uint32 c) 106{ 107 return u_isdefined(c); 108} 109 110 111// Determines whether the specified code point is a base character. 112// True for general categories "L" (letters), "N" (numbers), 113// "Mc" (spacing combining marks), and "Me" (enclosing marks). 114bool 115BUnicodeChar::IsBase(uint32 c) 116{ 117 return u_isbase(c); 118} 119 120 121// Determines whether the specified code point is a control character 122// (as defined by this function). 123// A control character is one of the following: 124// - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f) 125// - U_CONTROL_CHAR (Cc) 126// - U_FORMAT_CHAR (Cf) 127// - U_LINE_SEPARATOR (Zl) 128// - U_PARAGRAPH_SEPARATOR (Zp) 129bool 130BUnicodeChar::IsControl(uint32 c) 131{ 132 return u_iscntrl(c); 133} 134 135 136// Determines whether the specified code point is a punctuation character. 137// True for characters with general categories "P" (punctuation). 138bool 139BUnicodeChar::IsPunctuation(uint32 c) 140{ 141 return u_ispunct(c); 142} 143 144 145// Determine if the specified code point is a space character according to Java. 146// True for characters with general categories "Z" (separators), 147// which does not include control codes (e.g., TAB or Line Feed). 148bool 149BUnicodeChar::IsSpace(uint32 c) 150{ 151 return u_isJavaSpaceChar(c); 152} 153 154 155// Determines if the specified code point is a whitespace character 156// A character is considered to be a whitespace character if and only 157// if it satisfies one of the following criteria: 158// - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"), 159// but is not also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space 160// or U+202F Narrow NBSP). 161// - It is U+0009 HORIZONTAL TABULATION. 162// - It is U+000A LINE FEED. 163// - It is U+000B VERTICAL TABULATION. 164// - It is U+000C FORM FEED. 165// - It is U+000D CARRIAGE RETURN. 166// - It is U+001C FILE SEPARATOR. 167// - It is U+001D GROUP SEPARATOR. 168// - It is U+001E RECORD SEPARATOR. 169// - It is U+001F UNIT SEPARATOR. 170bool 171BUnicodeChar::IsWhitespace(uint32 c) 172{ 173 return u_isWhitespace(c); 174} 175 176 177// Determines whether the specified code point is a printable character. 178// True for general categories other than "C" (controls). 179bool 180BUnicodeChar::IsPrintable(uint32 c) 181{ 182 return u_isprint(c); 183} 184 185 186// #pragma mark - 187 188uint32 189BUnicodeChar::ToLower(uint32 c) 190{ 191 return u_tolower(c); 192} 193 194 195uint32 196BUnicodeChar::ToUpper(uint32 c) 197{ 198 return u_toupper(c); 199} 200 201 202uint32 203BUnicodeChar::ToTitle(uint32 c) 204{ 205 return u_totitle(c); 206} 207 208 209int32 210BUnicodeChar::DigitValue(uint32 c) 211{ 212 return u_digit(c, 10); 213} 214 215 216unicode_east_asian_width 217BUnicodeChar::EastAsianWidth(uint32 c) 218{ 219 return (unicode_east_asian_width)u_getIntPropertyValue(c, 220 UCHAR_EAST_ASIAN_WIDTH); 221} 222 223 224void 225BUnicodeChar::ToUTF8(uint32 c, char** out) 226{ 227 int i = 0; 228 U8_APPEND_UNSAFE(*out, i, c); 229 *out += i; 230} 231 232 233uint32 234BUnicodeChar::FromUTF8(const char** in) 235{ 236 int i = 0; 237 uint32 c = 0; 238 U8_NEXT_UNSAFE(*in, i, c); 239 *in += i; 240 241 return c; 242} 243 244 245size_t 246BUnicodeChar::UTF8StringLength(const char* string) 247{ 248 size_t len = 0; 249 while (*string) { 250 FromUTF8(&string); 251 len++; 252 } 253 return len; 254} 255 256 257size_t 258BUnicodeChar::UTF8StringLength(const char* string, size_t maxLength) 259{ 260 size_t len = 0; 261 while (len < maxLength && *string) { 262 FromUTF8(&string); 263 len++; 264 } 265 return len; 266} 267