1/* 2 ********************************************************************** 3 * Copyright (C) 2005-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8#include "unicode/utypes.h" 9 10#if !UCONFIG_NO_CONVERSION 11 12#include "csmatch.h" 13#include "csrmbcs.h" 14 15#include <math.h> 16 17U_NAMESPACE_BEGIN 18 19#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 20 21#define min(x,y) (((x)<(y))?(x):(y)) 22 23static const uint16_t commonChars_sjis [] = { 24// TODO: This set of data comes from the character frequency- 25// of-occurence analysis tool. The data needs to be moved 26// into a resource and loaded from there. 270x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 280x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 290x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 300x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 310x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 320x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; 33 34static const uint16_t commonChars_euc_jp[] = { 35// TODO: This set of data comes from the character frequency- 36// of-occurence analysis tool. The data needs to be moved 37// into a resource and loaded from there. 380xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 390xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 400xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 410xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 420xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 430xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 440xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 450xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 460xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 470xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; 48 49static const uint16_t commonChars_euc_kr[] = { 50// TODO: This set of data comes from the character frequency- 51// of-occurence analysis tool. The data needs to be moved 52// into a resource and loaded from there. 530xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, 540xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 550xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 560xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 570xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 580xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 590xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 600xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 610xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 620xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; 63 64static const uint16_t commonChars_big5[] = { 65// TODO: This set of data comes from the character frequency- 66// of-occurence analysis tool. The data needs to be moved 67// into a resource and loaded from there. 680xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, 690xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 700xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 710xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 720xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 730xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 740xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 750xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 760xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 770xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; 78 79static const uint16_t commonChars_gb_18030[] = { 80// TODO: This set of data comes from the character frequency- 81// of-occurence analysis tool. The data needs to be moved 82// into a resource and loaded from there. 830xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, 840xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 850xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 860xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 870xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 880xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 890xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 900xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 910xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 920xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0}; 93 94#if U_PLATFORM_IS_DARWIN_BASED 95static const uint8_t keyStrings_sjis[][MAX_KEY_STRING_WITH_NULL] = { 96 {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ... 97 {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward 98 {0} 99}; 100static const uint8_t keyStrings_euc_jp[][MAX_KEY_STRING_WITH_NULL] = { 101 {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ... 102 {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward 103 {0} 104}; 105static const uint8_t keyStrings_euc_kr[][MAX_KEY_STRING_WITH_NULL] = { 106 {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1 107 {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2 108 {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward 109 {0} 110}; 111static const uint8_t keyStrings_big5[][MAX_KEY_STRING_WITH_NULL] = { 112 {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1 113 {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2 114 {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward 115 {0} 116}; 117static const uint8_t keyStrings_gb_18030[][MAX_KEY_STRING_WITH_NULL] = { 118 {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP... 119 {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward 120 {0} 121}; 122#endif 123 124static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value) 125{ 126 int32_t start = 0, end = len-1; 127 int32_t mid = (start+end)/2; 128 129 while(start <= end) { 130 if(array[mid] == value) { 131 return mid; 132 } 133 134 if(array[mid] < value){ 135 start = mid+1; 136 } else { 137 end = mid-1; 138 } 139 140 mid = (start+end)/2; 141 } 142 143 return -1; 144} 145 146#if U_PLATFORM_IS_DARWIN_BASED 147// If testPrefix is a prefix of base, return its length, else return 0 148static int32_t isPrefix(const uint8_t *testPrefix, const uint8_t *base, const uint8_t *baseLimit) { 149 const uint8_t *testPrefixStart = testPrefix; 150 while (*testPrefix != 0 && base < baseLimit && *testPrefix == *base) { 151 testPrefix++; 152 base++; 153 } 154 return (*testPrefix == 0)? (int32_t)(testPrefix-testPrefixStart): 0; 155} 156#endif 157 158IteratedChar::IteratedChar() : 159charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE) 160{ 161 // nothing else to do. 162} 163 164/*void IteratedChar::reset() 165{ 166 charValue = 0; 167 index = -1; 168 nextIndex = 0; 169 error = FALSE; 170 done = FALSE; 171}*/ 172 173int32_t IteratedChar::nextByte(InputText *det) 174{ 175 if (nextIndex >= det->fRawLength) { 176 done = TRUE; 177 178 return -1; 179 } 180 181 return det->fRawInput[nextIndex++]; 182} 183 184CharsetRecog_mbcs::~CharsetRecog_mbcs() 185{ 186 // nothing to do. 187} 188 189#if U_PLATFORM_IS_DARWIN_BASED 190int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) const { 191#else 192int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const { 193#endif 194 int32_t singleByteCharCount = 0; 195 int32_t doubleByteCharCount = 0; 196 int32_t commonCharCount = 0; 197 int32_t badCharCount = 0; 198 int32_t totalCharCount = 0; 199 int32_t confidence = 0; 200#if U_PLATFORM_IS_DARWIN_BASED 201 int32_t confidenceFromKeys = 0; 202#endif 203 IteratedChar iter; 204 205 while (nextChar(&iter, det)) { 206 totalCharCount++; 207 208 if (iter.error) { 209 badCharCount++; 210 } else { 211 if (iter.charValue <= 0xFF) { 212 singleByteCharCount++; 213 } else { 214 doubleByteCharCount++; 215 216 if (commonChars != 0) { 217 if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){ 218 commonCharCount += 1; 219 } 220 } 221#if U_PLATFORM_IS_DARWIN_BASED 222 if (doubleByteCharCount <= 20) { 223 int32_t keyIndex; 224 for ( keyIndex = 0; keyStrings[keyIndex][0] != 0; keyIndex++ ) { 225 int32_t prefixLen = isPrefix(keyStrings[keyIndex], &det->fRawInput[iter.index], &det->fRawInput[det->fRawLength]); 226 confidenceFromKeys += prefixLen*5; 227 } 228 } 229#endif 230 } 231 } 232 233 234 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { 235 // Bail out early if the byte data is not matching the encoding scheme. 236 // break detectBlock; 237 return confidence; 238 } 239 } 240 241 if (doubleByteCharCount <= 10 && badCharCount == 0) { 242 // Not many multi-byte chars. 243 if (doubleByteCharCount == 0 && totalCharCount < 10) { 244 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. 245 // We don't have enough data to have any confidence. 246 // Statistical analysis of single byte non-ASCII charcters would probably help here. 247 confidence = 0; 248 } 249 else { 250 // ASCII or ISO file? It's probably not our encoding, 251 // but is not incompatible with our encoding, so don't give it a zero. 252#if U_PLATFORM_IS_DARWIN_BASED 253 if (confidenceFromKeys > 90) { 254 confidenceFromKeys = 90; 255 } else if (confidenceFromKeys > 0 && confidenceFromKeys < 70) { 256 confidenceFromKeys += 20; 257 } 258 confidence = 10 + confidenceFromKeys; 259#else 260 confidence = 10; 261#endif 262 } 263 264 return confidence; 265 } 266 267 // 268 // No match if there are too many characters that don't fit the encoding scheme. 269 // (should we have zero tolerance for these?) 270 // 271 if (doubleByteCharCount < 20*badCharCount) { 272 confidence = 0; 273 274 return confidence; 275 } 276 277 if (commonChars == 0) { 278 // We have no statistics on frequently occuring characters. 279 // Assess confidence purely on having a reasonable number of 280 // multi-byte characters (the more the better) 281 confidence = 30 + doubleByteCharCount - 20*badCharCount; 282#if U_PLATFORM_IS_DARWIN_BASED 283 confidence += confidenceFromKeys; 284#endif 285 286 if (confidence > 100) { 287 confidence = 100; 288 } 289 } else { 290 // 291 // Frequency of occurence statistics exist. 292 // 293 294 double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/ 295 double scaleFactor = 90.0 / maxVal; 296 confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0); 297#if U_PLATFORM_IS_DARWIN_BASED 298 confidence += confidenceFromKeys; 299#endif 300 301 confidence = min(confidence, 100); 302 } 303 304 if (confidence < 0) { 305 confidence = 0; 306 } 307 308 return confidence; 309} 310 311CharsetRecog_sjis::~CharsetRecog_sjis() 312{ 313 // nothing to do 314} 315 316UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const { 317 it->index = it->nextIndex; 318 it->error = FALSE; 319 320 int32_t firstByte = it->charValue = it->nextByte(det); 321 322 if (firstByte < 0) { 323 return FALSE; 324 } 325 326 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) { 327 return TRUE; 328 } 329 330 int32_t secondByte = it->nextByte(det); 331 if (secondByte >= 0) { 332 it->charValue = (firstByte << 8) | secondByte; 333 } 334 // else we'll handle the error later. 335 336 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) { 337 // Illegal second byte value. 338 it->error = TRUE; 339 } 340 341 return TRUE; 342} 343 344UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const { 345#if U_PLATFORM_IS_DARWIN_BASED 346 int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis), keyStrings_sjis); 347#else 348 int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis)); 349#endif 350 results->set(det, this, confidence); 351 return (confidence > 0); 352} 353 354const char *CharsetRecog_sjis::getName() const 355{ 356 return "Shift_JIS"; 357} 358 359const char *CharsetRecog_sjis::getLanguage() const 360{ 361 return "ja"; 362} 363 364CharsetRecog_euc::~CharsetRecog_euc() 365{ 366 // nothing to do 367} 368 369UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const { 370 int32_t firstByte = 0; 371 int32_t secondByte = 0; 372 int32_t thirdByte = 0; 373 374 it->index = it->nextIndex; 375 it->error = FALSE; 376 firstByte = it->charValue = it->nextByte(det); 377 378 if (firstByte < 0) { 379 // Ran off the end of the input data 380 return FALSE; 381 } 382 383 if (firstByte <= 0x8D) { 384 // single byte char 385 return TRUE; 386 } 387 388 secondByte = it->nextByte(det); 389 if (secondByte >= 0) { 390 it->charValue = (it->charValue << 8) | secondByte; 391 } 392 // else we'll handle the error later. 393 394 if (firstByte >= 0xA1 && firstByte <= 0xFE) { 395 // Two byte Char 396 if (secondByte < 0xA1) { 397 it->error = TRUE; 398 } 399 400 return TRUE; 401 } 402 403 if (firstByte == 0x8E) { 404 // Code Set 2. 405 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. 406 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. 407 // We don't know which we've got. 408 // Treat it like EUC-JP. If the data really was EUC-TW, the following two 409 // bytes will look like a well formed 2 byte char. 410 if (secondByte < 0xA1) { 411 it->error = TRUE; 412 } 413 414 return TRUE; 415 } 416 417 if (firstByte == 0x8F) { 418 // Code set 3. 419 // Three byte total char size, two bytes of actual char value. 420 thirdByte = it->nextByte(det); 421 it->charValue = (it->charValue << 8) | thirdByte; 422 423 if (thirdByte < 0xa1) { 424 // Bad second byte or ran off the end of the input data with a non-ASCII first byte. 425 it->error = TRUE; 426 } 427 } 428 429 return TRUE; 430 431} 432 433CharsetRecog_euc_jp::~CharsetRecog_euc_jp() 434{ 435 // nothing to do 436} 437 438const char *CharsetRecog_euc_jp::getName() const 439{ 440 return "EUC-JP"; 441} 442 443const char *CharsetRecog_euc_jp::getLanguage() const 444{ 445 return "ja"; 446} 447 448UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const 449{ 450#if U_PLATFORM_IS_DARWIN_BASED 451 int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp), keyStrings_euc_jp); 452#else 453 int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp)); 454#endif 455 results->set(det, this, confidence); 456 return (confidence > 0); 457} 458 459CharsetRecog_euc_kr::~CharsetRecog_euc_kr() 460{ 461 // nothing to do 462} 463 464const char *CharsetRecog_euc_kr::getName() const 465{ 466 return "EUC-KR"; 467} 468 469const char *CharsetRecog_euc_kr::getLanguage() const 470{ 471 return "ko"; 472} 473 474UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const 475{ 476#if U_PLATFORM_IS_DARWIN_BASED 477 int32_t confidence = match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr), keyStrings_euc_kr); 478#else 479 int32_t confidence = match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr)); 480#endif 481 results->set(det, this, confidence); 482 return (confidence > 0); 483} 484 485CharsetRecog_big5::~CharsetRecog_big5() 486{ 487 // nothing to do 488} 489 490UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const 491{ 492 int32_t firstByte; 493 494 it->index = it->nextIndex; 495 it->error = FALSE; 496 firstByte = it->charValue = it->nextByte(det); 497 498 if (firstByte < 0) { 499 return FALSE; 500 } 501 502 if (firstByte <= 0x7F || firstByte == 0xFF) { 503 // single byte character. 504 return TRUE; 505 } 506 507 int32_t secondByte = it->nextByte(det); 508 if (secondByte >= 0) { 509 it->charValue = (it->charValue << 8) | secondByte; 510 } 511 // else we'll handle the error later. 512 513 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) { 514 it->error = TRUE; 515 } 516 517 return TRUE; 518} 519 520const char *CharsetRecog_big5::getName() const 521{ 522 return "Big5"; 523} 524 525const char *CharsetRecog_big5::getLanguage() const 526{ 527 return "zh"; 528} 529 530UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const 531{ 532#if U_PLATFORM_IS_DARWIN_BASED 533 int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5), keyStrings_big5); 534#else 535 int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5)); 536#endif 537 results->set(det, this, confidence); 538 return (confidence > 0); 539} 540 541CharsetRecog_gb_18030::~CharsetRecog_gb_18030() 542{ 543 // nothing to do 544} 545 546UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const { 547 int32_t firstByte = 0; 548 int32_t secondByte = 0; 549 int32_t thirdByte = 0; 550 int32_t fourthByte = 0; 551 552 it->index = it->nextIndex; 553 it->error = FALSE; 554 firstByte = it->charValue = it->nextByte(det); 555 556 if (firstByte < 0) { 557 // Ran off the end of the input data 558 return FALSE; 559 } 560 561 if (firstByte <= 0x80) { 562 // single byte char 563 return TRUE; 564 } 565 566 secondByte = it->nextByte(det); 567 if (secondByte >= 0) { 568 it->charValue = (it->charValue << 8) | secondByte; 569 } 570 // else we'll handle the error later. 571 572 if (firstByte >= 0x81 && firstByte <= 0xFE) { 573 // Two byte Char 574 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) { 575 return TRUE; 576 } 577 578 // Four byte char 579 if (secondByte >= 0x30 && secondByte <= 0x39) { 580 thirdByte = it->nextByte(det); 581 582 if (thirdByte >= 0x81 && thirdByte <= 0xFE) { 583 fourthByte = it->nextByte(det); 584 585 if (fourthByte >= 0x30 && fourthByte <= 0x39) { 586 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte; 587 588 return TRUE; 589 } 590 } 591 } 592 593 // Something wasn't valid, or we ran out of data (-1). 594 it->error = TRUE; 595 } 596 597 return TRUE; 598} 599 600const char *CharsetRecog_gb_18030::getName() const 601{ 602 return "GB18030"; 603} 604 605const char *CharsetRecog_gb_18030::getLanguage() const 606{ 607 return "zh"; 608} 609 610UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const 611{ 612#if U_PLATFORM_IS_DARWIN_BASED 613 int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030), keyStrings_gb_18030); 614#else 615 int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030)); 616#endif 617 results->set(det, this, confidence); 618 return (confidence > 0); 619} 620 621U_NAMESPACE_END 622#endif 623