1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2012, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_CONVERSION
11
12#include "csmatch.h"
13#include "csrmbcs.h"
14
15#include <math.h>
16
17U_NAMESPACE_BEGIN
18
19#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
20
21#define min(x,y) (((x)<(y))?(x):(y))
22
// Frequently occurring Shift_JIS double-byte code values.
// The entries are in ascending order; lookups go through binarySearch().
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_sjis[] = {
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa
};
33
// Frequently occurring EUC-JP double-byte code values.
// The entries are in ascending order; lookups go through binarySearch().
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_euc_jp[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1
};
48
// Frequently occurring EUC-KR double-byte code values.
// The entries are in ascending order; lookups go through binarySearch().
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_euc_kr[] = {
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad
};
63
// Frequently occurring Big5 double-byte code values.
// The entries are in ascending order; lookups go through binarySearch().
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_big5[] = {
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f
};
78
// Frequently occurring GB18030 double-byte code values.
// The entries are in ascending order; lookups go through binarySearch().
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_gb_18030[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0
};
93
94#if U_PLATFORM_IS_DARWIN_BASED
95static const uint8_t keyStrings_sjis[][MAX_KEY_STRING_WITH_NULL] = {
96    {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ...
97    {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward
98    {0}
99};
100static const uint8_t keyStrings_euc_jp[][MAX_KEY_STRING_WITH_NULL] = {
101    {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ...
102    {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward
103    {0}
104};
105static const uint8_t keyStrings_euc_kr[][MAX_KEY_STRING_WITH_NULL] = {
106    {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1
107    {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2
108    {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward
109    {0}
110};
111static const uint8_t keyStrings_big5[][MAX_KEY_STRING_WITH_NULL] = {
112    {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1
113    {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2
114    {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward
115    {0}
116};
117static const uint8_t keyStrings_gb_18030[][MAX_KEY_STRING_WITH_NULL] = {
118    {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP...
119    {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward
120    {0}
121};
122#endif
123
// Binary search over an ascending array of 16-bit values.
//
//   array  sorted (ascending) values to search; only read when len > 0
//   len    number of elements in array
//   value  value to look for
//
// Returns the index of an element equal to value, or -1 when there is none
// (including when len <= 0).
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t low  = 0;
    int32_t high = len - 1;

    while (low <= high) {
        // low + (high - low) / 2 cannot overflow, unlike (low + high) / 2.
        const int32_t  mid   = low + (high - low) / 2;
        const uint16_t probe = array[mid];

        if (probe == value) {
            return mid;
        }

        if (probe < value) {
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }

    return -1;
}
145
146#if U_PLATFORM_IS_DARWIN_BASED
// Checks whether the zero-terminated byte string testPrefix occurs at the
// start of the bytes in [base, baseLimit).
// Returns the length of testPrefix when it does, otherwise 0 (an empty
// testPrefix therefore also yields 0).
static int32_t isPrefix(const uint8_t *testPrefix, const uint8_t *base, const uint8_t *baseLimit) {
    int32_t matched = 0;

    for (; testPrefix[matched] != 0; ++matched) {
        // Stop as soon as the input runs out or a byte differs.
        if (base + matched >= baseLimit || testPrefix[matched] != base[matched]) {
            return 0;
        }
    }

    return matched;
}
156#endif
157
158IteratedChar::IteratedChar() :
159charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
160{
161    // nothing else to do.
162}
163
164/*void IteratedChar::reset()
165{
166    charValue = 0;
167    index     = -1;
168    nextIndex = 0;
169    error     = FALSE;
170    done      = FALSE;
171}*/
172
173int32_t IteratedChar::nextByte(InputText *det)
174{
175    if (nextIndex >= det->fRawLength) {
176        done = TRUE;
177
178        return -1;
179    }
180
181    return det->fRawInput[nextIndex++];
182}
183
184CharsetRecog_mbcs::~CharsetRecog_mbcs()
185{
186    // nothing to do.
187}
188
189#if U_PLATFORM_IS_DARWIN_BASED
190int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) const {
191#else
192int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
193#endif
194    int32_t singleByteCharCount = 0;
195    int32_t doubleByteCharCount = 0;
196    int32_t commonCharCount     = 0;
197    int32_t badCharCount        = 0;
198    int32_t totalCharCount      = 0;
199    int32_t confidence          = 0;
200#if U_PLATFORM_IS_DARWIN_BASED
201    int32_t confidenceFromKeys  = 0;
202#endif
203    IteratedChar iter;
204
205    while (nextChar(&iter, det)) {
206        totalCharCount++;
207
208        if (iter.error) {
209            badCharCount++;
210        } else {
211            if (iter.charValue <= 0xFF) {
212                singleByteCharCount++;
213            } else {
214                doubleByteCharCount++;
215
216                if (commonChars != 0) {
217                    if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
218                        commonCharCount += 1;
219                    }
220                }
221#if U_PLATFORM_IS_DARWIN_BASED
222                if (doubleByteCharCount <= 20) {
223                    int32_t keyIndex;
224                    for ( keyIndex = 0; keyStrings[keyIndex][0] != 0; keyIndex++ ) {
225                        int32_t prefixLen = isPrefix(keyStrings[keyIndex], &det->fRawInput[iter.index], &det->fRawInput[det->fRawLength]);
226                        confidenceFromKeys += prefixLen*5;
227                    }
228                }
229#endif
230            }
231        }
232
233
234        if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
235            // Bail out early if the byte data is not matching the encoding scheme.
236            // break detectBlock;
237            return confidence;
238        }
239    }
240
241    if (doubleByteCharCount <= 10 && badCharCount == 0) {
242        // Not many multi-byte chars.
243        if (doubleByteCharCount == 0 && totalCharCount < 10) {
244            // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
245            // We don't have enough data to have any confidence.
246            // Statistical analysis of single byte non-ASCII charcters would probably help here.
247            confidence = 0;
248        }
249        else {
250            //   ASCII or ISO file?  It's probably not our encoding,
251            //   but is not incompatible with our encoding, so don't give it a zero.
252#if U_PLATFORM_IS_DARWIN_BASED
253            if (confidenceFromKeys > 90) {
254                confidenceFromKeys = 90;
255            } else if (confidenceFromKeys > 0 && confidenceFromKeys < 70) {
256                confidenceFromKeys += 20;
257            }
258            confidence = 10 + confidenceFromKeys;
259#else
260            confidence = 10;
261#endif
262        }
263
264        return confidence;
265    }
266
267    //
268    //  No match if there are too many characters that don't fit the encoding scheme.
269    //    (should we have zero tolerance for these?)
270    //
271    if (doubleByteCharCount < 20*badCharCount) {
272        confidence = 0;
273
274        return confidence;
275    }
276
277    if (commonChars == 0) {
278        // We have no statistics on frequently occuring characters.
279        //  Assess confidence purely on having a reasonable number of
280        //  multi-byte characters (the more the better)
281        confidence = 30 + doubleByteCharCount - 20*badCharCount;
282#if U_PLATFORM_IS_DARWIN_BASED
283        confidence += confidenceFromKeys;
284#endif
285
286        if (confidence > 100) {
287            confidence = 100;
288        }
289    } else {
290        //
291        // Frequency of occurence statistics exist.
292        //
293
294        double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
295        double scaleFactor = 90.0 / maxVal;
296        confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
297#if U_PLATFORM_IS_DARWIN_BASED
298        confidence += confidenceFromKeys;
299#endif
300
301        confidence = min(confidence, 100);
302    }
303
304    if (confidence < 0) {
305        confidence = 0;
306    }
307
308    return confidence;
309}
310
311CharsetRecog_sjis::~CharsetRecog_sjis()
312{
313    // nothing to do
314}
315
316UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
317    it->index = it->nextIndex;
318    it->error = FALSE;
319
320    int32_t firstByte = it->charValue = it->nextByte(det);
321
322    if (firstByte < 0) {
323        return FALSE;
324    }
325
326    if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
327        return TRUE;
328    }
329
330    int32_t secondByte = it->nextByte(det);
331    if (secondByte >= 0) {
332        it->charValue = (firstByte << 8) | secondByte;
333    }
334    // else we'll handle the error later.
335
336    if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
337        // Illegal second byte value.
338        it->error = TRUE;
339    }
340
341    return TRUE;
342}
343
344UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
345#if U_PLATFORM_IS_DARWIN_BASED
346    int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis), keyStrings_sjis);
347#else
348    int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
349#endif
350    results->set(det, this, confidence);
351    return (confidence > 0);
352}
353
354const char *CharsetRecog_sjis::getName() const
355{
356    return "Shift_JIS";
357}
358
359const char *CharsetRecog_sjis::getLanguage() const
360{
361    return "ja";
362}
363
364CharsetRecog_euc::~CharsetRecog_euc()
365{
366    // nothing to do
367}
368
369UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
370    int32_t firstByte  = 0;
371    int32_t secondByte = 0;
372    int32_t thirdByte  = 0;
373
374    it->index = it->nextIndex;
375    it->error = FALSE;
376    firstByte = it->charValue = it->nextByte(det);
377
378    if (firstByte < 0) {
379        // Ran off the end of the input data
380        return FALSE;
381    }
382
383    if (firstByte <= 0x8D) {
384        // single byte char
385        return TRUE;
386    }
387
388    secondByte = it->nextByte(det);
389    if (secondByte >= 0) {
390        it->charValue = (it->charValue << 8) | secondByte;
391    }
392    // else we'll handle the error later.
393
394    if (firstByte >= 0xA1 && firstByte <= 0xFE) {
395        // Two byte Char
396        if (secondByte < 0xA1) {
397            it->error = TRUE;
398        }
399
400        return TRUE;
401    }
402
403    if (firstByte == 0x8E) {
404        // Code Set 2.
405        //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
406        //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
407        // We don't know which we've got.
408        // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
409        //   bytes will look like a well formed 2 byte char.
410        if (secondByte < 0xA1) {
411            it->error = TRUE;
412        }
413
414        return TRUE;
415    }
416
417    if (firstByte == 0x8F) {
418        // Code set 3.
419        // Three byte total char size, two bytes of actual char value.
420        thirdByte    = it->nextByte(det);
421        it->charValue = (it->charValue << 8) | thirdByte;
422
423        if (thirdByte < 0xa1) {
424            // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
425            it->error = TRUE;
426        }
427    }
428
429    return TRUE;
430
431}
432
433CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
434{
435    // nothing to do
436}
437
438const char *CharsetRecog_euc_jp::getName() const
439{
440    return "EUC-JP";
441}
442
443const char *CharsetRecog_euc_jp::getLanguage() const
444{
445    return "ja";
446}
447
448UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
449{
450#if U_PLATFORM_IS_DARWIN_BASED
451    int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp), keyStrings_euc_jp);
452#else
453    int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
454#endif
455    results->set(det, this, confidence);
456    return (confidence > 0);
457}
458
459CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
460{
461    // nothing to do
462}
463
464const char *CharsetRecog_euc_kr::getName() const
465{
466    return "EUC-KR";
467}
468
469const char *CharsetRecog_euc_kr::getLanguage() const
470{
471    return "ko";
472}
473
474UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
475{
476#if U_PLATFORM_IS_DARWIN_BASED
477    int32_t confidence =  match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr), keyStrings_euc_kr);
478#else
479    int32_t confidence =  match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
480#endif
481    results->set(det, this, confidence);
482    return (confidence > 0);
483}
484
485CharsetRecog_big5::~CharsetRecog_big5()
486{
487    // nothing to do
488}
489
490UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
491{
492    int32_t firstByte;
493
494    it->index = it->nextIndex;
495    it->error = FALSE;
496    firstByte = it->charValue = it->nextByte(det);
497
498    if (firstByte < 0) {
499        return FALSE;
500    }
501
502    if (firstByte <= 0x7F || firstByte == 0xFF) {
503        // single byte character.
504        return TRUE;
505    }
506
507    int32_t secondByte = it->nextByte(det);
508    if (secondByte >= 0)  {
509        it->charValue = (it->charValue << 8) | secondByte;
510    }
511    // else we'll handle the error later.
512
513    if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
514        it->error = TRUE;
515    }
516
517    return TRUE;
518}
519
520const char *CharsetRecog_big5::getName() const
521{
522    return "Big5";
523}
524
525const char *CharsetRecog_big5::getLanguage() const
526{
527    return "zh";
528}
529
530UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
531{
532#if U_PLATFORM_IS_DARWIN_BASED
533    int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5), keyStrings_big5);
534#else
535    int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
536#endif
537    results->set(det, this, confidence);
538    return (confidence > 0);
539}
540
541CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
542{
543    // nothing to do
544}
545
546UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
547    int32_t firstByte  = 0;
548    int32_t secondByte = 0;
549    int32_t thirdByte  = 0;
550    int32_t fourthByte = 0;
551
552    it->index = it->nextIndex;
553    it->error = FALSE;
554    firstByte = it->charValue = it->nextByte(det);
555
556    if (firstByte < 0) {
557        // Ran off the end of the input data
558        return FALSE;
559    }
560
561    if (firstByte <= 0x80) {
562        // single byte char
563        return TRUE;
564    }
565
566    secondByte = it->nextByte(det);
567    if (secondByte >= 0) {
568        it->charValue = (it->charValue << 8) | secondByte;
569    }
570    // else we'll handle the error later.
571
572    if (firstByte >= 0x81 && firstByte <= 0xFE) {
573        // Two byte Char
574        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
575            return TRUE;
576        }
577
578        // Four byte char
579        if (secondByte >= 0x30 && secondByte <= 0x39) {
580            thirdByte = it->nextByte(det);
581
582            if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
583                fourthByte = it->nextByte(det);
584
585                if (fourthByte >= 0x30 && fourthByte <= 0x39) {
586                    it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
587
588                    return TRUE;
589                }
590            }
591        }
592
593        // Something wasn't valid, or we ran out of data (-1).
594        it->error = TRUE;
595    }
596
597    return TRUE;
598}
599
600const char *CharsetRecog_gb_18030::getName() const
601{
602    return "GB18030";
603}
604
605const char *CharsetRecog_gb_18030::getLanguage() const
606{
607    return "zh";
608}
609
610UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
611{
612#if U_PLATFORM_IS_DARWIN_BASED
613    int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030), keyStrings_gb_18030);
614#else
615    int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
616#endif
617    results->set(det, this, confidence);
618    return (confidence > 0);
619}
620
621U_NAMESPACE_END
622#endif
623