1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2012, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8
9#include "unicode/utypes.h"
10#include "unicode/ucsdet.h"
11#include "unicode/ucnv.h"
12#include "unicode/unistr.h"
13#include "unicode/putil.h"
14#include "unicode/uniset.h"
15
16#include "intltest.h"
17#include "csdetest.h"
18
19#include "xmlparser.h"
20
21#include <stdlib.h>
22#include <string.h>
23
24#ifdef DEBUG_DETECT
25#include <stdio.h>
26#endif
27
// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that expressions such as *ptrToArray expand correctly.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Raw allocation helpers built on the C runtime malloc()/free().
// NEW_ARRAY does not run constructors and yields NULL when malloc() fails;
// memory obtained with NEW_ARRAY must be released with DELETE_ARRAY.
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Delimiters used in the test data: '/' separates an encoding name from its
// language, ' ' separates the entries of an encoding list.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

// Reports a test failure (with file and line) when the condition is false.
// Execution continues after the report.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// Reports a failure when the UErrorCode indicates an error and RETURNS from
// the enclosing (void) function, so any resource that must be released has
// to be owned by an object with a destructor.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
    return;}}
42
43
44//---------------------------------------------------------------------------
45//
46//  Test class boilerplate
47//
48//---------------------------------------------------------------------------
49CharsetDetectionTest::CharsetDetectionTest()
50{
51}
52
53
54CharsetDetectionTest::~CharsetDetectionTest()
55{
56}
57
58
59
60void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
61{
62    if (exec) logln("TestSuite CharsetDetectionTest: ");
63    switch (index) {
64       case 0: name = "ConstructionTest";
65            if (exec) ConstructionTest();
66            break;
67
68       case 1: name = "UTF8Test";
69            if (exec) UTF8Test();
70            break;
71
72       case 2: name = "UTF16Test";
73            if (exec) UTF16Test();
74            break;
75
76       case 3: name = "C1BytesTest";
77            if (exec) C1BytesTest();
78            break;
79
80       case 4: name = "InputFilterTest";
81            if (exec) InputFilterTest();
82            break;
83
84       case 5: name = "DetectionTest";
85            if (exec) DetectionTest();
86            break;
87#if !UCONFIG_NO_LEGACY_CONVERSION
88       case 6: name = "IBM424Test";
89            if (exec) IBM424Test();
90            break;
91
92       case 7: name = "IBM420Test";
93            if (exec) IBM420Test();
94            break;
95#else
96       case 6:
97       case 7: name = "skip"; break;
98#endif
99       case 8: name = "Ticket6394Test";
100            if (exec) Ticket6394Test();
101            break;
102
103       case 9: name = "Ticket6954Test";
104            if (exec) Ticket6954Test();
105            break;
106
107        default: name = "";
108            break; //needed to end loop
109    }
110}
111
112static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
113{
114    int32_t offset = -1;
115
116    splits = 1;
117    while((offset = src.indexOf(ch, offset + 1)) >= 0) {
118        splits += 1;
119    }
120
121    UnicodeString *result = new UnicodeString[splits];
122
123    int32_t start = 0;
124    int32_t split = 0;
125    int32_t end;
126
127    while((end = src.indexOf(ch, start)) >= 0) {
128        src.extractBetween(start, end, result[split++]);
129        start = end + 1;
130    }
131
132    src.extractBetween(start, src.length(), result[split]);
133
134    return result;
135}
136
137static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
138{
139    int32_t sLength = source.length();
140    char *bytes = NULL;
141
142    length = source.extract(0, sLength, NULL, codepage);
143
144    if (length > 0) {
145        bytes = NEW_ARRAY(char, length + 1);
146        source.extract(0, sLength, bytes, codepage);
147    }
148
149    return bytes;
150}
151
152static void freeBytes(char *bytes)
153{
154    DELETE_ARRAY(bytes);
155}
156
157void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
158{
159    int32_t splits = 0;
160    int32_t testLength = testString.length();
161    UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
162    UErrorCode status = U_ZERO_ERROR;
163    int32_t cpLength = eSplit[0].length();
164    char codepage[64];
165
166    u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
167    codepage[cpLength] = '\0';
168
169    LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
170
171    int32_t byteLength = 0;
172    char *bytes = extractBytes(testString, codepage, byteLength);
173
174    if (bytes == NULL) {
175#if !UCONFIG_NO_LEGACY_CONVERSION
176        dataerrln("Can't open a " + encoding + " converter for " + id);
177#endif
178        return;
179    }
180
181    ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
182
183    int32_t matchCount = 0;
184    const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
185
186
187    UnicodeString name(ucsdet_getName(matches[0], &status));
188    UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
189    UChar *decoded = NULL;
190    int32_t dLength = 0;
191
192    if (matchCount == 0) {
193        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
194        goto bail;
195    }
196
197    if (name.compare(eSplit[0]) != 0) {
198        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
199
200#ifdef DEBUG_DETECT
201        for (int32_t m = 0; m < matchCount; m += 1) {
202            const char *name = ucsdet_getName(matches[m], &status);
203            const char *lang = ucsdet_getLanguage(matches[m], &status);
204            int32_t confidence = ucsdet_getConfidence(matches[m], &status);
205
206            printf("%s (%s) %d\n", name, lang, confidence);
207        }
208#endif
209        goto bail;
210    }
211
212    if (splits > 1 && lang.compare(eSplit[1]) != 0) {
213        errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
214        goto bail;
215    }
216
217    decoded = NEW_ARRAY(UChar, testLength);
218    dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
219
220    if (testString.compare(decoded, dLength) != 0) {
221        errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
222
223#ifdef DEBUG_DETECT
224        for(int32_t i = 0; i < testLength; i += 1) {
225            if(testString[i] != decoded[i]) {
226                printf("Strings differ at byte %d\n", i);
227                break;
228            }
229        }
230#endif
231
232    }
233
234    DELETE_ARRAY(decoded);
235
236bail:
237    freeBytes(bytes);
238    delete[] eSplit;
239}
240
241const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
242    UErrorCode status = U_ZERO_ERROR;
243    const char *testDataDirectory = IntlTest::getSourceTestData(status);
244
245    if (U_FAILURE(status)) {
246        errln("ERROR: getPath() failed - %s", u_errorName(status));
247        return NULL;
248    }
249
250    strcpy(buffer, testDataDirectory);
251    strcat(buffer, filename);
252    return buffer;
253}
254
255void CharsetDetectionTest::ConstructionTest()
256{
257    IcuTestErrorCode status(*this, "ConstructionTest");
258    LocalUCharsetDetectorPointer csd(ucsdet_open(status));
259    LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
260    int32_t count = uenum_count(e.getAlias(), status);
261
262#ifdef DEBUG_DETECT
263    printf("There are %d recognizers.\n", count);
264#endif
265
266    for(int32_t i = 0; i < count; i += 1) {
267        int32_t length;
268        const char *name = uenum_next(e.getAlias(), &length, status);
269
270        if(name == NULL || length <= 0) {
271            errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
272        }
273
274#ifdef DEBUG_DETECT
275        printf("%s\n", name);
276#endif
277    }
278}
279
280void CharsetDetectionTest::UTF8Test()
281{
282    UErrorCode status = U_ZERO_ERROR;
283    UnicodeString ss = "This is a string with some non-ascii characters that will "
284                       "be converted to UTF-8, then shoved through the detection process.  "
285                       "\\u0391\\u0392\\u0393\\u0394\\u0395"
286                       "Sure would be nice if our source could contain Unicode directly!";
287    UnicodeString s = ss.unescape();
288    int32_t byteLength = 0, sLength = s.length();
289    char *bytes = extractBytes(s, "UTF-8", byteLength);
290    UCharsetDetector *csd = ucsdet_open(&status);
291    const UCharsetMatch *match;
292    UChar *detected = NEW_ARRAY(UChar, sLength);
293
294    ucsdet_setText(csd, bytes, byteLength, &status);
295    match = ucsdet_detect(csd, &status);
296
297    if (match == NULL) {
298        errln("Detection failure for UTF-8: got no matches.");
299        goto bail;
300    }
301
302    ucsdet_getUChars(match, detected, sLength, &status);
303
304    if (s.compare(detected, sLength) != 0) {
305        errln("Round-trip test failed!");
306    }
307
308    ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
309
310bail:
311    DELETE_ARRAY(detected);
312    freeBytes(bytes);
313    ucsdet_close(csd);
314}
315
316void CharsetDetectionTest::UTF16Test()
317{
318    UErrorCode status = U_ZERO_ERROR;
319    /* Notice the BOM on the start of this string */
320    UChar chars[] = {
321        0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
322        0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
323        0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
324        0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
325        0x064a, 0x062a, 0x0000};
326    UnicodeString s(chars);
327    int32_t beLength = 0, leLength = 0;
328    char *beBytes = extractBytes(s, "UTF-16BE", beLength);
329    char *leBytes = extractBytes(s, "UTF-16LE", leLength);
330    UCharsetDetector *csd = ucsdet_open(&status);
331    const UCharsetMatch *match;
332    const char *name;
333    int32_t conf;
334
335    ucsdet_setText(csd, beBytes, beLength, &status);
336    match = ucsdet_detect(csd, &status);
337
338    if (match == NULL) {
339        errln("Encoding detection failure for UTF-16BE: got no matches.");
340        goto try_le;
341    }
342
343    name  = ucsdet_getName(match, &status);
344    conf  = ucsdet_getConfidence(match, &status);
345
346    if (strcmp(name, "UTF-16BE") != 0) {
347        errln("Encoding detection failure for UTF-16BE: got %s", name);
348        goto try_le; // no point in looking at confidence if we got the wrong character set.
349    }
350
351    if (conf != 100) {
352        errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
353    }
354
355try_le:
356    ucsdet_setText(csd, leBytes, leLength, &status);
357    match = ucsdet_detect(csd, &status);
358
359    if (match == NULL) {
360        errln("Encoding detection failure for UTF-16LE: got no matches.");
361        goto bail;
362    }
363
364    name  = ucsdet_getName(match, &status);
365    conf = ucsdet_getConfidence(match, &status);
366
367
368    if (strcmp(name, "UTF-16LE") != 0) {
369        errln("Enconding detection failure for UTF-16LE: got %s", name);
370        goto bail; // no point in looking at confidence if we got the wrong character set.
371    }
372
373    if (conf != 100) {
374        errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
375    }
376
377bail:
378    freeBytes(leBytes);
379    freeBytes(beBytes);
380    ucsdet_close(csd);
381}
382
383void CharsetDetectionTest::InputFilterTest()
384{
385    UErrorCode status = U_ZERO_ERROR;
386    UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
387    UnicodeString s  = ss.unescape();
388    int32_t byteLength = 0;
389    char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
390    UCharsetDetector *csd = ucsdet_open(&status);
391    const UCharsetMatch *match;
392    const char *lang, *name;
393
394    ucsdet_enableInputFilter(csd, TRUE);
395
396    if (!ucsdet_isInputFilterEnabled(csd)) {
397        errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
398    }
399
400
401    ucsdet_setText(csd, bytes, byteLength, &status);
402    match = ucsdet_detect(csd, &status);
403
404    if (match == NULL) {
405        errln("Turning on the input filter resulted in no matches.");
406        goto turn_off;
407    }
408
409    name = ucsdet_getName(match, &status);
410
411    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
412        errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
413    } else {
414        lang = ucsdet_getLanguage(match, &status);
415
416        if (lang == NULL || strcmp(lang, "fr") != 0) {
417            errln("Input filter did not strip markup!");
418        }
419    }
420
421turn_off:
422    ucsdet_enableInputFilter(csd, FALSE);
423    ucsdet_setText(csd, bytes, byteLength, &status);
424    match = ucsdet_detect(csd, &status);
425
426    if (match == NULL) {
427        errln("Turning off the input filter resulted in no matches.");
428        goto bail;
429    }
430
431    name = ucsdet_getName(match, &status);
432
433    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
434        errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
435    } else {
436        lang = ucsdet_getLanguage(match, &status);
437
438        if (lang == NULL || strcmp(lang, "en") != 0) {
439            errln("Unfiltered input did not detect as English!");
440        }
441    }
442
443bail:
444    freeBytes(bytes);
445    ucsdet_close(csd);
446}
447
448void CharsetDetectionTest::C1BytesTest()
449{
450#if !UCONFIG_NO_LEGACY_CONVERSION
451    UErrorCode status = U_ZERO_ERROR;
452    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
453    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
454    UnicodeString sWindows  = ssWindows.unescape();
455    int32_t lISO = 0, lWindows = 0;
456    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
457    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
458    UCharsetDetector *csd = ucsdet_open(&status);
459    const UCharsetMatch *match;
460    const char *name;
461
462    ucsdet_setText(csd, bWindows, lWindows, &status);
463    match = ucsdet_detect(csd, &status);
464
465    if (match == NULL) {
466        errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
467        goto bail;
468    }
469
470    name  = ucsdet_getName(match, &status);
471
472    if (strcmp(name, "windows-1252") != 0) {
473        errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
474    }
475
476    ucsdet_setText(csd, bISO, lISO, &status);
477    match = ucsdet_detect(csd, &status);
478
479    if (match == NULL) {
480        errln("English text without C1 bytes got no matches.");
481        goto bail;
482    }
483
484    name  = ucsdet_getName(match, &status);
485
486    if (strcmp(name, "ISO-8859-1") != 0) {
487        errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
488    }
489
490bail:
491    freeBytes(bWindows);
492    freeBytes(bISO);
493
494    ucsdet_close(csd);
495#endif
496}
497
498void CharsetDetectionTest::DetectionTest()
499{
500#if !UCONFIG_NO_REGULAR_EXPRESSIONS
501    UErrorCode status = U_ZERO_ERROR;
502    char path[2048];
503    const char *testFilePath = getPath(path, "csdetest.xml");
504
505    if (testFilePath == NULL) {
506        return; /* Couldn't get path: error message already output. */
507    }
508
509    UXMLParser  *parser = UXMLParser::createParser(status);
510    if (U_FAILURE(status)) {
511        dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
512        return;
513    }
514
515    UXMLElement *root   = parser->parseFile(testFilePath, status);
516    if (!assertSuccess( "parseFile",status)) return;
517
518    UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
519    UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
520    UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
521
522    const UXMLElement *testCase;
523    int32_t tc = 0;
524
525    while((testCase = root->nextChildElement(tc)) != NULL) {
526        if (testCase->getTagName().compare(test_case) == 0) {
527            const UnicodeString *id = testCase->getAttribute(id_attr);
528            const UnicodeString *encodings = testCase->getAttribute(enc_attr);
529            const UnicodeString  text = testCase->getText(TRUE);
530            int32_t encodingCount;
531            UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
532
533            for(int32_t e = 0; e < encodingCount; e += 1) {
534                checkEncoding(text, encodingList[e], *id);
535            }
536
537            delete[] encodingList;
538        }
539    }
540
541    delete root;
542    delete parser;
543#endif
544}
545
546void CharsetDetectionTest::IBM424Test()
547{
548    UErrorCode status = U_ZERO_ERROR;
549
550    static const UChar chars[] = {
551            0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
552            0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
553            0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
554            0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
555            0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
556            0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
557            0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
558            0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
559            0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
560            0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
561            0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
562            0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
563            0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
564            0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
565            0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
566            0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
567            0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
568    };
569
570    static const UChar chars_reverse[] = {
571            0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
572            0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
573            0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
574            0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
575            0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
576            0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
577            0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
578            0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
579            0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
580            0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
581            0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
582            0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
583            0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
584            0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
585            0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
586            0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
587            0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
588            0x0000
589    };
590
591    int32_t bLength = 0, brLength = 0;
592
593    UnicodeString s1(chars);
594    UnicodeString s2(chars_reverse);
595
596    char *bytes = extractBytes(s1, "IBM424", bLength);
597    char *bytes_r = extractBytes(s2, "IBM424", brLength);
598
599    UCharsetDetector *csd = ucsdet_open(&status);
600    if (U_FAILURE(status)) {
601        errln("Error opening charset detector. - %s", u_errorName(status));
602    }
603    const UCharsetMatch *match;
604    const char *name;
605
606    ucsdet_setText(csd, bytes, bLength, &status);
607    match = ucsdet_detect(csd, &status);
608
609    if (match == NULL) {
610        errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
611        goto bail;
612    }
613
614    name  = ucsdet_getName(match, &status);
615    if (strcmp(name, "IBM424_rtl") != 0) {
616        errln("Encoding detection failure for IBM424_rtl: got %s", name);
617    }
618
619    ucsdet_setText(csd, bytes_r, brLength, &status);
620    match = ucsdet_detect(csd, &status);
621
622    if (match == NULL) {
623        errln("Encoding detection failure for IBM424_ltr: got no matches.");
624        goto bail;
625    }
626
627    name  = ucsdet_getName(match, &status);
628    if (strcmp(name, "IBM424_ltr") != 0) {
629        errln("Encoding detection failure for IBM424_ltr: got %s", name);
630    }
631
632bail:
633    freeBytes(bytes);
634    freeBytes(bytes_r);
635    ucsdet_close(csd);
636}
637
638void CharsetDetectionTest::IBM420Test()
639{
640    UErrorCode status = U_ZERO_ERROR;
641
642    static const UChar chars[] = {
643        0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
644        0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
645        0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
646        0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
647        0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
648        0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
649        0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
650        0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
651        0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
652        0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
653        0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
654        0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
655        0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
656        0x0000
657    };
658    static const UChar chars_reverse[] = {
659        0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
660        0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
661        0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
662        0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
663        0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
664        0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
665        0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
666        0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
667        0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
668        0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
669        0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
670        0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
671        0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
672        0x0000,
673    };
674
675    int32_t bLength = 0, brLength = 0;
676
677    UnicodeString s1(chars);
678    UnicodeString s2(chars_reverse);
679
680    char *bytes = extractBytes(s1, "IBM420", bLength);
681    char *bytes_r = extractBytes(s2, "IBM420", brLength);
682
683    UCharsetDetector *csd = ucsdet_open(&status);
684    if (U_FAILURE(status)) {
685        errln("Error opening charset detector. - %s", u_errorName(status));
686    }
687    const UCharsetMatch *match;
688    const char *name;
689
690    ucsdet_setText(csd, bytes, bLength, &status);
691    match = ucsdet_detect(csd, &status);
692
693    if (match == NULL) {
694        errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
695        goto bail;
696    }
697
698    name  = ucsdet_getName(match, &status);
699    if (strcmp(name, "IBM420_rtl") != 0) {
700        errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
701    }
702
703    ucsdet_setText(csd, bytes_r, brLength, &status);
704    match = ucsdet_detect(csd, &status);
705
706    if (match == NULL) {
707        errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
708        goto bail;
709    }
710
711    name  = ucsdet_getName(match, &status);
712    if (strcmp(name, "IBM420_ltr") != 0) {
713        errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
714    }
715
716bail:
717    freeBytes(bytes);
718    freeBytes(bytes_r);
719    ucsdet_close(csd);
720}
721
722
723void CharsetDetectionTest::Ticket6394Test() {
724#if !UCONFIG_NO_CONVERSION
725    const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
726                             "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
727                             "encodings more than once.  The hop through UnicodeString is for platforms "
728                             "where this char * string is be EBCDIC and needs conversion to Latin1.";
729    char latin1Text[sizeof(charText)];
730    UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
731
732    UErrorCode status = U_ZERO_ERROR;
733    UCharsetDetector *csd = ucsdet_open(&status);
734    ucsdet_setText(csd, latin1Text, -1, &status);
735    if (U_FAILURE(status)) {
736        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
737        return;
738    }
739
740    int32_t matchCount = 0;
741    const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
742    if (U_FAILURE(status)) {
743        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
744        return;
745    }
746
747    UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
748    int32_t i;
749    for (i=0; i<matchCount; i++) {
750        UnicodeString charSetName(ucsdet_getName(matches[i], &status));
751        if (U_FAILURE(status)) {
752            errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
753            status = U_ZERO_ERROR;
754        }
755        if (setOfCharsetNames.contains(charSetName)) {
756            errln("Fail at file %s, line %d ", __FILE__, __LINE__);
757            errln(UnicodeString("   Duplicate charset name = ") + charSetName);
758        }
759        setOfCharsetNames.add(charSetName);
760    }
761    ucsdet_close(csd);
762#endif
763}
764
765
766// Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
767//               similar Windows and non-Windows SBCS encodings. State was kept in the shared
768//               Charset Recognizer objects, and could be overwritten.
769void CharsetDetectionTest::Ticket6954Test() {
770#if !UCONFIG_NO_CONVERSION
771    UErrorCode status = U_ZERO_ERROR;
772    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
773    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
774                            "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
775    UnicodeString sWindows  = ssWindows.unescape();
776    int32_t lISO = 0, lWindows = 0;
777    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
778    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
779
780    // First do a plain vanilla detect of 1252 text
781
782    UCharsetDetector *csd1 = ucsdet_open(&status);
783    ucsdet_setText(csd1, bWindows, lWindows, &status);
784    const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
785    const char *name1 = ucsdet_getName(match1, &status);
786    TEST_ASSERT_SUCCESS(status);
787    TEST_ASSERT(strcmp(name1, "windows-1252")==0);
788
789    // Next, using a completely separate detector, detect some 8859-1 text
790
791    UCharsetDetector *csd2 = ucsdet_open(&status);
792    ucsdet_setText(csd2, bISO, lISO, &status);
793    const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
794    const char *name2 = ucsdet_getName(match2, &status);
795    TEST_ASSERT_SUCCESS(status);
796    TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
797
798    // Recheck the 1252 results from the first detector, which should not have been
799    //  altered by the use of a different detector.
800
801    name1 = ucsdet_getName(match1, &status);
802    TEST_ASSERT_SUCCESS(status);
803    TEST_ASSERT(strcmp(name1, "windows-1252")==0);
804
805    ucsdet_close(csd1);
806    ucsdet_close(csd2);
807    freeBytes(bISO);
808    freeBytes(bWindows);
809#endif
810}
811