1/*
2*******************************************************************************
3* Copyright (C) 2012-2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* collationtest.cpp
7*
8* created on: 2012apr27
9* created by: Markus W. Scherer
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/coll.h"
17#include "unicode/errorcode.h"
18#include "unicode/localpointer.h"
19#include "unicode/normalizer2.h"
20#include "unicode/sortkey.h"
21#include "unicode/std_string.h"
22#include "unicode/strenum.h"
23#include "unicode/tblcoll.h"
24#include "unicode/uiter.h"
25#include "unicode/uniset.h"
26#include "unicode/unistr.h"
27#include "unicode/usetiter.h"
28#include "unicode/ustring.h"
29#include "charstr.h"
30#include "cmemory.h"
31#include "collation.h"
32#include "collationdata.h"
33#include "collationfcd.h"
34#include "collationiterator.h"
35#include "collationroot.h"
36#include "collationrootelements.h"
37#include "collationruleparser.h"
38#include "collationweights.h"
39#include "cstring.h"
40#include "intltest.h"
41#include "normalizer2impl.h"
42#include "ucbuf.h"
43#include "uhash.h"
44#include "uitercollationiterator.h"
45#include "utf16collationiterator.h"
46#include "utf8collationiterator.h"
47#include "uvectr32.h"
48#include "uvectr64.h"
49#include "writesrc.h"
50
// Number of elements of a C array (must be a real array, not a pointer),
// cast to int32_t so that it can be compared with signed ICU lengths.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof(*(array)))
52
53// TODO: Move to ucbuf.h
54U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);
55
56class CodePointIterator;
57
58// TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
59
60class CollationTest : public IntlTest {
61public:
62    CollationTest()
63            : fcd(NULL), nfd(NULL),
64              fileLineNumber(0),
65              coll(NULL) {}
66
67    ~CollationTest() {
68        delete coll;
69    }
70
71    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
72
73    void TestMinMax();
74    void TestImplicits();
75    void TestNulTerminated();
76    void TestIllegalUTF8();
77    void TestShortFCDData();
78    void TestFCD();
79    void TestCollationWeights();
80    void TestRootElements();
81    void TestTailoredElements();
82    void TestDataDriven();
83
84private:
85    void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
86    void checkAllocWeights(CollationWeights &cw,
87                           uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
88                           int32_t someLength, int32_t minCount);
89
90    static UnicodeString printSortKey(const uint8_t *p, int32_t length);
91    static UnicodeString printCollationKey(const CollationKey &key);
92
93    // Helpers & fields for data-driven test.
94    static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
95    static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
96    static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
97    int32_t skipSpaces(int32_t i) {
98        while(isSpace(fileLine[i])) { ++i; }
99        return i;
100    }
101
102    UBool readLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
103    void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
104    Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
105    void parseAndSetAttribute(IcuTestErrorCode &errorCode);
106    void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
107    void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
108    void setRootCollator(IcuTestErrorCode &errorCode);
109    void setLocaleCollator(IcuTestErrorCode &errorCode);
110
111    UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
112
113    UBool getSortKeyParts(const UChar *s, int32_t length,
114                          CharString &dest, int32_t partSize,
115                          IcuTestErrorCode &errorCode);
116    UBool getCollationKey(const char *norm, const UnicodeString &line,
117                          const UChar *s, int32_t length,
118                          CollationKey &key, IcuTestErrorCode &errorCode);
119    UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
120                          const UnicodeString &prevString, const UnicodeString &s,
121                          UCollationResult expectedOrder, Collation::Level expectedLevel,
122                          IcuTestErrorCode &errorCode);
123    void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
124
125    const Normalizer2 *fcd, *nfd;
126    UnicodeString fileLine;
127    int32_t fileLineNumber;
128    UnicodeString fileTestName;
129    Collator *coll;
130};
131
132extern IntlTest *createCollationTest() {
133    return new CollationTest();
134}
135
136void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
137    if(exec) {
138        logln("TestSuite CollationTest: ");
139    }
140    TESTCASE_AUTO_BEGIN;
141    TESTCASE_AUTO(TestMinMax);
142    TESTCASE_AUTO(TestImplicits);
143    TESTCASE_AUTO(TestNulTerminated);
144    TESTCASE_AUTO(TestIllegalUTF8);
145    TESTCASE_AUTO(TestShortFCDData);
146    TESTCASE_AUTO(TestFCD);
147    TESTCASE_AUTO(TestCollationWeights);
148    TESTCASE_AUTO(TestRootElements);
149    TESTCASE_AUTO(TestTailoredElements);
150    TESTCASE_AUTO(TestDataDriven);
151    TESTCASE_AUTO_END;
152}
153
154void CollationTest::TestMinMax() {
155    IcuTestErrorCode errorCode(*this, "TestMinMax");
156
157    setRootCollator(errorCode);
158    if(errorCode.isFailure()) {
159        errorCode.reset();
160        return;
161    }
162    RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
163    if(rbc == NULL) {
164        errln("the root collator is not a RuleBasedCollator");
165        return;
166    }
167
168    static const UChar s[2] = { 0xfffe, 0xffff };
169    UVector64 ces(errorCode);
170    rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
171    errorCode.assertSuccess();
172    if(ces.size() != 2) {
173        errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
174        return;
175    }
176    int64_t ce = ces.elementAti(0);
177    int64_t expected =
178        ((int64_t)Collation::MERGE_SEPARATOR_PRIMARY << 32) |
179        Collation::MERGE_SEPARATOR_LOWER32;
180    if(ce != expected) {
181        errln("CE(U+fffe)=%04lx != 02.02.02", (long)ce);
182    }
183
184    ce = ces.elementAti(1);
185    expected = Collation::makeCE(Collation::MAX_PRIMARY);
186    if(ce != expected) {
187        errln("CE(U+ffff)=%04lx != max..", (long)ce);
188    }
189}
190
191void CollationTest::TestImplicits() {
192    IcuTestErrorCode errorCode(*this, "TestImplicits");
193
194    const CollationData *cd = CollationRoot::getData(errorCode);
195    if(errorCode.logDataIfFailureAndReset("CollationRoot::getBaseData()")) {
196        return;
197    }
198
199    // Implicit primary weights should be assigned for the following sets,
200    // and sort in ascending order by set and then code point.
201    // See http://www.unicode.org/reports/tr10/#Implicit_Weights
202    // core Han Unified Ideographs
203    UnicodeSet coreHan("[\\p{unified_ideograph}&"
204                            "[\\p{Block=CJK_Unified_Ideographs}"
205                            "\\p{Block=CJK_Compatibility_Ideographs}]]",
206                       errorCode);
207    // all other Unified Han ideographs
208    UnicodeSet otherHan("[\\p{unified ideograph}-"
209                            "[\\p{Block=CJK_Unified_Ideographs}"
210                            "\\p{Block=CJK_Compatibility_Ideographs}]]",
211                        errorCode);
212    UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
213    unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
214    if(errorCode.logIfFailureAndReset("UnicodeSet")) {
215        return;
216    }
217    const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
218    UChar32 prev = 0;
219    uint32_t prevPrimary = 0;
220    UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
221    for(int32_t i = 0; i < LENGTHOF(sets); ++i) {
222        LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
223        while(iter->next()) {
224            UChar32 c = iter->getCodepoint();
225            UnicodeString s(c);
226            ci.setText(s.getBuffer(), s.getBuffer() + s.length());
227            int64_t ce = ci.nextCE(errorCode);
228            int64_t ce2 = ci.nextCE(errorCode);
229            if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
230                return;
231            }
232            if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
233                errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
234                continue;
235            }
236            if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
237                errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
238                      (long)c, (long)(ce & 0xffffffff));
239                continue;
240            }
241            uint32_t primary = (uint32_t)(ce >> 32);
242            if(!(primary > prevPrimary)) {
243                errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
244                      (long)c, (long)primary, (long)prev, (long)prevPrimary);
245            }
246            prev = c;
247            prevPrimary = primary;
248        }
249    }
250}
251
252void CollationTest::TestNulTerminated() {
253    IcuTestErrorCode errorCode(*this, "TestNulTerminated");
254    const CollationData *data = CollationRoot::getData(errorCode);
255    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
256        return;
257    }
258
259    static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
260
261    UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
262    UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
263    for(int32_t i = 0;; ++i) {
264        int64_t ce1 = ci1.nextCE(errorCode);
265        int64_t ce2 = ci2.nextCE(errorCode);
266        if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
267            return;
268        }
269        if(ce1 != ce2) {
270            errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
271            break;
272        }
273        if(ce1 == Collation::NO_CE) { break; }
274    }
275}
276
277void CollationTest::TestIllegalUTF8() {
278    IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
279
280    setRootCollator(errorCode);
281    if(errorCode.isFailure()) {
282        errorCode.reset();
283        return;
284    }
285    coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
286
287    static const char *strings[] = {
288        // U+FFFD
289        "a\xef\xbf\xbdz",
290        // illegal byte sequences
291        "a\x80z",  // trail byte
292        "a\xc1\x81z",  // non-shortest form
293        "a\xe0\x82\x83z",  // non-shortest form
294        "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
295        "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
296        "a\xf0\x8f\xbf\xbfz",  // non-shortest form
297        "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
298    };
299
300    StringPiece fffd(strings[0]);
301    for(int32_t i = 1; i < LENGTHOF(strings); ++i) {
302        StringPiece illegal(strings[i]);
303        UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
304        if(order != UCOL_EQUAL) {
305            errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
306                  (int)i, order);
307        }
308    }
309}
310
311namespace {
312
313void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
314    for(UChar32 c = 0x10000; c < 0x110000;) {
315        UChar32 next = c + 0x400;
316        if(src.containsSome(c, next - 1)) {
317            dest.add(U16_LEAD(c));
318        }
319        c = next;
320    }
321}
322
323}  // namespace
324
325void CollationTest::TestShortFCDData() {
326    // See CollationFCD class comments.
327    IcuTestErrorCode errorCode(*this, "TestShortFCDData");
328    UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
329    errorCode.assertSuccess();
330    expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
331    addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
332    UnicodeSet lccc;  // actual
333    for(UChar32 c = 0; c <= 0xffff; ++c) {
334        if(CollationFCD::hasLccc(c)) { lccc.add(c); }
335    }
336    UnicodeSet diff(expectedLccc);
337    diff.removeAll(lccc);
338    diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
339    UnicodeString empty("[]");
340    UnicodeString diffString;
341    diff.toPattern(diffString, TRUE);
342    assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
343    diff = lccc;
344    diff.removeAll(expectedLccc);
345    diff.toPattern(diffString, TRUE);
346    assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
347
348    UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
349    if (errorCode.isSuccess()) {
350        addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
351        addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
352        UnicodeSet tccc;  // actual
353        for(UChar32 c = 0; c <= 0xffff; ++c) {
354            if(CollationFCD::hasTccc(c)) { tccc.add(c); }
355        }
356        diff = expectedTccc;
357        diff.removeAll(tccc);
358        diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
359        assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
360        diff = tccc;
361        diff.removeAll(expectedTccc);
362        diff.toPattern(diffString, TRUE);
363        assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
364    }
365}
366
367class CodePointIterator {
368public:
369    CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
370    void resetToStart() { pos = 0; }
371    UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
372    UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
373    int32_t getLength() const { return length; }
374    int getIndex() const { return (int)pos; }
375private:
376    const UChar32 *cp;
377    int32_t length;
378    int32_t pos;
379};
380
381void CollationTest::checkFCD(const char *name,
382                             CollationIterator &ci, CodePointIterator &cpi) {
383    IcuTestErrorCode errorCode(*this, "checkFCD");
384
385    // Iterate forward to the limit.
386    for(;;) {
387        UChar32 c1 = ci.nextCodePoint(errorCode);
388        UChar32 c2 = cpi.next();
389        if(c1 != c2) {
390            errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
391                  name, (long)c1, (long)c2, cpi.getIndex());
392            return;
393        }
394        if(c1 < 0) { break; }
395    }
396
397    // Iterate backward most of the way.
398    for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
399        UChar32 c1 = ci.previousCodePoint(errorCode);
400        UChar32 c2 = cpi.previous();
401        if(c1 != c2) {
402            errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
403                  name, (long)c1, (long)c2, cpi.getIndex());
404            return;
405        }
406    }
407
408    // Forward again.
409    for(;;) {
410        UChar32 c1 = ci.nextCodePoint(errorCode);
411        UChar32 c2 = cpi.next();
412        if(c1 != c2) {
413            errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
414                  name, (long)c1, (long)c2, cpi.getIndex());
415            return;
416        }
417        if(c1 < 0) { break; }
418    }
419
420    // Iterate backward to the start.
421    for(;;) {
422        UChar32 c1 = ci.previousCodePoint(errorCode);
423        UChar32 c2 = cpi.previous();
424        if(c1 != c2) {
425            errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
426                  name, (long)c1, (long)c2, cpi.getIndex());
427            return;
428        }
429        if(c1 < 0) { break; }
430    }
431}
432
433void CollationTest::TestFCD() {
434    IcuTestErrorCode errorCode(*this, "TestFCD");
435    const CollationData *data = CollationRoot::getData(errorCode);
436    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
437        return;
438    }
439
440    // Input string, not FCD, NUL-terminated.
441    static const UChar s[] = {
442        0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
443        U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
444        0x327, 0x308,  // ccc=202, 230
445        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
446        U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
447        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
448        0xac01,
449        0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
450        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
451        0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
452        0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
453        0x4e00, 0xf81,
454        0
455    };
456    // Expected code points.
457    static const UChar32 cp[] = {
458        0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
459        0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
460        0x1D15F, 0x1D16D,
461        0xac01,
462        0x63, 0x327, 0x1D165, 0x1D16D,
463        0x61,
464        0xf71, 0xf71, 0xf72, 0xf74, 0x301,
465        0x4e00, 0xf71, 0xf80
466    };
467
468    FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
469    if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
470        return;
471    }
472    CodePointIterator cpi(cp, LENGTHOF(cp));
473    checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
474
475#if U_HAVE_STD_STRING
476    cpi.resetToStart();
477    std::string utf8;
478    UnicodeString(s).toUTF8String(utf8);
479    FCDUTF8CollationIterator u8ci(data, FALSE,
480                                  reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
481    if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
482        return;
483    }
484    checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
485#endif
486
487    cpi.resetToStart();
488    UCharIterator iter;
489    uiter_setString(&iter, s, LENGTHOF(s) - 1);  // -1: without the terminating NUL
490    FCDUIterCollationIterator uici(data, FALSE, iter, 0);
491    if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
492        return;
493    }
494    checkFCD("FCDUIterCollationIterator", uici, cpi);
495}
496
497void CollationTest::checkAllocWeights(CollationWeights &cw,
498                                      uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
499                                      int32_t someLength, int32_t minCount) {
500    if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
501        errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
502              (long)lowerLimit, (long)upperLimit, (long)n);
503        return;
504    }
505    uint32_t previous = lowerLimit;
506    int32_t count = 0;  // number of weights that have someLength
507    for(int32_t i = 0; i < n; ++i) {
508        uint32_t w = cw.nextWeight();
509        if(w == 0xffffffff) {
510            errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
511                  "returns only %ld weights",
512                  (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
513            return;
514        }
515        if(!(previous < w && w < upperLimit)) {
516            errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
517                  "number %ld -> %lx not between %lx and %lx",
518                  (long)lowerLimit, (long)upperLimit, (long)n,
519                  (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
520            return;
521        }
522        if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
523    }
524    if(count < minCount) {
525        errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
526              "returns only %ld < %ld weights of length %d",
527              (long)lowerLimit, (long)upperLimit, (long)n,
528              (long)count, (long)minCount, (int)someLength);
529    }
530}
531
532void CollationTest::TestCollationWeights() {
533    CollationWeights cw;
534
535    // Non-compressible primaries use 254 second bytes 02..FF.
536    logln("CollationWeights.initForPrimary(non-compressible)");
537    cw.initForPrimary(FALSE);
538    // Expect 1 weight 11 and 254 weights 12xx.
539    checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
540    checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
541    // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
542    checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
543    // Expect 254 two-byte weights from the ranges 10ff and 11xx.
544    checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
545    // Expect 254^2=64516 three-byte weights.
546    // During computation, there should be 3 three-byte ranges
547    // 10ffff, 11xxxx, 120202.
548    // The middle one should be split 64515:1,
549    // and the newly-split-off range and the last ranged lengthened.
550    checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
551    // Expect weights 1102 & 1103.
552    checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
553    // Expect weights 102102 & 102103.
554    checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
555
556    // Compressible primaries use 251 second bytes 04..FE.
557    logln("CollationWeights.initForPrimary(compressible)");
558    cw.initForPrimary(TRUE);
559    // Expect 1 weight 11 and 251 weights 12xx.
560    checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
561    checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
562    // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
563    checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
564    // Expect weights 1104 & 1105.
565    checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
566    // Expect weights 102102 & 102103.
567    checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
568
569    // Secondary and tertiary weights use only bytes 3 & 4.
570    logln("CollationWeights.initForSecondary()");
571    cw.initForSecondary();
572    // Expect weights fbxx and all four fc..ff.
573    checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
574
575    logln("CollationWeights.initForTertiary()");
576    cw.initForTertiary();
577    // Expect weights 3dxx and both 3e & 3f.
578    checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
579}
580
581namespace {
582
583UBool isValidCE(const CollationRootElements &re, const CollationData &data,
584                uint32_t p, uint32_t s, uint32_t ctq) {
585    uint32_t p1 = p >> 24;
586    uint32_t p2 = (p >> 16) & 0xff;
587    uint32_t p3 = (p >> 8) & 0xff;
588    uint32_t p4 = p & 0xff;
589    uint32_t s1 = s >> 8;
590    uint32_t s2 = s & 0xff;
591    // ctq = Case, Tertiary, Quaternary
592    uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
593    uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
594    uint32_t t1 = t >> 8;
595    uint32_t t2 = t & 0xff;
596    uint32_t q = ctq & Collation::QUATERNARY_MASK;
597    // No leading zero bytes.
598    if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
599        return FALSE;
600    }
601    // No intermediate zero bytes.
602    if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
603        return FALSE;
604    }
605    if(p2 != 0 && p3 == 0 && p4 != 0) {
606        return FALSE;
607    }
608    // Minimum & maximum lead bytes.
609    if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
610            (s1 != 0 && s1 <= Collation::MERGE_SEPARATOR_BYTE) ||
611            (t1 != 0 && t1 <= Collation::MERGE_SEPARATOR_BYTE)) {
612        return FALSE;
613    }
614    if(t1 != 0 && t1 > 0x3f) {
615        return FALSE;
616    }
617    if(c > 2) {
618        return FALSE;
619    }
620    // The valid byte range for the second primary byte depends on compressibility.
621    if(p2 != 0) {
622        if(data.isCompressibleLeadByte(p1)) {
623            if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
624                    Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
625                return FALSE;
626            }
627        } else {
628            if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
629                return FALSE;
630            }
631        }
632    }
633    // Other bytes just need to avoid the level separator.
634    // Trailing zeros are ok.
635    U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
636    if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
637            s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
638        return FALSE;
639    }
640    // Well-formed CEs.
641    if(p == 0) {
642        if(s == 0) {
643            if(t == 0) {
644                // Completely ignorable CE.
645                // Quaternary CEs are not supported.
646                if(c != 0 || q != 0) {
647                    return FALSE;
648                }
649            } else {
650                // Tertiary CE.
651                if(t < re.getTertiaryBoundary() || c != 2) {
652                    return FALSE;
653                }
654            }
655        } else {
656            // Secondary CE.
657            if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
658                return FALSE;
659            }
660        }
661    } else {
662        // Primary CE.
663        if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
664                s >= re.getSecondaryBoundary()) {
665            return FALSE;
666        }
667        if(t == 0 || t >= re.getTertiaryBoundary()) {
668            return FALSE;
669        }
670    }
671    return TRUE;
672}
673
674UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
675    uint32_t p = (uint32_t)(ce >> 32);
676    uint32_t secTer = (uint32_t)ce;
677    return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
678}
679
680class RootElementsIterator {
681public:
682    RootElementsIterator(const CollationData &root)
683            : data(root),
684              elements(root.rootElements), length(root.rootElementsLength),
685              pri(0), secTer(0),
686              index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
687
688    UBool next() {
689        if(index >= length) { return FALSE; }
690        uint32_t p = elements[index];
691        if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
692        if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
693            ++index;
694            secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
695            return TRUE;
696        }
697        if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
698            // End of a range, enumerate the primaries in the range.
699            int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
700            p &= 0xffffff00;
701            if(pri == p) {
702                // Finished the range, return the next CE after it.
703                ++index;
704                return next();
705            }
706            U_ASSERT(pri < p);
707            // Return the next primary in this range.
708            UBool isCompressible = data.isCompressiblePrimary(pri);
709            if((pri & 0xffff) == 0) {
710                pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
711            } else {
712                pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
713            }
714            return TRUE;
715        }
716        // Simple primary CE.
717        ++index;
718        pri = p;
719        secTer = Collation::COMMON_SEC_AND_TER_CE;
720        return TRUE;
721    }
722
723    uint32_t getPrimary() const { return pri; }
724    uint32_t getSecTer() const { return secTer; }
725
726private:
727    const CollationData &data;
728    const uint32_t *elements;
729    int32_t length;
730
731    uint32_t pri;
732    uint32_t secTer;
733    int32_t index;
734};
735
736}  // namespace
737
738void CollationTest::TestRootElements() {
739    IcuTestErrorCode errorCode(*this, "TestRootElements");
740    const CollationData *root = CollationRoot::getData(errorCode);
741    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
742        return;
743    }
744    CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
745    RootElementsIterator iter(*root);
746
747    // We check each root CE for validity,
748    // and we also verify that there is a tailoring gap between each two CEs.
749    CollationWeights cw1c;  // compressible primary weights
750    CollationWeights cw1u;  // uncompressible primary weights
751    CollationWeights cw2;
752    CollationWeights cw3;
753
754    cw1c.initForPrimary(TRUE);
755    cw1u.initForPrimary(FALSE);
756    cw2.initForSecondary();
757    cw3.initForTertiary();
758
759    // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
760    // nor the special merge-separator CE for U+FFFE.
761    uint32_t prevPri = 0;
762    uint32_t prevSec = 0;
763    uint32_t prevTer = 0;
764    while(iter.next()) {
765        uint32_t pri = iter.getPrimary();
766        uint32_t secTer = iter.getSecTer();
767        // CollationRootElements CEs must have 0 case and quaternary bits.
768        if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
769            errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
770                  (long)pri, (long)secTer);
771        }
772        uint32_t sec = secTer >> 16;
773        uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
774        uint32_t ctq = ter;
775        if(pri == 0 && sec == 0 && ter != 0) {
776            // Tertiary CEs must have uppercase bits,
777            // but they are not stored in the CollationRootElements.
778            ctq |= 0x8000;
779        }
780        if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
781            errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
782        } else {
783            if(pri != prevPri) {
784                uint32_t newWeight = 0;
785                if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
786                    // There is currently no tailoring gap after primary ignorables,
787                    // and we forbid tailoring after U+FFFD and U+FFFF.
788                } else if(root->isCompressiblePrimary(prevPri)) {
789                    if(!cw1c.allocWeights(prevPri, pri, 1)) {
790                        errln("no primary/compressible tailoring gap between %08lx and %08lx",
791                              (long)prevPri, (long)pri);
792                    } else {
793                        newWeight = cw1c.nextWeight();
794                    }
795                } else {
796                    if(!cw1u.allocWeights(prevPri, pri, 1)) {
797                        errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
798                              (long)prevPri, (long)pri);
799                    } else {
800                        newWeight = cw1u.nextWeight();
801                    }
802                }
803                if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
804                    errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
805                          (long)prevPri, (long)newWeight, (long)pri);
806                }
807            } else if(sec != prevSec) {
808                uint32_t lowerLimit =
809                    prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
810                if(!cw2.allocWeights(lowerLimit, sec, 1)) {
811                    errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
812                } else {
813                    uint32_t newWeight = cw2.nextWeight();
814                    if(!(prevSec < newWeight && newWeight < sec)) {
815                        errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
816                              (long)lowerLimit, (long)newWeight, (long)sec);
817                    }
818                }
819            } else if(ter != prevTer) {
820                uint32_t lowerLimit =
821                    prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
822                if(!cw3.allocWeights(lowerLimit, ter, 1)) {
823                    errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
824                } else {
825                    uint32_t newWeight = cw3.nextWeight();
826                    if(!(prevTer < newWeight && newWeight < ter)) {
827                        errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
828                              (long)lowerLimit, (long)newWeight, (long)ter);
829                    }
830                }
831            } else {
832                errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
833            }
834        }
835        prevPri = pri;
836        prevSec = sec;
837        prevTer = ter;
838    }
839}
840
/**
 * Walks every locale returned by Collator::getAvailableLocales() (starting with "root"),
 * and for each locale every "collation" keyword value plus the default type.
 * For each collator whose actual locale has not been seen before, fetches the tailored set
 * and checks with isValidCE() that every CE of every tailored string is valid
 * relative to the root elements.
 * The root collator itself is excluded by pre-seeding the set of seen locales.
 */
void CollationTest::TestTailoredElements() {
    IcuTestErrorCode errorCode(*this, "TestTailoredElements");
    const CollationData *root = CollationRoot::getData(errorCode);
    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
        return;
    }
    CollationRootElements rootElements(root->rootElements, root->rootElementsLength);

    // Set of actual-locale names that have already been handled.
    // The keys are uprv_strdup() copies which the hash table frees via uprv_free.
    UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
    if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
        return;
    }
    uhash_setKeyDeleter(prevLocales, uprv_free);
    // TestRootElements() tests the root collator which does not have tailorings.
    uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
    uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
    uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);

    // Reused for the CEs of each tailored string.
    UVector64 ces(errorCode);
    LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
    U_ASSERT(locales.isValid());
    const char *localeID = "root";
    do {
        Locale locale(localeID);
        LocalPointer<StringEnumeration> types(
                Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
        errorCode.assertSuccess();
        const char *type = NULL;  // default type
        // Note: Each "continue" below jumps to the do-while condition,
        // which fetches the next collation type.
        do {
            Locale localeWithType(locale);
            if(type != NULL) {
                localeWithType.setKeywordValue("collation", type, errorCode);
            }
            errorCode.assertSuccess();
            // This local collator shadows the member variable "coll".
            LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
            if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
                                              localeWithType.getName())) {
                continue;
            }
            // Skip collators whose actual locale has been tested already.
            Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
            if(uhash_geti(prevLocales, actual.getName()) != 0) {
                continue;
            }
            uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
            errorCode.assertSuccess();
            logln("TestTailoredElements(): requested %s -> actual %s",
                  localeWithType.getName(), actual.getName());
            // internalGetCEs() is called on a RuleBasedCollator; skip other Collator subclasses.
            RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
            if(rbc == NULL) {
                continue;
            }
            // Note: It would be better to get tailored strings such that we can
            // identify the prefix, and only get the CEs for the prefix+string,
            // not also for the prefix.
            // There is currently no API for that.
            // It would help in an unusual case where a contraction starting in the prefix
            // extends past its end, and we do not see the intended mapping.
            // For example, for a mapping p|st, if there is also a contraction ps,
            // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
            LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
            errorCode.assertSuccess();
            UnicodeSetIterator iter(*tailored);
            while(iter.next()) {
                const UnicodeString &s = iter.getString();
                ces.removeAllElements();
                rbc->internalGetCEs(s, ces, errorCode);
                errorCode.assertSuccess();
                // Report each invalid CE together with the string that produced it.
                for(int32_t i = 0; i < ces.size(); ++i) {
                    int64_t ce = ces.elementAti(i);
                    if(!isValidCE(rootElements, *root, ce)) {
                        errln("invalid tailored CE %016llx at CE index %d from string:",
                              (long long)ce, (int)i);
                        infoln(prettify(s));
                    }
                }
            }
        } while((type = types->next(NULL, errorCode)) != NULL);
    } while((localeID = locales->next(NULL, errorCode)) != NULL);
    uhash_close(prevLocales);
}
921
922UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
923    UnicodeString s;
924    for(int32_t i = 0; i < length; ++i) {
925        if(i > 0) { s.append((UChar)0x20); }
926        uint8_t b = p[i];
927        if(b == 0) {
928            s.append((UChar)0x2e);  // period
929        } else if(b == 1) {
930            s.append((UChar)0x7c);  // vertical bar
931        } else {
932            appendHex(b, 2, s);
933        }
934    }
935    return s;
936}
937
938UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
939    int32_t length;
940    const uint8_t *p = key.getByteArray(length);
941    return printSortKey(p, length);
942}
943
944UBool CollationTest::readLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
945    int32_t lineLength;
946    const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
947    if(line == NULL || errorCode.isFailure()) {
948        fileLine.remove();
949        return FALSE;
950    }
951    ++fileLineNumber;
952    // Strip trailing CR/LF, comments, and spaces.
953    const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
954    if(comment != NULL) {
955        lineLength = (int32_t)(comment - line);
956    } else {
957        while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
958    }
959    while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
960    fileLine.setTo(FALSE, line, lineLength);
961    return TRUE;
962}
963
964void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
965                                UErrorCode &errorCode) {
966    int32_t length = fileLine.length();
967    int32_t i;
968    for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
969    int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|'
970    if(pipeIndex >= 0) {
971        prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
972        if(prefix.isEmpty()) {
973            errln("empty prefix on line %d", (int)fileLineNumber);
974            infoln(fileLine);
975            errorCode = U_PARSE_ERROR;
976            return;
977        }
978        start = pipeIndex + 1;
979    } else {
980        prefix.remove();
981    }
982    s = fileLine.tempSubStringBetween(start, i).unescape();
983    if(s.isEmpty()) {
984        errln("empty string on line %d", (int)fileLineNumber);
985        infoln(fileLine);
986        errorCode = U_PARSE_ERROR;
987        return;
988    }
989    start = i;
990}
991
992Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
993    Collation::Level relation;
994    int32_t start;
995    if(fileLine[0] == 0x3c) {  // <
996        UChar second = fileLine[1];
997        start = 2;
998        switch(second) {
999        case 0x31:  // <1
1000            relation = Collation::PRIMARY_LEVEL;
1001            break;
1002        case 0x32:  // <2
1003            relation = Collation::SECONDARY_LEVEL;
1004            break;
1005        case 0x33:  // <3
1006            relation = Collation::TERTIARY_LEVEL;
1007            break;
1008        case 0x34:  // <4
1009            relation = Collation::QUATERNARY_LEVEL;
1010            break;
1011        case 0x63:  // <c
1012            relation = Collation::CASE_LEVEL;
1013            break;
1014        case 0x69:  // <i
1015            relation = Collation::IDENTICAL_LEVEL;
1016            break;
1017        default:  // just <
1018            relation = Collation::NO_LEVEL;
1019            start = 1;
1020            break;
1021        }
1022    } else if(fileLine[0] == 0x3d) {  // =
1023        relation = Collation::ZERO_LEVEL;
1024        start = 1;
1025    } else {
1026        start = 0;
1027    }
1028    if(start == 0 || !isSpace(fileLine[start])) {
1029        errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1030        infoln(fileLine);
1031        errorCode.set(U_PARSE_ERROR);
1032        return Collation::NO_LEVEL;
1033    }
1034    start = skipSpaces(start);
1035    UnicodeString prefix;
1036    parseString(start, prefix, s, errorCode);
1037    if(errorCode.isSuccess() && !prefix.isEmpty()) {
1038        errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1039        infoln(fileLine);
1040        errorCode.set(U_PARSE_ERROR);
1041        return Collation::NO_LEVEL;
1042    }
1043    if(start < fileLine.length()) {
1044        errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1045        infoln(fileLine);
1046        errorCode.set(U_PARSE_ERROR);
1047        return Collation::NO_LEVEL;
1048    }
1049    return relation;
1050}
1051
/**
 * Maps attribute names, as written in the test data file, to UColAttribute constants.
 * Searched linearly by CollationTest::parseAndSetAttribute().
 */
static const struct {
    const char *name;
    UColAttribute attr;
} attributes[] = {
    { "backwards", UCOL_FRENCH_COLLATION },
    { "alternate", UCOL_ALTERNATE_HANDLING },
    { "caseFirst", UCOL_CASE_FIRST },
    { "caseLevel", UCOL_CASE_LEVEL },
    // UCOL_NORMALIZATION_MODE is turned on and off automatically.
    { "strength", UCOL_STRENGTH },
    // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
    { "numeric", UCOL_NUMERIC_COLLATION }
};
1065
/**
 * Maps attribute value names, as written in the test data file,
 * to UColAttributeValue constants.
 * Searched linearly by CollationTest::parseAndSetAttribute().
 * The table does not record which values are legal for which attribute;
 * parseAndSetAttribute() reports a failing Collator::setAttribute() as an
 * illegal attribute=value combination.
 */
static const struct {
    const char *name;
    UColAttributeValue value;
} attributeValues[] = {
    { "default", UCOL_DEFAULT },
    { "primary", UCOL_PRIMARY },
    { "secondary", UCOL_SECONDARY },
    { "tertiary", UCOL_TERTIARY },
    { "quaternary", UCOL_QUATERNARY },
    { "identical", UCOL_IDENTICAL },
    { "off", UCOL_OFF },
    { "on", UCOL_ON },
    { "shifted", UCOL_SHIFTED },
    { "non-ignorable", UCOL_NON_IGNORABLE },
    { "lower", UCOL_LOWER_FIRST },
    { "upper", UCOL_UPPER_FIRST }
};
1083
1084void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1085    int32_t start = skipSpaces(1);
1086    int32_t equalPos = fileLine.indexOf(0x3d);
1087    if(equalPos < 0) {
1088        if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1089            parseAndSetReorderCodes(start + 7, errorCode);
1090            return;
1091        }
1092        errln("missing '=' on line %d", (int)fileLineNumber);
1093        infoln(fileLine);
1094        errorCode.set(U_PARSE_ERROR);
1095        return;
1096    }
1097
1098    UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1099    UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1100    if(attrString == UNICODE_STRING("maxVariable", 11)) {
1101        UColReorderCode max;
1102        if(valueString == UNICODE_STRING("space", 5)) {
1103            max = UCOL_REORDER_CODE_SPACE;
1104        } else if(valueString == UNICODE_STRING("punct", 5)) {
1105            max = UCOL_REORDER_CODE_PUNCTUATION;
1106        } else if(valueString == UNICODE_STRING("symbol", 6)) {
1107            max = UCOL_REORDER_CODE_SYMBOL;
1108        } else if(valueString == UNICODE_STRING("currency", 8)) {
1109            max = UCOL_REORDER_CODE_CURRENCY;
1110        } else {
1111            errln("invalid attribute value name on line %d", (int)fileLineNumber);
1112            infoln(fileLine);
1113            errorCode.set(U_PARSE_ERROR);
1114            return;
1115        }
1116        coll->setMaxVariable(max, errorCode);
1117        if(errorCode.isFailure()) {
1118            errln("setMaxVariable() failed on line %d: %s",
1119                  (int)fileLineNumber, errorCode.errorName());
1120            infoln(fileLine);
1121            return;
1122        }
1123        fileLine.remove();
1124        return;
1125    }
1126
1127    UColAttribute attr;
1128    for(int32_t i = 0;; ++i) {
1129        if(i == LENGTHOF(attributes)) {
1130            errln("invalid attribute name on line %d", (int)fileLineNumber);
1131            infoln(fileLine);
1132            errorCode.set(U_PARSE_ERROR);
1133            return;
1134        }
1135        if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1136            attr = attributes[i].attr;
1137            break;
1138        }
1139    }
1140
1141    UColAttributeValue value;
1142    for(int32_t i = 0;; ++i) {
1143        if(i == LENGTHOF(attributeValues)) {
1144            errln("invalid attribute value name on line %d", (int)fileLineNumber);
1145            infoln(fileLine);
1146            errorCode.set(U_PARSE_ERROR);
1147            return;
1148        }
1149        if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1150            value = attributeValues[i].value;
1151            break;
1152        }
1153    }
1154
1155    coll->setAttribute(attr, value, errorCode);
1156    if(errorCode.isFailure()) {
1157        errln("illegal attribute=value combination on line %d: %s",
1158              (int)fileLineNumber, errorCode.errorName());
1159        infoln(fileLine);
1160        return;
1161    }
1162    fileLine.remove();
1163}
1164
1165void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1166    UVector32 reorderCodes(errorCode);
1167    while(start < fileLine.length()) {
1168        start = skipSpaces(start);
1169        int32_t limit = start;
1170        while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1171        CharString name;
1172        name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1173        int32_t code = CollationRuleParser::getReorderCode(name.data());
1174        if(code < -1) {
1175            errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1176            infoln(fileLine);
1177            errorCode.set(U_PARSE_ERROR);
1178            return;
1179        }
1180        reorderCodes.addElement(code, errorCode);
1181        start = limit;
1182    }
1183    coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1184    if(errorCode.isFailure()) {
1185        errln("setReorderCodes() failed on line %d: %s", (int)fileLineNumber, errorCode.errorName());
1186        infoln(fileLine);
1187        return;
1188    }
1189    fileLine.remove();
1190}
1191
1192void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1193    UnicodeString rules;
1194    while(readLine(f, errorCode)) {
1195        if(fileLine.isEmpty()) { continue; }
1196        if(isSectionStarter(fileLine[0])) { break; }
1197        rules.append(fileLine.unescape());
1198    }
1199    if(errorCode.isFailure()) { return; }
1200    logln(rules);
1201
1202    UParseError parseError;
1203    UnicodeString reason;
1204    delete coll;
1205    coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1206    if(coll == NULL) {
1207        errln("unable to allocate a new collator");
1208        errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1209        return;
1210    }
1211    if(errorCode.isFailure()) {
1212        errln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1213        infoln(UnicodeString("  reason: ") + reason);
1214        if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1215        if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1216            infoln(UnicodeString("  snippet: ...") +
1217                parseError.preContext + "(!)" + parseError.postContext + "...");
1218        }
1219    } else {
1220        assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1221                     UnicodeString(), reason);
1222    }
1223}
1224
1225void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1226    if(errorCode.isFailure()) { return; }
1227    delete coll;
1228    coll = Collator::createInstance(Locale::getRoot(), errorCode);
1229    if(errorCode.isFailure()) {
1230        dataerrln("unable to create a root collator");
1231        return;
1232    }
1233}
1234
1235void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1236    if(errorCode.isFailure()) { return; }
1237    CharString langTag;
1238    langTag.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1239    char localeID[ULOC_FULLNAME_CAPACITY];
1240    int32_t parsedLength;
1241    (void)uloc_forLanguageTag(
1242        langTag.data(), localeID, LENGTHOF(localeID), &parsedLength, errorCode);
1243    Locale locale(localeID);
1244    if(fileLine.length() == 9 ||
1245            errorCode.isFailure() || errorCode.get() == U_STRING_NOT_TERMINATED_WARNING ||
1246            parsedLength != langTag.length() || locale.isBogus()) {
1247        errln("invalid language tag on line %d", (int)fileLineNumber);
1248        infoln(fileLine);
1249        if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1250        return;
1251    }
1252
1253    logln("creating a collator for locale ID %s", locale.getName());
1254    Collator *newColl = Collator::createInstance(locale, errorCode);
1255    if(errorCode.isFailure()) {
1256        dataerrln("unable to create a collator for locale %s on line %d",
1257                  locale.getName(), (int)fileLineNumber);
1258        infoln(fileLine);
1259        return;
1260    }
1261    delete coll;
1262    coll = newColl;
1263}
1264
1265UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1266    if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1267    // In some sequences with Tibetan composite vowel signs,
1268    // even if the string passes the FCD check,
1269    // those composites must be decomposed.
1270    // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1271    int32_t index = 0;
1272    while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1273        if(++index < s.length()) {
1274            UChar c = s[index];
1275            if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1276        }
1277    }
1278    return FALSE;
1279}
1280
1281UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1282                                     CharString &dest, int32_t partSize,
1283                                     IcuTestErrorCode &errorCode) {
1284    if(errorCode.isFailure()) { return FALSE; }
1285    uint8_t part[32];
1286    U_ASSERT(partSize <= LENGTHOF(part));
1287    UCharIterator iter;
1288    uiter_setString(&iter, s, length);
1289    uint32_t state[2] = { 0, 0 };
1290    for(;;) {
1291        int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1292        UBool done = partLength < partSize;
1293        if(done) {
1294            // At the end, append the next byte as well which should be 00.
1295            ++partLength;
1296        }
1297        dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1298        if(done) {
1299            return errorCode.isSuccess();
1300        }
1301    }
1302}
1303
1304UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1305                                     const UChar *s, int32_t length,
1306                                     CollationKey &key, IcuTestErrorCode &errorCode) {
1307    if(errorCode.isFailure()) { return FALSE; }
1308    coll->getCollationKey(s, length, key, errorCode);
1309    if(errorCode.isFailure()) {
1310        infoln(fileTestName);
1311        errln("Collator(%s).getCollationKey() failed: %s",
1312              norm, errorCode.errorName());
1313        infoln(line);
1314        return FALSE;
1315    }
1316    int32_t keyLength;
1317    const uint8_t *keyBytes = key.getByteArray(keyLength);
1318    if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1319        infoln(fileTestName);
1320        errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1321              norm);
1322        infoln(line);
1323        infoln(printCollationKey(key));
1324        return FALSE;
1325    }
1326
1327    int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1328    if(numLevels < UCOL_IDENTICAL) {
1329        ++numLevels;
1330    } else {
1331        numLevels = 5;
1332    }
1333    if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1334        ++numLevels;
1335    }
1336    errorCode.assertSuccess();
1337    int32_t numLevelSeparators = 0;
1338    for(int32_t i = 0; i < (keyLength - 1); ++i) {
1339        uint8_t b = keyBytes[i];
1340        if(b == 0) {
1341            infoln(fileTestName);
1342            errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1343            infoln(line);
1344            infoln(printCollationKey(key));
1345            return FALSE;
1346        }
1347        if(b == 1) { ++numLevelSeparators; }
1348    }
1349    if(numLevelSeparators != (numLevels - 1)) {
1350        infoln(fileTestName);
1351        errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1352              norm, (int)numLevelSeparators, (int)numLevels);
1353        infoln(line);
1354        infoln(printCollationKey(key));
1355        return FALSE;
1356    }
1357
1358    // If s contains U+FFFE, check that merged segments make the same key.
1359    LocalMemory<uint8_t> mergedKey;
1360    int32_t mergedKeyLength = 0;
1361    int32_t mergedKeyCapacity = 0;
1362    int32_t sLength = (length >= 0) ? length : u_strlen(s);
1363    int32_t segmentStart = 0;
1364    for(int32_t i = 0;;) {
1365        if(i == sLength) {
1366            if(segmentStart == 0) {
1367                // s does not contain any U+FFFE.
1368                break;
1369            }
1370        } else if(s[i] != 0xfffe) {
1371            ++i;
1372            continue;
1373        }
1374        // Get the sort key for another segment and merge it into mergedKey.
1375        CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
1376        CollationKey key2;
1377        coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1378        int32_t key1Length, key2Length;
1379        const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1380        const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1381        uint8_t *dest;
1382        int32_t minCapacity = key1Length + key2Length;
1383        if(key1Length > 0) { --minCapacity; }
1384        if(minCapacity <= mergedKeyCapacity) {
1385            dest = mergedKey.getAlias();
1386        } else {
1387            if(minCapacity <= 200) {
1388                mergedKeyCapacity = 200;
1389            } else if(minCapacity <= 2 * mergedKeyCapacity) {
1390                mergedKeyCapacity *= 2;
1391            } else {
1392                mergedKeyCapacity = minCapacity;
1393            }
1394            dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1395        }
1396        U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1397        if(key1Length == 0) {
1398            // key2 is the sort key for the first segment.
1399            uprv_memcpy(dest, key2Bytes, key2Length);
1400            mergedKeyLength = key2Length;
1401        } else {
1402            mergedKeyLength =
1403                ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1404                                   dest, mergedKeyCapacity);
1405        }
1406        if(i == sLength) { break; }
1407        segmentStart = ++i;
1408    }
1409    if(segmentStart != 0 &&
1410            (mergedKeyLength != keyLength ||
1411            uprv_memcmp(mergedKey.getAlias(), keyBytes, keyLength) != 0)) {
1412        infoln(fileTestName);
1413        errln("Collator(%s).getCollationKey(with U+FFFE) != "
1414              "ucol_mergeSortkeys(segments)",
1415              norm);
1416        infoln(line);
1417        infoln(printCollationKey(key));
1418        infoln(printSortKey(mergedKey.getAlias(), mergedKeyLength));
1419        return FALSE;
1420    }
1421
1422    // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1423    static const int32_t partSizes[] = { 32, 3, 1 };
1424    for(int32_t psi = 0; psi < LENGTHOF(partSizes); ++psi) {
1425        int32_t partSize = partSizes[psi];
1426        CharString parts;
1427        if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1428            infoln(fileTestName);
1429            errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1430                  norm, (int)partSize, errorCode.errorName());
1431            infoln(line);
1432            return FALSE;
1433        }
1434        if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1435            infoln(fileTestName);
1436            errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1437                  norm, (int)partSize);
1438            infoln(line);
1439            infoln(printCollationKey(key));
1440            infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1441            return FALSE;
1442        }
1443    }
1444    return TRUE;
1445}
1446
1447namespace {
1448
1449/**
1450 * Replaces unpaired surrogates with U+FFFD.
1451 * Returns s if no replacement was made, otherwise buffer.
1452 */
1453const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1454    int32_t i = 0;
1455    while(i < s.length()) {
1456        UChar32 c = s.char32At(i);
1457        if(U_IS_SURROGATE(c)) {
1458            if(buffer.length() < i) {
1459                buffer.append(s, buffer.length(), i - buffer.length());
1460            }
1461            buffer.append((UChar)0xfffd);
1462        }
1463        i += U16_LENGTH(c);
1464    }
1465    if(buffer.isEmpty()) {
1466        return s;
1467    }
1468    if(buffer.length() < i) {
1469        buffer.append(s, buffer.length(), i - buffer.length());
1470    }
1471    return buffer;
1472}
1473
1474}
1475
1476UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1477                                     const UnicodeString &prevString, const UnicodeString &s,
1478                                     UCollationResult expectedOrder, Collation::Level expectedLevel,
1479                                     IcuTestErrorCode &errorCode) {
1480    if(errorCode.isFailure()) { return FALSE; }
1481
1482    // Get the sort keys first, for error debug output.
1483    CollationKey prevKey;
1484    if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1485                        prevKey, errorCode)) {
1486        return FALSE;
1487    }
1488    CollationKey key;
1489    if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1490
1491    UCollationResult order = coll->compare(prevString, s, errorCode);
1492    if(order != expectedOrder || errorCode.isFailure()) {
1493        infoln(fileTestName);
1494        errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1495              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1496        infoln(prevFileLine);
1497        infoln(fileLine);
1498        infoln(printCollationKey(prevKey));
1499        infoln(printCollationKey(key));
1500        return FALSE;
1501    }
1502    order = coll->compare(s, prevString, errorCode);
1503    if(order != -expectedOrder || errorCode.isFailure()) {
1504        infoln(fileTestName);
1505        errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1506              (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1507        infoln(prevFileLine);
1508        infoln(fileLine);
1509        infoln(printCollationKey(prevKey));
1510        infoln(printCollationKey(key));
1511        return FALSE;
1512    }
1513    // Test NUL-termination if the strings do not contain NUL characters.
1514    UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1515    if(!containNUL) {
1516        order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1517        if(order != expectedOrder || errorCode.isFailure()) {
1518            infoln(fileTestName);
1519            errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1520                  (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1521            infoln(prevFileLine);
1522            infoln(fileLine);
1523            infoln(printCollationKey(prevKey));
1524            infoln(printCollationKey(key));
1525            return FALSE;
1526        }
1527        order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1528        if(order != -expectedOrder || errorCode.isFailure()) {
1529            infoln(fileTestName);
1530            errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1531                  (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1532            infoln(prevFileLine);
1533            infoln(fileLine);
1534            infoln(printCollationKey(prevKey));
1535            infoln(printCollationKey(key));
1536            return FALSE;
1537        }
1538    }
1539
1540#if U_HAVE_STD_STRING
1541    // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1542    // Unpaired surrogates cannot be converted to UTF-8.
1543    // Create valid UTF-16 strings if necessary, and use those for
1544    // both the expected compare() result and for the input to compare(UTF-8).
1545    UnicodeString prevBuffer, sBuffer;
1546    const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1547    const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1548    std::string prevUTF8, sUTF8;
1549    UnicodeString(prevValid).toUTF8String(prevUTF8);
1550    UnicodeString(sValid).toUTF8String(sUTF8);
1551    UCollationResult expectedUTF8Order;
1552    if(&prevValid == &prevString && &sValid == &s) {
1553        expectedUTF8Order = expectedOrder;
1554    } else {
1555        expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1556    }
1557
1558    order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1559    if(order != expectedUTF8Order || errorCode.isFailure()) {
1560        infoln(fileTestName);
1561        errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1562              (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1563        infoln(prevFileLine);
1564        infoln(fileLine);
1565        infoln(printCollationKey(prevKey));
1566        infoln(printCollationKey(key));
1567        return FALSE;
1568    }
1569    order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1570    if(order != -expectedUTF8Order || errorCode.isFailure()) {
1571        infoln(fileTestName);
1572        errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1573              (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1574        infoln(prevFileLine);
1575        infoln(fileLine);
1576        infoln(printCollationKey(prevKey));
1577        infoln(printCollationKey(key));
1578        return FALSE;
1579    }
1580    // Test NUL-termination if the strings do not contain NUL characters.
1581    if(!containNUL) {
1582        order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1583        if(order != expectedUTF8Order || errorCode.isFailure()) {
1584            infoln(fileTestName);
1585            errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1586                  (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1587            infoln(prevFileLine);
1588            infoln(fileLine);
1589            infoln(printCollationKey(prevKey));
1590            infoln(printCollationKey(key));
1591            return FALSE;
1592        }
1593        order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1594        if(order != -expectedUTF8Order || errorCode.isFailure()) {
1595            infoln(fileTestName);
1596            errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1597                  (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1598            infoln(prevFileLine);
1599            infoln(fileLine);
1600            infoln(printCollationKey(prevKey));
1601            infoln(printCollationKey(key));
1602            return FALSE;
1603        }
1604    }
1605#endif
1606
1607    UCharIterator leftIter;
1608    UCharIterator rightIter;
1609    uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1610    uiter_setString(&rightIter, s.getBuffer(), s.length());
1611    order = coll->compare(leftIter, rightIter, errorCode);
1612    if(order != expectedOrder || errorCode.isFailure()) {
1613        infoln(fileTestName);
1614        errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1615              "wrong order: %d != %d (%s)",
1616              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1617        infoln(prevFileLine);
1618        infoln(fileLine);
1619        infoln(printCollationKey(prevKey));
1620        infoln(printCollationKey(key));
1621        return FALSE;
1622    }
1623
1624    order = prevKey.compareTo(key, errorCode);
1625    if(order != expectedOrder || errorCode.isFailure()) {
1626        infoln(fileTestName);
1627        errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1628              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1629        infoln(prevFileLine);
1630        infoln(fileLine);
1631        infoln(printCollationKey(prevKey));
1632        infoln(printCollationKey(key));
1633        return FALSE;
1634    }
1635    if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1636        int32_t prevKeyLength;
1637        const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1638        int32_t keyLength;
1639        const uint8_t *bytes = key.getByteArray(keyLength);
1640        int32_t level = Collation::PRIMARY_LEVEL;
1641        for(int32_t i = 0;; ++i) {
1642            uint8_t b = prevBytes[i];
1643            if(b != bytes[i]) { break; }
1644            if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1645                ++level;
1646                if(level == Collation::CASE_LEVEL &&
1647                        coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_OFF) {
1648                    ++level;
1649                }
1650            }
1651        }
1652        if(level != expectedLevel) {
1653            infoln(fileTestName);
1654            errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1655                  (int)fileLineNumber, norm, order, level, expectedLevel);
1656            infoln(prevFileLine);
1657            infoln(fileLine);
1658            infoln(printCollationKey(prevKey));
1659            infoln(printCollationKey(key));
1660            return FALSE;
1661        }
1662    }
1663    return TRUE;
1664}
1665
1666void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1667    if(errorCode.isFailure()) { return; }
1668    UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1669    UnicodeString prevString, s;
1670    prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1671    while(readLine(f, errorCode)) {
1672        if(fileLine.isEmpty()) { continue; }
1673        if(isSectionStarter(fileLine[0])) { break; }
1674        Collation::Level relation = parseRelationAndString(s, errorCode);
1675        if(errorCode.isFailure()) {
1676            errorCode.reset();
1677            break;
1678        }
1679        UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1680        Collation::Level expectedLevel = relation;
1681        s.getTerminatedBuffer();  // Ensure NUL-termination.
1682        UBool isOk = TRUE;
1683        if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1684            coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1685            isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1686                                   expectedOrder, expectedLevel, errorCode);
1687        }
1688        if(isOk) {
1689            coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1690            isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1691                                   expectedOrder, expectedLevel, errorCode);
1692        }
1693        if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1694            UnicodeString pn = nfd->normalize(prevString, errorCode);
1695            UnicodeString n = nfd->normalize(s, errorCode);
1696            pn.getTerminatedBuffer();
1697            n.getTerminatedBuffer();
1698            errorCode.assertSuccess();
1699            isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1700                                   expectedOrder, expectedLevel, errorCode);
1701        }
1702        if(!isOk) {
1703            errorCode.reset();  // already reported
1704        }
1705        prevFileLine = fileLine;
1706        prevString = s;
1707        prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1708    }
1709}
1710
1711void CollationTest::TestDataDriven() {
1712    IcuTestErrorCode errorCode(*this, "TestDataDriven");
1713
1714    fcd = Normalizer2Factory::getFCDInstance(errorCode);
1715    nfd = Normalizer2Factory::getNFDInstance(errorCode);
1716    if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1717        return;
1718    }
1719
1720    CharString path(getSourceTestData(errorCode), errorCode);
1721    path.appendPathPart("collationtest.txt", errorCode);
1722    const char *codePage = "UTF-8";
1723    LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1724    if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1725        return;
1726    }
1727    while(errorCode.isSuccess()) {
1728        // Read a new line if necessary.
1729        // Sub-parsers leave the first line set that they do not handle.
1730        if(fileLine.isEmpty()) {
1731            if(!readLine(f.getAlias(), errorCode)) { break; }
1732            continue;
1733        }
1734        if(!isSectionStarter(fileLine[0])) {
1735            errln("syntax error on line %d", (int)fileLineNumber);
1736            infoln(fileLine);
1737            return;
1738        }
1739        if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1740            fileTestName = fileLine;
1741            logln(fileLine);
1742            fileLine.remove();
1743        } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1744            setRootCollator(errorCode);
1745            fileLine.remove();
1746        } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1747            setLocaleCollator(errorCode);
1748            fileLine.remove();
1749        } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1750            buildTailoring(f.getAlias(), errorCode);
1751        } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1752            parseAndSetAttribute(errorCode);
1753        } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1754            checkCompareStrings(f.getAlias(), errorCode);
1755        } else {
1756            errln("syntax error on line %d", (int)fileLineNumber);
1757            infoln(fileLine);
1758            return;
1759        }
1760    }
1761}
1762
1763#endif  // !UCONFIG_NO_COLLATION
1764