1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2013, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_COLLATION
11
12#include "cmemory.h"
13#include "cstring.h"
14#include "ucol_imp.h"
15
16#include "unicode/coll.h"
17#include "unicode/tblcoll.h"
18#include "unicode/usearch.h"
19#include "unicode/uset.h"
20#include "unicode/ustring.h"
21
22#include "unicode/coleitr.h"
23#include "unicode/regex.h"        // TODO: make conditional on regexp being built.
24
25#include "colldata.h"
26#include "ssearch.h"
27#include "xmlparser.h"
28
29#include <stdio.h>  // for sprintf
30
// Identifier of the test case currently running; quoted by the macros below.
char testId[100];

// Reports a failure (file, line, current test ID) when x is false.
// Execution continues after the report.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}

// Reports a failure with the message m when x is false, then returns from
// the enclosing function (which therefore must return void).
#define TEST_ASSERT_M(x, m) {if (!(x)) { \
    dataerrln("Failure in file %s, line %d.   \"%s\"", __FILE__, __LINE__, m);return;}}

// Reports a failure, including the ICU error name, when errcode is a failure
// code. Execution continues after the report.
#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
    dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
          __FILE__, __LINE__, testId, u_errorName(errcode));}}

// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that any array-valued expression may be passed.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
// Allocates uninitialized storage for count objects of type via uprv_malloc.
// The whole expansion is parenthesized so it can be used inside larger expressions.
#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type)))
// Releases storage obtained from NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
46
47//---------------------------------------------------------------------------
48//
49//  Test class boilerplate
50//
51//---------------------------------------------------------------------------
52SSearchTest::SSearchTest()
53{
54}
55
56SSearchTest::~SSearchTest()
57{
58}
59
60void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
61{
62    if (exec) logln("TestSuite SSearchTest: ");
63    switch (index) {
64#if !UCONFIG_NO_BREAK_ITERATION
65       case 0: name = "searchTest";
66            if (exec) searchTest();
67            break;
68
69        case 1: name = "offsetTest";
70            if (exec) offsetTest();
71            break;
72
73        case 2: name = "monkeyTest";
74            if (exec) monkeyTest(params);
75            break;
76
77        case 3: name = "sharpSTest";
78            if (exec) sharpSTest();
79            break;
80
81        case 4: name = "goodSuffixTest";
82            if (exec) goodSuffixTest();
83            break;
84
85        case 5: name = "searchTime";
86            if (exec) searchTime();
87            break;
88#endif
89        default: name = "";
90            break; //needed to end loop
91    }
92}
93
94
95#if !UCONFIG_NO_BREAK_ITERATION
96
97#define PATH_BUFFER_SIZE 2048
98const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
99    UErrorCode status = U_ZERO_ERROR;
100    const char *testDataDirectory = IntlTest::getSourceTestData(status);
101
102    if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
103        errln("ERROR: getPath() failed - %s", u_errorName(status));
104        return NULL;
105    }
106
107    strcpy(buffer, testDataDirectory);
108    strcat(buffer, filename);
109    return buffer;
110}
111
112
113void SSearchTest::searchTest()
114{
115#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
116    UErrorCode status = U_ZERO_ERROR;
117    char path[PATH_BUFFER_SIZE];
118    const char *testFilePath = getPath(path, "ssearch.xml");
119
120    if (testFilePath == NULL) {
121        return; /* Couldn't get path: error message already output. */
122    }
123
124    LocalPointer<UXMLParser> parser(UXMLParser::createParser(status));
125    TEST_ASSERT_SUCCESS(status);
126    LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status));
127    TEST_ASSERT_SUCCESS(status);
128    if (U_FAILURE(status)) {
129        return;
130    }
131
132    const UnicodeString *debugTestCase = root->getAttribute("debug");
133    if (debugTestCase != NULL) {
134//       setenv("USEARCH_DEBUG", "1", 1);
135    }
136
137
138    const UXMLElement *testCase;
139    int32_t tc = 0;
140
141    while((testCase = root->nextChildElement(tc)) != NULL) {
142
143        if (testCase->getTagName().compare("test-case") != 0) {
144            errln("ssearch, unrecognized XML Element in test file");
145            continue;
146        }
147        const UnicodeString *id       = testCase->getAttribute("id");
148        *testId = 0;
149        if (id != NULL) {
150            id->extract(0, id->length(), testId,  sizeof(testId), US_INV);
151        }
152
153        // If debugging test case has been specified and this is not it, skip to next.
154        if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
155            continue;
156        }
157        //
158        //  Get the requested collation strength.
159        //    Default is tertiary if the XML attribute is missing from the test case.
160        //
161        const UnicodeString *strength = testCase->getAttribute("strength");
162        UColAttributeValue collatorStrength = UCOL_PRIMARY;
163        if      (strength==NULL)          { collatorStrength = UCOL_TERTIARY;}
164        else if (*strength=="PRIMARY")    { collatorStrength = UCOL_PRIMARY;}
165        else if (*strength=="SECONDARY")  { collatorStrength = UCOL_SECONDARY;}
166        else if (*strength=="TERTIARY")   { collatorStrength = UCOL_TERTIARY;}
167        else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
168        else if (*strength=="IDENTICAL")  { collatorStrength = UCOL_IDENTICAL;}
169        else {
170            // Bogus value supplied for strength.  Shouldn't happen, even from
171            //  typos, if the  XML source has been validated.
172            //  This assert is a little deceiving in that strength can be
173            //   any of the allowed values, not just TERTIARY, but it will
174            //   do the job of getting the error output.
175            TEST_ASSERT(*strength=="TERTIARY")
176        }
177
178        //
179        // Get the collator normalization flag.  Default is UCOL_OFF.
180        //
181        UColAttributeValue normalize = UCOL_OFF;
182        const UnicodeString *norm = testCase->getAttribute("norm");
183        TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
184        if (norm!=NULL && *norm=="ON") {
185            normalize = UCOL_ON;
186        }
187
188        //
189        // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
190        //
191        UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
192        const UnicodeString *alt = testCase->getAttribute("alternate_handling");
193        TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
194        if (alt != NULL && *alt == "SHIFTED") {
195            alternateHandling = UCOL_SHIFTED;
196        }
197
198        const UnicodeString defLocale("en");
199        char  clocale[100];
200        const UnicodeString *locale   = testCase->getAttribute("locale");
201        if (locale == NULL || locale->length()==0) {
202            locale = &defLocale;
203        };
204        locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
205
206
207        UnicodeString  text;
208        UnicodeString  target;
209        UnicodeString  pattern;
210        int32_t        expectedMatchStart = -1;
211        int32_t        expectedMatchLimit = -1;
212        const UXMLElement  *n;
213        int32_t                nodeCount = 0;
214
215        n = testCase->getChildElement("pattern");
216        TEST_ASSERT(n != NULL);
217        if (n==NULL) {
218            continue;
219        }
220        text = n->getText(FALSE);
221        text = text.unescape();
222        pattern.append(text);
223        nodeCount++;
224
225        n = testCase->getChildElement("pre");
226        if (n!=NULL) {
227            text = n->getText(FALSE);
228            text = text.unescape();
229            target.append(text);
230            nodeCount++;
231        }
232
233        n = testCase->getChildElement("m");
234        if (n!=NULL) {
235            expectedMatchStart = target.length();
236            text = n->getText(FALSE);
237            text = text.unescape();
238            target.append(text);
239            expectedMatchLimit = target.length();
240            nodeCount++;
241        }
242
243        n = testCase->getChildElement("post");
244        if (n!=NULL) {
245            text = n->getText(FALSE);
246            text = text.unescape();
247            target.append(text);
248            nodeCount++;
249        }
250
251        //  Check that there weren't extra things in the XML
252        TEST_ASSERT(nodeCount == testCase->countChildren());
253
254        // Open a collator and StringSearch based on the parameters
255        //   obtained from the XML.
256        //
257        status = U_ZERO_ERROR;
258        LocalUCollatorPointer collator(ucol_open(clocale, &status));
259        ucol_setStrength(collator.getAlias(), collatorStrength);
260        ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
261        ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
262        LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
263                                                               target.getBuffer(), target.length(),
264                                                               collator.getAlias(),
265                                                               NULL,     // the break iterator
266                                                               &status));
267
268        TEST_ASSERT_SUCCESS(status);
269        if (U_FAILURE(status)) {
270            continue;
271        }
272
273        int32_t foundStart = 0;
274        int32_t foundLimit = 0;
275        UBool   foundMatch;
276
277        //
278        // Do the search, check the match result against the expected results.
279        //
280        foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status);
281        TEST_ASSERT_SUCCESS(status);
282        if ((foundMatch && expectedMatchStart<0) ||
283            (foundStart != expectedMatchStart)   ||
284            (foundLimit != expectedMatchLimit)) {
285                TEST_ASSERT(FALSE);   //  ouput generic error position
286                infoln("Found, expected match start = %d, %d \n"
287                       "Found, expected match limit = %d, %d",
288                foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
289        }
290
291        // In case there are other matches...
292        // (should we only do this if the test case passed?)
293        while (foundMatch) {
294            expectedMatchStart = foundStart;
295            expectedMatchLimit = foundLimit;
296
297            foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status);
298        }
299
300        uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
301            target.getBuffer(), target.length(),
302            collator.getAlias(),
303            NULL,
304            &status));
305
306        //
307        // Do the backwards search, check the match result against the expected results.
308        //
309        foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status);
310        TEST_ASSERT_SUCCESS(status);
311        if ((foundMatch && expectedMatchStart<0) ||
312            (foundStart != expectedMatchStart)   ||
313            (foundLimit != expectedMatchLimit)) {
314                TEST_ASSERT(FALSE);   //  ouput generic error position
315                infoln("Found, expected backwards match start = %d, %d \n"
316                       "Found, expected backwards match limit = %d, %d",
317                foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
318        }
319    }
320#endif
321}
322
// One collation element plus the text offsets recorded around it.
struct Order {
    int32_t order;       // the collation element value
    int32_t lowOffset;   // lower text offset recorded for the element
    int32_t highOffset;  // upper text offset recorded for the element
};
329
330class OrderList
331{
332public:
333    OrderList();
334    OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
335    ~OrderList();
336
337    int32_t size(void) const;
338    void add(int32_t order, int32_t low, int32_t high);
339    const Order *get(int32_t index) const;
340    int32_t getLowOffset(int32_t index) const;
341    int32_t getHighOffset(int32_t index) const;
342    int32_t getOrder(int32_t index) const;
343    void reverse(void);
344    UBool compare(const OrderList &other) const;
345    UBool matchesAt(int32_t offset, const OrderList &other) const;
346
347private:
348    Order *list;
349    int32_t listMax;
350    int32_t listSize;
351};
352
353OrderList::OrderList()
354  : list(NULL),  listMax(16), listSize(0)
355{
356    list = new Order[listMax];
357}
358
359OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
360    : list(NULL), listMax(16), listSize(0)
361{
362    UErrorCode status = U_ZERO_ERROR;
363    UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
364    uint32_t strengthMask = 0;
365    int32_t order, low, high;
366
367    switch (ucol_getStrength(coll))
368    {
369    default:
370        strengthMask |= UCOL_TERTIARYORDERMASK;
371        /* fall through */
372
373    case UCOL_SECONDARY:
374        strengthMask |= UCOL_SECONDARYORDERMASK;
375        /* fall through */
376
377    case UCOL_PRIMARY:
378        strengthMask |= UCOL_PRIMARYORDERMASK;
379    }
380
381    list = new Order[listMax];
382
383    ucol_setOffset(elems, stringOffset, &status);
384
385    do {
386        low   = ucol_getOffset(elems);
387        order = ucol_next(elems, &status);
388        high  = ucol_getOffset(elems);
389
390        if (order != UCOL_NULLORDER) {
391            order &= strengthMask;
392        }
393
394        if (order != UCOL_IGNORABLE) {
395            add(order, low, high);
396        }
397    } while (order != UCOL_NULLORDER);
398
399    ucol_closeElements(elems);
400}
401
402OrderList::~OrderList()
403{
404    delete[] list;
405}
406
407void OrderList::add(int32_t order, int32_t low, int32_t high)
408{
409    if (listSize >= listMax) {
410        listMax *= 2;
411
412        Order *newList = new Order[listMax];
413
414        uprv_memcpy(newList, list, listSize * sizeof(Order));
415        delete[] list;
416        list = newList;
417    }
418
419    list[listSize].order      = order;
420    list[listSize].lowOffset  = low;
421    list[listSize].highOffset = high;
422
423    listSize += 1;
424}
425
426const Order *OrderList::get(int32_t index) const
427{
428    if (index >= listSize) {
429        return NULL;
430    }
431
432    return &list[index];
433}
434
435int32_t OrderList::getLowOffset(int32_t index) const
436{
437    const Order *order = get(index);
438
439    if (order != NULL) {
440        return order->lowOffset;
441    }
442
443    return -1;
444}
445
446int32_t OrderList::getHighOffset(int32_t index) const
447{
448    const Order *order = get(index);
449
450    if (order != NULL) {
451        return order->highOffset;
452    }
453
454    return -1;
455}
456
457int32_t OrderList::getOrder(int32_t index) const
458{
459    const Order *order = get(index);
460
461    if (order != NULL) {
462        return order->order;
463    }
464
465    return UCOL_NULLORDER;
466}
467
468int32_t OrderList::size() const
469{
470    return listSize;
471}
472
473void OrderList::reverse()
474{
475    for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
476        Order swap = list[b];
477
478        list[b] = list[f];
479        list[f] = swap;
480    }
481}
482
483UBool OrderList::compare(const OrderList &other) const
484{
485    if (listSize != other.listSize) {
486        return FALSE;
487    }
488
489    for(int32_t i = 0; i < listSize; i += 1) {
490        if (list[i].order  != other.list[i].order ||
491            list[i].lowOffset != other.list[i].lowOffset ||
492            list[i].highOffset != other.list[i].highOffset) {
493                return FALSE;
494        }
495    }
496
497    return TRUE;
498}
499
500UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
501{
502    // NOTE: sizes include the NULLORDER, which we don't want to compare.
503    int32_t otherSize = other.size() - 1;
504
505    if (listSize - 1 - offset < otherSize) {
506        return FALSE;
507    }
508
509    for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
510        if (getOrder(i) != other.getOrder(j)) {
511            return FALSE;
512        }
513    }
514
515    return TRUE;
516}
517
518static char *printOffsets(char *buffer, OrderList &list)
519{
520    int32_t size = list.size();
521    char *s = buffer;
522
523    for(int32_t i = 0; i < size; i += 1) {
524        const Order *order = list.get(i);
525
526        if (i != 0) {
527            s += sprintf(s, ", ");
528        }
529
530        s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
531    }
532
533    return buffer;
534}
535
536static char *printOrders(char *buffer, OrderList &list)
537{
538    int32_t size = list.size();
539    char *s = buffer;
540
541    for(int32_t i = 0; i < size; i += 1) {
542        const Order *order = list.get(i);
543
544        if (i != 0) {
545            s += sprintf(s, ", ");
546        }
547
548        s += sprintf(s, "%8.8X", order->order);
549    }
550
551    return buffer;
552}
553
554void SSearchTest::offsetTest()
555{
556    const char *test[] = {
557        // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
558        // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
559        "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
560
561        "\\ua191\\u16ef\\u2036\\u017a",
562
563#if 0
564        // This results in a complex interaction between contraction,
565        // expansion and normalization that confuses the backwards offset fixups.
566        "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
567#endif
568
569        "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
570        "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
571
572        "\\u02FE\\u02FF"
573        "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
574        "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
575        "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
576        "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
577        "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081
578
579        "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081
580        "a\\u02FF\\u0301\\u0316", // currently not working, see #8081
581        "a\\u02FF\\u0316\\u0301",
582        "a\\u0430\\u0301\\u0316",
583        "a\\u0430\\u0316\\u0301",
584        "abc\\u0E41\\u0301\\u0316",
585        "abc\\u0E41\\u0316\\u0301",
586        "\\u0E41\\u0301\\u0316",
587        "\\u0E41\\u0316\\u0301",
588        "a\\u0301\\u0316",
589        "a\\u0316\\u0301",
590        "\\uAC52\\uAC53",
591        "\\u34CA\\u34CB",
592        "\\u11ED\\u11EE",
593        "\\u30C3\\u30D0",
594        "p\\u00E9ch\\u00E9",
595        "a\\u0301\\u0325",
596        "a\\u0300\\u0325",
597        "a\\u0325\\u0300",
598        "A\\u0323\\u0300B",
599        "A\\u0300\\u0323B",
600        "A\\u0301\\u0323B",
601        "A\\u0302\\u0301\\u0323B",
602        "abc",
603        "ab\\u0300c",
604        "ab\\u0300\\u0323c",
605        " \\uD800\\uDC00\\uDC00",
606        "a\\uD800\\uDC00\\uDC00",
607        "A\\u0301\\u0301",
608        "A\\u0301\\u0323",
609        "A\\u0301\\u0323B",
610        "B\\u0301\\u0323C",
611        "A\\u0300\\u0323B",
612        "\\u0301A\\u0301\\u0301",
613        "abcd\\r\\u0301",
614        "p\\u00EAche",
615        "pe\\u0302che",
616    };
617
618    int32_t testCount = ARRAY_SIZE(test);
619    UErrorCode status = U_ZERO_ERROR;
620    RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
621    if (U_FAILURE(status)) {
622        errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
623        return;
624    }
625    char buffer[4096];  // A bit of a hack... just happens to be long enough for all the test cases...
626                        // We could allocate one that's the right size by (CE_count * 10) + 2
627                        // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
628
629    col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
630
631    for(int32_t i = 0; i < testCount; i += 1) {
632        if (!isICUVersionAtLeast(52, 0, 1) && i>=4 && i<=6) {
633            continue; // timebomb until ticket #9156 (was #8081) is resolved
634        }
635        UnicodeString ts = CharsToUnicodeString(test[i]);
636        CollationElementIterator *iter = col->createCollationElementIterator(ts);
637        OrderList forwardList;
638        OrderList backwardList;
639        int32_t order, low, high;
640
641        do {
642            low   = iter->getOffset();
643            order = iter->next(status);
644            high  = iter->getOffset();
645
646            forwardList.add(order, low, high);
647        } while (order != CollationElementIterator::NULLORDER);
648
649        iter->reset();
650        iter->setOffset(ts.length(), status);
651
652        backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
653
654        do {
655            high  = iter->getOffset();
656            order = iter->previous(status);
657            low   = iter->getOffset();
658
659            if (order == CollationElementIterator::NULLORDER) {
660                break;
661            }
662
663            backwardList.add(order, low, high);
664        } while (TRUE);
665
666        backwardList.reverse();
667
668        if (forwardList.compare(backwardList)) {
669            logln("Works with \"%s\"", test[i]);
670            logln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
671//          logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
672
673            logln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
674//          logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
675
676            logln();
677        } else {
678            errln("Fails with \"%s\"", test[i]);
679            infoln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
680            infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
681
682            infoln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
683            infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
684
685            infoln();
686        }
687        delete iter;
688    }
689    delete col;
690}
691
692#if 0
693static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
694{
695    for(int32_t i = 0; i < string.length(); i += 1) {
696        UChar32 ch = string.char32At(i);
697
698        if (ch >= 0x0020 && ch <= 0x007F) {
699            if (ch == 0x005C) {
700                buffer.append("\\\\");
701            } else {
702                buffer.append(ch);
703            }
704        } else {
705            char cbuffer[12];
706
707            if (ch <= 0xFFFFL) {
708                sprintf(cbuffer, "\\u%4.4X", ch);
709            } else {
710                sprintf(cbuffer, "\\U%8.8X", ch);
711            }
712
713            buffer.append(cbuffer);
714        }
715
716        if (ch >= 0x10000L) {
717            i += 1;
718        }
719    }
720
721    return buffer;
722}
723#endif
724
725void SSearchTest::sharpSTest()
726{
727    UErrorCode status = U_ZERO_ERROR;
728    UCollator *coll = NULL;
729    UnicodeString lp  = "fuss";
730    UnicodeString sp = "fu\\u00DF";
731    UnicodeString targets[]  = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
732                                "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
733                                "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
734    int32_t start = -1, end = -1;
735
736    coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
737    TEST_ASSERT_SUCCESS(status);
738
739    UnicodeString lpUnescaped = lp.unescape();
740    UnicodeString spUnescaped = sp.unescape();
741
742    LocalUStringSearchPointer ussLong(usearch_openFromCollator(lpUnescaped.getBuffer(), lpUnescaped.length(),
743                                                           lpUnescaped.getBuffer(), lpUnescaped.length(),   // actual test data will be set later
744                                                           coll,
745                                                           NULL,     // the break iterator
746                                                           &status));
747
748    LocalUStringSearchPointer ussShort(usearch_openFromCollator(spUnescaped.getBuffer(), spUnescaped.length(),
749                                                           spUnescaped.getBuffer(), spUnescaped.length(),   // actual test data will be set later
750                                                           coll,
751                                                           NULL,     // the break iterator
752                                                           &status));
753    TEST_ASSERT_SUCCESS(status);
754
755    for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
756        UBool bFound;
757        UnicodeString target = targets[t].unescape();
758
759        start = end = -1;
760        usearch_setText(ussLong.getAlias(), target.getBuffer(), target.length(), &status);
761        bFound = usearch_search(ussLong.getAlias(), 0, &start, &end, &status);
762        TEST_ASSERT_SUCCESS(status);
763        if (bFound) {
764            logln("Test %d: found long pattern at [%d, %d].", t, start, end);
765        } else {
766            dataerrln("Test %d: did not find long pattern.", t);
767        }
768
769        usearch_setText(ussShort.getAlias(), target.getBuffer(), target.length(), &status);
770        bFound = usearch_search(ussShort.getAlias(), 0, &start, &end, &status);
771        TEST_ASSERT_SUCCESS(status);
772        if (bFound) {
773            logln("Test %d: found long pattern at [%d, %d].", t, start, end);
774        } else {
775            dataerrln("Test %d: did not find long pattern.", t);
776        }
777    }
778
779    ucol_close(coll);
780}
781
782void SSearchTest::goodSuffixTest()
783{
784    UErrorCode status = U_ZERO_ERROR;
785    UCollator *coll = NULL;
786    UnicodeString pat = /*"gcagagag"*/ "fxeld";
787    UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
788    int32_t start = -1, end = -1;
789    UBool bFound;
790
791    coll = ucol_open(NULL, &status);
792    TEST_ASSERT_SUCCESS(status);
793
794    LocalUStringSearchPointer ss(usearch_openFromCollator(pat.getBuffer(), pat.length(),
795                                                          target.getBuffer(), target.length(),
796                                                          coll,
797                                                          NULL,     // the break iterator
798                                                          &status));
799    TEST_ASSERT_SUCCESS(status);
800
801    bFound = usearch_search(ss.getAlias(), 0, &start, &end, &status);
802    TEST_ASSERT_SUCCESS(status);
803    if (bFound) {
804        logln("Found pattern at [%d, %d].", start, end);
805    } else {
806        dataerrln("Did not find pattern.");
807    }
808
809    ucol_close(coll);
810}
811
812//
813//  searchTime()    A quick and dirty performance test for string search.
814//                  Probably  doesn't really belong as part of intltest, but it
815//                  does check that the search succeeds, and gets the right result,
816//                  so it serves as a functionality test also.
817//
818//                  To run as a perf test, up the loop count, select by commenting
819//                  and uncommenting in the code the operation to be measured,
820//                  rebuild, and measure the running time of this test alone.
821//
822//                     time LD_LIBRARY_PATH=whatever  ./intltest  collate/SSearchTest/searchTime
823//
824void SSearchTest::searchTime() {
825    static const char *longishText =
826"Whylom, as olde stories tellen us,\n"
827"Ther was a duk that highte Theseus:\n"
828"Of Athenes he was lord and governour,\n"
829"And in his tyme swich a conquerour,\n"
830"That gretter was ther noon under the sonne.\n"
831"Ful many a riche contree hadde he wonne;\n"
832"What with his wisdom and his chivalrye,\n"
833"He conquered al the regne of Femenye,\n"
834"That whylom was y-cleped Scithia;\n"
835"And weddede the quene Ipolita,\n"
836"And broghte hir hoom with him in his contree\n"
837"With muchel glorie and greet solempnitee,\n"
838"And eek hir yonge suster Emelye.\n"
839"And thus with victorie and with melodye\n"
840"Lete I this noble duk to Athenes ryde,\n"
841"And al his hoost, in armes, him bisyde.\n"
842"And certes, if it nere to long to here,\n"
843"I wolde han told yow fully the manere,\n"
844"How wonnen was the regne of Femenye\n"
845"By Theseus, and by his chivalrye;\n"
846"And of the grete bataille for the nones\n"
847"Bitwixen Athen's and Amazones;\n"
848"And how asseged was Ipolita,\n"
849"The faire hardy quene of Scithia;\n"
850"And of the feste that was at hir weddinge,\n"
851"And of the tempest at hir hoom-cominge;\n"
852"But al that thing I moot as now forbere.\n"
853"I have, God woot, a large feeld to ere,\n"
854"And wayke been the oxen in my plough.\n"
855"The remenant of the tale is long y-nough.\n"
856"I wol nat letten eek noon of this route;\n"
857"Lat every felawe telle his tale aboute,\n"
858"And lat see now who shal the soper winne;\n"
859"And ther I lefte, I wol ageyn biginne.\n"
860"This duk, of whom I make mencioun,\n"
861"When he was come almost unto the toun,\n"
862"In al his wele and in his moste pryde,\n"
863"He was war, as he caste his eye asyde,\n"
864"Wher that ther kneled in the hye weye\n"
865"A companye of ladies, tweye and tweye,\n"
866"Ech after other, clad in clothes blake; \n"
867"But swich a cry and swich a wo they make,\n"
868"That in this world nis creature livinge,\n"
869"That herde swich another weymentinge;\n"
870"And of this cry they nolde never stenten,\n"
871"Til they the reynes of his brydel henten.\n"
872"'What folk ben ye, that at myn hoomcominge\n"
873"Perturben so my feste with cryinge'?\n"
874"Quod Theseus, 'have ye so greet envye\n"
875"Of myn honour, that thus compleyne and crye? \n"
876"Or who hath yow misboden, or offended?\n"
877"And telleth me if it may been amended;\n"
878"And why that ye ben clothed thus in blak'?\n"
879"The eldest lady of hem alle spak,\n"
880"When she hadde swowned with a deedly chere,\n"
881"That it was routhe for to seen and here,\n"
882"And seyde: 'Lord, to whom Fortune hath yiven\n"
883"Victorie, and as a conquerour to liven,\n"
884"Noght greveth us your glorie and your honour;\n"
885"But we biseken mercy and socour.\n"
886"Have mercy on our wo and our distresse.\n"
887"Som drope of pitee, thurgh thy gentilesse,\n"
888"Up-on us wrecched wommen lat thou falle.\n"
889"For certes, lord, ther nis noon of us alle,\n"
890"That she nath been a duchesse or a quene;\n"
891"Now be we caitifs, as it is wel sene:\n"
892"Thanked be Fortune, and hir false wheel,\n"
893"That noon estat assureth to be weel.\n"
894"And certes, lord, t'abyden your presence,\n"
895"Here in the temple of the goddesse Clemence\n"
896"We han ben waytinge al this fourtenight;\n"
897"Now help us, lord, sith it is in thy might.\n"
898"I wrecche, which that wepe and waille thus,\n"
899"Was whylom wyf to king Capaneus,\n"
900"That starf at Thebes, cursed be that day!\n"
901"And alle we, that been in this array,\n"
902"And maken al this lamentacioun,\n"
903"We losten alle our housbondes at that toun,\n"
904"Whyl that the sege ther-aboute lay.\n"
905"And yet now th'olde Creon, weylaway!\n"
906"The lord is now of Thebes the citee, \n"
907"Fulfild of ire and of iniquitee,\n"
908"He, for despyt, and for his tirannye,\n"
909"To do the dede bodyes vileinye,\n"
910"Of alle our lordes, whiche that ben slawe,\n"
911"Hath alle the bodyes on an heep y-drawe,\n"
912"And wol nat suffren hem, by noon assent,\n"
913"Neither to been y-buried nor y-brent,\n"
914"But maketh houndes ete hem in despyt. zet'\n";
915
916const char *cPattern = "maketh houndes ete hem";
917//const char *cPattern = "Whylom";
918//const char *cPattern = "zet";
919    const char *testId = "searchTime()";   // for error macros.
920    UnicodeString target = longishText;
921    UErrorCode status = U_ZERO_ERROR;
922
923
924    LocalUCollatorPointer collator(ucol_open("en", &status));
925    //ucol_setStrength(collator.getAlias(), collatorStrength);
926    //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
927    UnicodeString uPattern = cPattern;
928    LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
929                                                           target.getBuffer(), target.length(),
930                                                           collator.getAlias(),
931                                                           NULL,     // the break iterator
932                                                           &status));
933    TEST_ASSERT_SUCCESS(status);
934
935//  int32_t foundStart;
936//  int32_t foundEnd;
937    UBool   found;
938
    // Find the match position using strstr
940    const char *pm = strstr(longishText, cPattern);
941    TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
942    int32_t  refMatchPos = (int32_t)(pm - longishText);
943    int32_t  icuMatchPos;
944    int32_t  icuMatchEnd;
945    usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
946    TEST_ASSERT_SUCCESS(status);
947    TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
948
949    int32_t i;
950    // int32_t j=0;
951
952    // Try loopcounts around 100000 to some millions, depending on the operation,
953    //   to get runtimes of at least several seconds.
954    for (i=0; i<10000; i++) {
955        found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
956        //TEST_ASSERT_SUCCESS(status);
957        //TEST_ASSERT(found);
958
959        // usearch_setOffset(uss.getAlias(), 0, &status);
960        // icuMatchPos = usearch_next(uss.getAlias(), &status);
961
962         // The i+j stuff is to confuse the optimizer and get it to actually leave the
963         //   call to strstr in place.
964         //pm = strstr(longishText+j, cPattern);
965         //j = (j + i)%5;
966    }
967
968    //printf("%ld, %d\n", pm-longishText, j);
969}
970
971//----------------------------------------------------------------------------------------
972//
973//   Random Numbers.  Similar to standard lib rand() and srand()
974//                    Not using library to
975//                      1.  Get same results on all platforms.
976//                      2.  Get access to current seed, to more easily reproduce failures.
977//
978//---------------------------------------------------------------------------------------
// Current generator state.  monkeyTest() overwrites it when a "seed"
// parameter is supplied, so that a failing sequence can be replayed.
static uint32_t m_seed = 1;

// Advances the linear congruential generator by one step and returns
// bits 16..30 of the new state, i.e. a value in the range [0, 32767].
static uint32_t m_rand()
{
    const uint32_t nextState = m_seed * 1103515245 + 12345;

    m_seed = nextState;
    return (nextState >> 16) & 0x7FFF;
}
986
987class Monkey
988{
989public:
990    virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
991
992protected:
993    Monkey();
994    virtual ~Monkey();
995};
996
997Monkey::Monkey()
998{
999    // ook?
1000}
1001
1002Monkey::~Monkey()
1003{
1004    // ook?
1005}
1006
1007class SetMonkey : public Monkey
1008{
1009public:
1010    SetMonkey(const USet *theSet);
1011    ~SetMonkey();
1012
1013    virtual void append(UnicodeString &test, UnicodeString &alternate);
1014
1015private:
1016    const USet *set;
1017};
1018
1019SetMonkey::SetMonkey(const USet *theSet)
1020    : Monkey(), set(theSet)
1021{
1022    // ook?
1023}
1024
1025SetMonkey::~SetMonkey()
1026{
1027    //ook...
1028}
1029
1030void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1031{
1032    int32_t size = uset_size(set);
1033    int32_t index = m_rand() % size;
1034    UChar32 ch = uset_charAt(set, index);
1035    UnicodeString str(ch);
1036
1037    test.append(str);
1038    alternate.append(str); // flip case, or some junk?
1039}
1040
1041class StringSetMonkey : public Monkey
1042{
1043public:
1044    StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
1045    ~StringSetMonkey();
1046
1047    void append(UnicodeString &testCase, UnicodeString &alternate);
1048
1049private:
1050    UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
1051
1052    const USet *set;
1053    UCollator  *coll;
1054    CollData   *collData;
1055};
1056
1057StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
1058: Monkey(), set(theSet), coll(theCollator), collData(theCollData)
1059{
1060    // ook.
1061}
1062
1063StringSetMonkey::~StringSetMonkey()
1064{
1065    // ook?
1066}
1067
1068void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1069{
1070    int32_t itemCount = uset_getItemCount(set), len = 0;
1071    int32_t index = m_rand() % itemCount;
1072    UChar32 rangeStart = 0, rangeEnd = 0;
1073    UChar buffer[16];
1074    UErrorCode err = U_ZERO_ERROR;
1075
1076    len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1077
1078    if (len == 0) {
1079        int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1080        UChar32 ch = rangeStart + offset;
1081        UnicodeString str(ch);
1082
1083        testCase.append(str);
1084        generateAlternative(str, alternate);
1085    } else if (len > 0) {
1086        // should check that len < 16...
1087        UnicodeString str(buffer, len);
1088
1089        testCase.append(str);
1090        generateAlternative(str, alternate);
1091    } else {
1092        // shouldn't happen...
1093    }
1094}
1095
1096UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
1097{
1098    // find out shortest string for the longest sequence of ces.
1099    // needs to be refined to use dynamic programming, but will be roughly right
1100    UErrorCode status = U_ZERO_ERROR;
1101    CEList ceList(coll, testCase, status);
1102    UnicodeString alt;
1103    int32_t offset = 0;
1104
1105    if (ceList.size() == 0) {
1106        return alternate.append(testCase);
1107    }
1108
1109    while (offset < ceList.size()) {
1110        int32_t ce = ceList.get(offset);
1111        const StringList *strings = collData->getStringList(ce);
1112
1113        if (strings == NULL) {
1114            return alternate.append(testCase);
1115        }
1116
1117        int32_t stringCount = strings->size();
1118        int32_t tries = 0;
1119
1120        // find random string that generates the same CEList
1121        const CEList *ceList2 = NULL;
1122        const UnicodeString *string = NULL;
1123              UBool matches = FALSE;
1124
1125        do {
1126            int32_t s = m_rand() % stringCount;
1127
1128            if (tries++ > stringCount) {
1129                alternate.append(testCase);
1130                return alternate;
1131            }
1132
1133            string = strings->get(s);
1134            ceList2 = collData->getCEList(string);
1135            matches = ceList.matchesAt(offset, ceList2);
1136
1137            if (! matches) {
1138                collData->freeCEList((CEList *) ceList2);
1139            }
1140        } while (! matches);
1141
1142        alt.append(*string);
1143        offset += ceList2->size();
1144        collData->freeCEList(ceList2);
1145    }
1146
1147    const CEList altCEs(coll, alt, status);
1148
1149    if (ceList.matchesAt(0, &altCEs)) {
1150        return alternate.append(alt);
1151    }
1152
1153    return alternate.append(testCase);
1154}
1155
1156static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1157{
1158    int32_t pieces = (m_rand() % 4) + 1;
1159    UErrorCode status = U_ZERO_ERROR;
1160    UBool matches;
1161
1162    do {
1163        testCase.remove();
1164        alternate.remove();
1165        monkeys[0]->append(testCase, alternate);
1166
1167        for(int32_t piece = 0; piece < pieces; piece += 1) {
1168            int32_t monkey = m_rand() % monkeyCount;
1169
1170            monkeys[monkey]->append(testCase, alternate);
1171        }
1172
1173        const CEList ceTest(coll, testCase, status);
1174        const CEList ceAlt(coll, alternate, status);
1175
1176        matches = ceTest.matchesAt(0, &ceAlt);
1177    } while (! matches);
1178}
1179
1180static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1181{
1182    UErrorCode      status = U_ZERO_ERROR;
1183    OrderList       targetOrders(coll, target, offset);
1184    OrderList       patternOrders(coll, pattern);
1185    int32_t         targetSize  = targetOrders.size() - 1;
1186    int32_t         patternSize = patternOrders.size() - 1;
1187    UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
1188                                                  target.getBuffer(), target.length(), &status);
1189
1190    if (patternSize == 0) {
1191        // Searching for an empty pattern always fails
1192        matchStart = matchEnd = -1;
1193        ubrk_close(charBreakIterator);
1194        return FALSE;
1195    }
1196
1197    matchStart = matchEnd = -1;
1198
1199    for(int32_t i = 0; i < targetSize; i += 1) {
1200        if (targetOrders.matchesAt(i, patternOrders)) {
1201            int32_t start    = targetOrders.getLowOffset(i);
1202            int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1203            int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1204
1205            // if the low and high offsets of the first CE in
1206            // the match are the same, it means that the match
1207            // starts in the middle of an expansion - all but
1208            // the first CE of the expansion will have the offset
1209            // of the following character.
1210            if (start == targetOrders.getHighOffset(i)) {
1211                continue;
1212            }
1213
1214            // Make sure match starts on a grapheme boundary
1215            if (! ubrk_isBoundary(charBreakIterator, start)) {
1216                continue;
1217            }
1218
1219            // If the low and high offsets of the CE after the match
1220            // are the same, it means that the match ends in the middle
1221            // of an expansion sequence.
1222            if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
1223                targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
1224                continue;
1225            }
1226
1227            int32_t mend = maxLimit;
1228
1229            // Find the first grapheme break after the character index
1230            // of the last CE in the match. If it's after character index
1231            // that's after the last CE in the match, use that index
1232            // as the end of the match.
1233            if (minLimit < maxLimit) {
1234                // When the last CE's low index is same with its high index, the CE is likely
1235                // a part of expansion. In this case, the index is located just after the
1236                // character corresponding to the CEs compared above. If the index is right
1237                // at the break boundary, move the position to the next boundary will result
1238                // incorrect match length when there are ignorable characters exist between
1239                // the position and the next character produces CE(s). See ticket#8482.
1240                if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
1241                    mend = minLimit;
1242                } else {
1243                    int32_t nba = ubrk_following(charBreakIterator, minLimit);
1244
1245                    if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
1246                        mend = nba;
1247                    }
1248                }
1249            }
1250
1251            if (mend > maxLimit) {
1252                continue;
1253            }
1254
1255            if (! ubrk_isBoundary(charBreakIterator, mend)) {
1256                continue;
1257            }
1258
1259            matchStart = start;
1260            matchEnd   = mend;
1261
1262            ubrk_close(charBreakIterator);
1263            return TRUE;
1264        }
1265    }
1266
1267    ubrk_close(charBreakIterator);
1268    return FALSE;
1269}
1270
1271#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1272static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
1273    int32_t val = defaultVal;
1274
1275    name.append(" *= *(-?\\d+)");
1276
1277    UErrorCode status = U_ZERO_ERROR;
1278    RegexMatcher m(name, params, 0, status);
1279
1280    if (m.find()) {
1281        // The param exists.  Convert the string to an int.
1282        char valString[100];
1283        int32_t paramLength = m.end(1, status) - m.start(1, status);
1284
1285        if (paramLength >= (int32_t)(sizeof(valString)-1)) {
1286            paramLength = (int32_t)(sizeof(valString)-2);
1287        }
1288
1289        params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
1290        val = uprv_strtol(valString,  NULL, 10);
1291
1292        // Delete this parameter from the params string.
1293        m.reset();
1294        params = m.replaceFirst("", status);
1295    }
1296
1297  //U_ASSERT(U_SUCCESS(status));
1298    if (! U_SUCCESS(status)) {
1299        val = defaultVal;
1300    }
1301
1302    return val;
1303}
1304#endif
1305
1306#if !UCONFIG_NO_COLLATION
1307int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
1308                                    const char *name, const char *strength, uint32_t seed)
1309{
1310    UErrorCode status = U_ZERO_ERROR;
1311    int32_t actualStart = -1, actualEnd = -1;
1312  //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
1313    int32_t expectedStart = -1, expectedEnd = -1;
1314    int32_t notFoundCount = 0;
1315    LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
1316                                                           testCase.getBuffer(), testCase.length(),
1317                                                           coll,
1318                                                           NULL,     // the break iterator
1319                                                           &status));
1320
1321    // **** TODO: find *all* matches, not just first one ****
1322    simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
1323
1324    usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1325
1326    if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1327        errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1328              "    strength=%s seed=%d",
1329              name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1330    }
1331
1332    if (expectedStart == -1 && actualStart == -1) {
1333        notFoundCount += 1;
1334    }
1335
1336    // **** TODO: find *all* matches, not just first one ****
1337    simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
1338
1339    usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status);
1340
1341    usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1342
1343    if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1344        errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1345              "    strength=%s seed=%d",
1346              name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1347    }
1348
1349    if (expectedStart == -1 && actualStart == -1) {
1350        notFoundCount += 1;
1351    }
1352
1353    return notFoundCount;
1354}
1355#endif
1356
1357void SSearchTest::monkeyTest(char *params)
1358{
1359    // ook!
1360    UErrorCode status = U_ZERO_ERROR;
1361  //UCollator *coll = ucol_open(NULL, &status);
1362    UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
1363
1364    if (U_FAILURE(status)) {
1365        errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
1366        return;
1367    }
1368
1369    CollData  *monkeyData = new CollData(coll, status);
1370
1371    USet *expansions   = uset_openEmpty();
1372    USet *contractions = uset_openEmpty();
1373
1374    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
1375
1376    U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1377    U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1378    USet *letters = uset_openPattern(letter_pattern, 39, &status);
1379    SetMonkey letterMonkey(letters);
1380    StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
1381    StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
1382    UnicodeString testCase;
1383    UnicodeString alternate;
1384    UnicodeString pattern, altPattern;
1385    UnicodeString prefix, altPrefix;
1386    UnicodeString suffix, altSuffix;
1387
1388    Monkey *monkeys[] = {
1389        &letterMonkey,
1390        &contractionMonkey,
1391        &expansionMonkey,
1392        &contractionMonkey,
1393        &expansionMonkey,
1394        &contractionMonkey,
1395        &expansionMonkey,
1396        &contractionMonkey,
1397        &expansionMonkey};
1398    int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
1399    // int32_t nonMatchCount = 0;
1400
1401    UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
1402    const char *strengthNames[] = {"primary", "secondary", "tertiary"};
1403    int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
1404    int32_t loopCount = quick? 1000 : 10000;
1405    int32_t firstStrength = 0;
1406    int32_t lastStrength  = strengthCount - 1; //*/ 0;
1407
1408    if (params != NULL) {
1409#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1410        UnicodeString p(params);
1411
1412        loopCount = getIntParam("loop", p, loopCount);
1413        m_seed    = getIntParam("seed", p, m_seed);
1414
1415        RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
1416        if (m.find()) {
1417            UnicodeString breakType = m.group(1, status);
1418
1419            for (int32_t s = 0; s < strengthCount; s += 1) {
1420                if (breakType == strengthNames[s]) {
1421                    firstStrength = lastStrength = s;
1422                    break;
1423                }
1424            }
1425
1426            m.reset();
1427            p = m.replaceFirst("", status);
1428        }
1429
1430        if (RegexMatcher("\\S", p, 0, status).find()) {
1431            // Each option is stripped out of the option string as it is processed.
1432            // All options have been checked.  The option string should have been completely emptied..
1433            char buf[100];
1434            p.extract(buf, sizeof(buf), NULL, status);
1435            buf[sizeof(buf)-1] = 0;
1436            errln("Unrecognized or extra parameter:  %s\n", buf);
1437            return;
1438        }
1439#else
1440        infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
1441#endif
1442    }
1443
1444    for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
1445        int32_t notFoundCount = 0;
1446
1447        logln("Setting strength to %s.", strengthNames[s]);
1448        ucol_setStrength(coll, strengths[s]);
1449
1450        // TODO: try alternate prefix and suffix too?
1451        // TODO: alterntaes are only equal at primary strength. Is this OK?
1452        for(int32_t t = 0; t < loopCount; t += 1) {
1453            uint32_t seed = m_seed;
1454            // int32_t  nmc = 0;
1455
1456            generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
1457            generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
1458            generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);
1459
1460            // pattern
1461            notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
1462
1463            testCase.remove();
1464            testCase.append(prefix);
1465            testCase.append(/*alt*/pattern);
1466
1467            // prefix + pattern
1468            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
1469
1470            testCase.append(suffix);
1471
1472            // prefix + pattern + suffix
1473            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
1474
1475            testCase.remove();
1476            testCase.append(pattern);
1477            testCase.append(suffix);
1478
1479            // pattern + suffix
1480            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
1481        }
1482
1483       logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
1484    }
1485
1486    uset_close(contractions);
1487    uset_close(expansions);
1488    uset_close(letters);
1489    delete monkeyData;
1490
1491    ucol_close(coll);
1492}
1493
1494#endif
1495
1496#endif
1497