1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/************************************************************************
7*   Date        Name        Description
8*   12/15/99    Madhu        Creation.
9*   01/12/2000  Madhu        Updated for changed API and added new tests
10************************************************************************/
11
12#include "utypeinfo.h"  // for 'typeid' to work
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/utypes.h"
19#include "unicode/brkiter.h"
20#include "unicode/rbbi.h"
21#include "unicode/uchar.h"
22#include "unicode/utf16.h"
23#include "unicode/ucnv.h"
24#include "unicode/schriter.h"
25#include "unicode/uniset.h"
26#if !UCONFIG_NO_REGULAR_EXPRESSIONS
27#include "unicode/regex.h"
28#endif
29#include "unicode/ustring.h"
30#include "unicode/utext.h"
31#include "intltest.h"
32#include "rbbitst.h"
33#include <string.h>
34#include "uvector.h"
35#include "uvectr32.h"
36#include <string.h>
37#include <stdio.h>
38#include <stdlib.h>
39#include "unicode/numfmt.h"
40#include "unicode/uscript.h"
41
// TEST_ASSERT(x): report a failure, tagged with the current source file and
//                 line, when the condition x evaluates to false.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS(errcode): report a failure, tagged with the current source
//                 file and line plus the name of the error code, when errcode
//                 holds a failing UErrorCode.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
47
48
49//---------------------------------------------
50// runIndexedTest
51//---------------------------------------------
52
53
54//  Note:  Before adding new tests to this file, check whether the desired test data can
55//         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
56//         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
58//         will run there as well, without additional effort.
59
60void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
61{
62    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
63
64    switch (index) {
65#if !UCONFIG_NO_FILE_IO
66        case 0: name = "TestBug4153072";
67            if(exec) TestBug4153072();                         break;
68#else
69        case 0: name = "skip";
70            break;
71#endif
72
73        case 1: name = "skip";
74            break;
75        case 2: name = "TestStatusReturn";
76            if(exec) TestStatusReturn();                       break;
77
78#if !UCONFIG_NO_FILE_IO
79        case 3: name = "TestUnicodeFiles";
80            if(exec) TestUnicodeFiles();                       break;
81        case 4: name = "TestEmptyString";
82            if(exec) TestEmptyString();                        break;
83#else
84        case 3: case 4: name = "skip";
85            break;
86#endif
87
88        case 5: name = "TestGetAvailableLocales";
89            if(exec) TestGetAvailableLocales();                break;
90
91        case 6: name = "TestGetDisplayName";
92            if(exec) TestGetDisplayName();                     break;
93
94#if !UCONFIG_NO_FILE_IO
95        case 7: name = "TestEndBehaviour";
96            if(exec) TestEndBehaviour();                       break;
97        case 8: case 9: case 10: name = "skip";
98             break;
99        case 11: name = "TestWordBreaks";
100             if(exec) TestWordBreaks();                        break;
101        case 12: name = "TestWordBoundary";
102             if(exec) TestWordBoundary();                      break;
103        case 13: name = "TestLineBreaks";
104             if(exec) TestLineBreaks();                        break;
105        case 14: name = "TestSentBreaks";
106             if(exec) TestSentBreaks();                        break;
107        case 15: name = "TestExtended";
108             if(exec) TestExtended();                          break;
109#else
110        case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
111             break;
112#endif
113
114#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
115        case 16:
116            name = "TestMonkey"; if(exec)  TestMonkey(params); break;
117#else
118        case 16:
119             name = "skip";                                    break;
120#endif
121
122#if !UCONFIG_NO_FILE_IO
123        case 17: name = "TestBug3818";
124            if(exec) TestBug3818();                            break;
125#else
126        case 17: name = "skip";
127            break;
128#endif
129
130        case 18: name = "skip";
131            break;
132        case 19: name = "TestDebug";
133            if(exec) TestDebug();                              break;
134        case 20: name = "skip";
135            break;
136
137#if !UCONFIG_NO_FILE_IO
138        case 21: name = "TestBug5775";
139            if (exec) TestBug5775();                           break;
140#else
141        case 21: name = "skip";
142            break;
143#endif
144
145        case 22: name = "TestBug9983";
146            if (exec) TestBug9983();                           break;
147        case 23: name = "TestDictRules";
148            if (exec) TestDictRules();                         break;
149        case 24: name = "TestBug5532";
150            if (exec) TestBug5532();                           break;
151        default: name = ""; break; //needed to end loop
152    }
153}
154
155
156//---------------------------------------------------------------------------
157//
158//   class BITestData   Holds a set of Break iterator test data and results
159//                      Includes
160//                         - the string data to be broken
161//                         - a vector of the expected break positions.
162//                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
164//                         - The expected break tag values.
165//                         - Vectors of actual break positions and tag values.
166//                         - Functions for comparing actual with expected and
167//                            reporting errors.
168//
169//----------------------------------------------------------------------------
170class BITestData {
171public:
172    UnicodeString    fDataToBreak;
173    UVector          fExpectedBreakPositions;
174    UVector          fExpectedTags;
175    UVector          fLineNum;
176    UVector          fActualBreakPositions;   // Test Results.
177    UVector          fActualTags;
178
179    BITestData(UErrorCode &status);
180    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
181    void             checkResults(const char *heading, RBBITest *test);
182    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
183    void             clearResults();
184};
185
186//
187// Constructor.
188//
189BITestData::BITestData(UErrorCode &status)
190: fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
191  fActualTags(status)
192{
193}
194
195//
// addDataChunk.   Add a section (non-breaking) piece of data to the test data.
197//                 The macro form collects the line number, which is helpful
198//                 when tracking down failures.
199//
200//                 A null data item is inserted at the start of each test's data
201//                  to put the starting zero into the data list.  The position saved for
202//                  each non-null item is its ending position.
203//
204#define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
205void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
206    if (U_FAILURE(status)) {return;}
207    if (data != NULL) {
208        fDataToBreak.append(CharsToUnicodeString(data));
209    }
210    fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
211    fExpectedTags.addElement(tag, status);
212    fLineNum.addElement(lineNum, status);
213}
214
215
216//
217//  checkResults.   Compare the actual and expected break positions, report any differences.
218//
219void BITestData::checkResults(const char *heading, RBBITest *test) {
220    int32_t   expectedIndex = 0;
221    int32_t   actualIndex = 0;
222
223    for (;;) {
224        // If we've run through both the expected and actual results vectors, we're done.
225        //   break out of the loop.
226        if (expectedIndex >= fExpectedBreakPositions.size() &&
227            actualIndex   >= fActualBreakPositions.size()) {
228            break;
229        }
230
231
232        if (expectedIndex >= fExpectedBreakPositions.size()) {
233            err(heading, test, expectedIndex-1, actualIndex);
234            actualIndex++;
235            continue;
236        }
237
238        if (actualIndex >= fActualBreakPositions.size()) {
239            err(heading, test, expectedIndex, actualIndex-1);
240            expectedIndex++;
241            continue;
242        }
243
244        if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
245            err(heading, test, expectedIndex, actualIndex);
246            // Try to resync the positions of the indices, to avoid a rash of spurious erros.
247            if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
248                actualIndex++;
249            } else {
250                expectedIndex++;
251            }
252            continue;
253        }
254
255        if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
256            test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
257                heading, fLineNum.elementAt(expectedIndex),
258                fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
259        }
260
261        actualIndex++;
262        expectedIndex++;
263    }
264}
265
266//
267//  err   -  An error was found.  Report it, along with information about where the
268//                                incorrectly broken test data appeared in the source file.
269//
270void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
271{
272    int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
273    int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
274    int32_t   o        = 0;
275    int32_t   line     = fLineNum.elementAti(expectedIdx);
276    if (expectedIdx > 0) {
277        // The line numbers are off by one because a premature break occurs somewhere
278        //    within the previous item, rather than at the start of the current (expected) item.
279        //    We want to report the offset of the unexpected break from the start of
280        //      this previous item.
281        o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
282    }
283    if (actual < expected) {
284        test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
285    } else {
286        test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
287    }
288}
289
290
291void BITestData::clearResults() {
292    fActualBreakPositions.removeAllElements();
293    fActualTags.removeAllElements();
294}
295
296
297//--------------------------------------------------------------------------------------
298//
299//    RBBITest    constructor and destructor
300//
301//--------------------------------------------------------------------------------------
302
303RBBITest::RBBITest() {
304}
305
306
307RBBITest::~RBBITest() {
308}
309
310//-----------------------------------------------------------------------------------
311//
312//   Test for status {tag} return value from break rules.
313//        TODO:  a more thorough test.
314//
315//-----------------------------------------------------------------------------------
316void RBBITest::TestStatusReturn() {
317     UnicodeString rulesString1("$Letters = [:L:];\n"
318                                  "$Numbers = [:N:];\n"
319                                  "$Letters+{1};\n"
320                                  "$Numbers+{2};\n"
321                                  "Help\\ {4}/me\\!;\n"
322                                  "[^$Letters $Numbers];\n"
323                                  "!.*;\n", -1, US_INV);
324     UnicodeString testString1  = "abc123..abc Help me Help me!";
325                                // 01234567890123456789012345678
326     int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
327     int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
328
329     UErrorCode status=U_ZERO_ERROR;
330     UParseError    parseError;
331
332     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
333     if(U_FAILURE(status)) {
334         dataerrln("FAIL : in construction - %s", u_errorName(status));
335     } else {
336         int32_t  pos;
337         int32_t  i = 0;
338         bi->setText(testString1);
339         for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
340             if (pos != bounds1[i]) {
341                 errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
342                 break;
343             }
344
345             int tag = bi->getRuleStatus();
346             if (tag != brkStatus[i]) {
347                 errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
348                 break;
349             }
350             i++;
351         }
352     }
353     delete bi;
354}
355
356
357static void printStringBreaks(UnicodeString ustr, int expected[],
358                              int expectedcount)
359{
360    UErrorCode status = U_ZERO_ERROR;
361    char name[100];
362    printf("code    alpha extend alphanum type word sent line name\n");
363    int j;
364    for (j = 0; j < ustr.length(); j ++) {
365        if (expectedcount > 0) {
366            int k;
367            for (k = 0; k < expectedcount; k ++) {
368                if (j == expected[k]) {
369                    printf("------------------------------------------------ %d\n",
370                           j);
371                }
372            }
373        }
374        UChar32 c = ustr.char32At(j);
375        if (c > 0xffff) {
376            j ++;
377        }
378        u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
379        printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
380                           u_isUAlphabetic(c),
381                           u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
382                           u_isalnum(c),
383                           u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
384                                                  u_charType(c),
385                                                  U_SHORT_PROPERTY_NAME),
386                           u_getPropertyValueName(UCHAR_WORD_BREAK,
387                                                  u_getIntPropertyValue(c,
388                                                          UCHAR_WORD_BREAK),
389                                                  U_SHORT_PROPERTY_NAME),
390                           u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
391                                   u_getIntPropertyValue(c,
392                                           UCHAR_SENTENCE_BREAK),
393                                   U_SHORT_PROPERTY_NAME),
394                           u_getPropertyValueName(UCHAR_LINE_BREAK,
395                                   u_getIntPropertyValue(c,
396                                           UCHAR_LINE_BREAK),
397                                   U_SHORT_PROPERTY_NAME),
398                           name);
399    }
400}
401
402
403void RBBITest::TestBug3818() {
404    UErrorCode  status = U_ZERO_ERROR;
405
406    // Four Thai words...
407    static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
408                                           0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
409    UnicodeString  thaiStr(thaiWordData);
410
411    RuleBasedBreakIterator* bi =
412        (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
413    if (U_FAILURE(status) || bi == NULL) {
414        errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
415        return;
416    }
417    bi->setText(thaiStr);
418
419    int32_t  startOfSecondWord = bi->following(1);
420    if (startOfSecondWord != 4) {
421        errln("Fail at file %s, line %d expected start of word at 4, got %d",
422            __FILE__, __LINE__, startOfSecondWord);
423    }
424    startOfSecondWord = bi->following(0);
425    if (startOfSecondWord != 4) {
426        errln("Fail at file %s, line %d expected start of word at 4, got %d",
427            __FILE__, __LINE__, startOfSecondWord);
428    }
429    delete bi;
430}
431
432//----------------------------------------------------------------------------
433//
434// generalIteratorTest      Given a break iterator and a set of test data,
435//                          Run the tests and report the results.
436//
437//----------------------------------------------------------------------------
438void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
439{
440
441    bi.setText(td.fDataToBreak);
442
443    testFirstAndNext(bi, td);
444
445    testLastAndPrevious(bi, td);
446
447    testFollowing(bi, td);
448    testPreceding(bi, td);
449    testIsBoundary(bi, td);
450    doMultipleSelectionTest(bi, td);
451}
452
453
454//
455//   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
456//                       kind of loop.
457//
458void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
459{
460    UErrorCode  status = U_ZERO_ERROR;
461    int32_t     p;
462    int32_t     lastP = -1;
463    int32_t     tag;
464
465    logln("Test first and next");
466    bi.setText(td.fDataToBreak);
467    td.clearResults();
468
469    for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
470        td.fActualBreakPositions.addElement(p, status);  // Save result.
471        tag = bi.getRuleStatus();
472        td.fActualTags.addElement(tag, status);
473        if (p <= lastP) {
474            // If the iterator is not making forward progress, stop.
475            //  No need to raise an error here, it'll be detected in the normal check of results.
476            break;
477        }
478        lastP = p;
479    }
480    td.checkResults("testFirstAndNext", this);
481}
482
483
484//
485//  TestLastAndPrevious.   Run the iterator backwards, starting with last().
486//
487void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
488{
489    UErrorCode  status = U_ZERO_ERROR;
490    int32_t     p;
491    int32_t     lastP  = 0x7ffffffe;
492    int32_t     tag;
493
494    logln("Test last and previous");
495    bi.setText(td.fDataToBreak);
496    td.clearResults();
497
498    for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
499        // Save break position.  Insert it at start of vector of results, shoving
500        //    already-saved results further towards the end.
501        td.fActualBreakPositions.insertElementAt(p, 0, status);
502        // bi.previous();   // TODO:  Why does this fix things up????
503        // bi.next();
504        tag = bi.getRuleStatus();
505        td.fActualTags.insertElementAt(tag, 0, status);
506        if (p >= lastP) {
507            // If the iterator is not making progress, stop.
508            //  No need to raise an error here, it'll be detected in the normal check of results.
509            break;
510        }
511        lastP = p;
512    }
513    td.checkResults("testLastAndPrevious", this);
514}
515
516
517void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
518{
519    UErrorCode  status = U_ZERO_ERROR;
520    int32_t     p;
521    int32_t     tag;
522    int32_t     lastP  = -2;     // A value that will never be returned as a break position.
523                                 //   cannot be -1; that is returned for DONE.
524    int         i;
525
526    logln("testFollowing():");
527    bi.setText(td.fDataToBreak);
528    td.clearResults();
529
530    // Save the starting point, since we won't get that out of following.
531    p = bi.first();
532    td.fActualBreakPositions.addElement(p, status);  // Save result.
533    tag = bi.getRuleStatus();
534    td.fActualTags.addElement(tag, status);
535
536    for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
537        p = bi.following(i);
538        if (p != lastP) {
539            if (p == RuleBasedBreakIterator::DONE) {
540                break;
541            }
542            // We've reached a new break position.  Save it.
543            td.fActualBreakPositions.addElement(p, status);  // Save result.
544            tag = bi.getRuleStatus();
545            td.fActualTags.addElement(tag, status);
546            lastP = p;
547        }
548    }
549    // The loop normally exits by means of the break in the middle.
550    // Make sure that the index was at the correct position for the break iterator to have
551    //   returned DONE.
552    if (i != td.fDataToBreak.length()) {
553        errln("testFollowing():  iterator returned DONE prematurely.");
554    }
555
556    // Full check of all results.
557    td.checkResults("testFollowing", this);
558}
559
560
561
562void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
563    UErrorCode  status = U_ZERO_ERROR;
564    int32_t     p;
565    int32_t     tag;
566    int32_t     lastP  = 0x7ffffffe;
567    int         i;
568
569    logln("testPreceding():");
570    bi.setText(td.fDataToBreak);
571    td.clearResults();
572
573    p = bi.last();
574    td.fActualBreakPositions.addElement(p, status);
575    tag = bi.getRuleStatus();
576    td.fActualTags.addElement(tag, status);
577
578    for (i = td.fDataToBreak.length(); i>=-1; i--) {
579        p = bi.preceding(i);
580        if (p != lastP) {
581            if (p == RuleBasedBreakIterator::DONE) {
582                break;
583            }
584            // We've reached a new break position.  Save it.
585            td.fActualBreakPositions.insertElementAt(p, 0, status);
586            lastP = p;
587            tag = bi.getRuleStatus();
588            td.fActualTags.insertElementAt(tag, 0, status);
589        }
590    }
591    // The loop normally exits by means of the break in the middle.
592    // Make sure that the index was at the correct position for the break iterator to have
593    //   returned DONE.
594    if (i != 0) {
595        errln("testPreceding():  iterator returned DONE prematurely.");
596    }
597
598    // Full check of all results.
599    td.checkResults("testPreceding", this);
600}
601
602
603
604void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
605    UErrorCode  status = U_ZERO_ERROR;
606    int         i;
607    int32_t     tag;
608
609    logln("testIsBoundary():");
610    bi.setText(td.fDataToBreak);
611    td.clearResults();
612
613    for (i = 0; i <= td.fDataToBreak.length(); i++) {
614        if (bi.isBoundary(i)) {
615            td.fActualBreakPositions.addElement(i, status);  // Save result.
616            tag = bi.getRuleStatus();
617            td.fActualTags.addElement(tag, status);
618        }
619    }
620    td.checkResults("testIsBoundary: ", this);
621}
622
623
624
625void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
626{
627    iterator.setText(td.fDataToBreak);
628
629    RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
630    int32_t offset = iterator.first();
631    int32_t testOffset;
632    int32_t count = 0;
633
634    logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
635
636    if (*testIterator != iterator)
637        errln("clone() or operator!= failed: two clones compared unequal");
638
639    do {
640        testOffset = testIterator->first();
641        testOffset = testIterator->next(count);
642        if (offset != testOffset)
643            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
644
645        if (offset != RuleBasedBreakIterator::DONE) {
646            count++;
647            offset = iterator.next();
648
649            if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
650                errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
651                if (count > 10000 || offset == -1) {
652                    errln("operator== failed too many times. Stopping test.");
653                    if (offset == -1) {
654                        errln("Does (RuleBasedBreakIterator::DONE == -1)?");
655                    }
656                    return;
657                }
658            }
659        }
660    } while (offset != RuleBasedBreakIterator::DONE);
661
662    // now do it backwards...
663    offset = iterator.last();
664    count = 0;
665
666    do {
667        testOffset = testIterator->last();
668        testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
669        if (offset != testOffset)
670            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
671
672        if (offset != RuleBasedBreakIterator::DONE) {
673            count--;
674            offset = iterator.previous();
675        }
676    } while (offset != RuleBasedBreakIterator::DONE);
677
678    delete testIterator;
679}
680
681
682//---------------------------------------------
683//
684//     other tests
685//
686//---------------------------------------------
687void RBBITest::TestEmptyString()
688{
689    UnicodeString text = "";
690    UErrorCode status = U_ZERO_ERROR;
691
692    BITestData x(status);
693    ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
694    RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
695    if (U_FAILURE(status))
696    {
697        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
698        return;
699    }
700    generalIteratorTest(*bi, x);
701    delete bi;
702}
703
704void RBBITest::TestGetAvailableLocales()
705{
706    int32_t locCount = 0;
707    const Locale* locList = BreakIterator::getAvailableLocales(locCount);
708
709    if (locCount == 0)
710        dataerrln("getAvailableLocales() returned an empty list!");
711    // Just make sure that it's returning good memory.
712    int32_t i;
713    for (i = 0; i < locCount; ++i) {
714        logln(locList[i].getName());
715    }
716}
717
718//Testing the BreakIterator::getDisplayName() function
719void RBBITest::TestGetDisplayName()
720{
721    UnicodeString   result;
722
723    BreakIterator::getDisplayName(Locale::getUS(), result);
724    if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
725        dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
726                + result);
727
728    BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
729    if (result != "French (France)")
730        dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
731                + result);
732}
733/**
734 * Test End Behaviour
735 * @bug 4068137
736 */
737void RBBITest::TestEndBehaviour()
738{
739    UErrorCode status = U_ZERO_ERROR;
740    UnicodeString testString("boo.");
741    BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
742    if (U_FAILURE(status))
743    {
744        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
745        return;
746    }
747    wb->setText(testString);
748
749    if (wb->first() != 0)
750        errln("Didn't get break at beginning of string.");
751    if (wb->next() != 3)
752        errln("Didn't get break before period in \"boo.\"");
753    if (wb->current() != 4 && wb->next() != 4)
754        errln("Didn't get break at end of string.");
755    delete wb;
756}
757/*
758 * @bug 4153072
759 */
760void RBBITest::TestBug4153072() {
761    UErrorCode status = U_ZERO_ERROR;
762    BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
763    if (U_FAILURE(status))
764    {
765        errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
766        return;
767    }
768    UnicodeString str("...Hello, World!...");
769    int32_t begin = 3;
770    int32_t end = str.length() - 3;
771    UBool onBoundary;
772
773    StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
774    iter->adoptText(textIterator);
775    int index;
776    // Note: with the switch to UText, there is no way to restrict the
777    //       iteration range to begin at an index other than zero.
778    //       String character iterators created with a non-zero bound are
779    //         treated by RBBI as being empty.
780    for (index = -1; index < begin + 1; ++index) {
781        onBoundary = iter->isBoundary(index);
782        if (index == 0?  !onBoundary : onBoundary) {
783            errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
784                            " and begin index = " + begin);
785        }
786    }
787    delete iter;
788}
789
790
791//
792// Test for problem reported by Ashok Matoria on 9 July 2007
793//    One.<kSoftHyphen><kSpace>Two.
794//
795//    Sentence break at start (0) and then on calling next() it breaks at
796//   'T' of "Two". Now, at this point if I do next() and
//    then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
798//
799void RBBITest::TestBug5775() {
800    UErrorCode status = U_ZERO_ERROR;
801    BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
802    TEST_ASSERT_SUCCESS(status);
803    if (U_FAILURE(status)) {
804        return;
805    }
806// Check for status first for better handling of no data errors.
807    TEST_ASSERT(bi != NULL);
808    if (bi == NULL) {
809        return;
810    }
811
812    UnicodeString s("One.\\u00ad Two.", -1, US_INV);
813    //               01234      56789
814    s = s.unescape();
815    bi->setText(s);
816    int pos = bi->next();
817    TEST_ASSERT(pos == 6);
818    pos = bi->next();
819    TEST_ASSERT(pos == 10);
820    pos = bi->previous();
821    TEST_ASSERT(pos == 6);
822    delete bi;
823}
824
825
826
827//------------------------------------------------------------------------------
828//
829//   RBBITest::Extended    Run  RBBI Tests from an external test data file
830//
831//------------------------------------------------------------------------------
832
// Bundle of state describing one <data> ... </data> test case read from the
// test data file.  It is filled in by TestExtended() and consumed by
// executeTest().  The three vectors are indexed by offset into dataToBreak.
struct TestParams {
    // Break iterator to run the test case with; executeTest() does nothing
    // when this is NULL.
    BreakIterator   *bi;
    // The text that is handed to the break iterator.
    UnicodeString    dataToBreak;
    // Expected result per text offset: 0 means no break is expected there,
    // -1 means a break with rule status 0 is expected, and any other value
    // is the expected rule status of the break.
    UVector32       *expectedBreaks;
    // Line number in the test data file for each text offset; used only for
    // error messages.
    UVector32       *srcLine;
    // Column in the test data file for each text offset; used only for
    // error messages.
    UVector32       *srcCol;
};
840
841void RBBITest::executeTest(TestParams *t) {
842    int32_t    bp;
843    int32_t    prevBP;
844    int32_t    i;
845
846    if (t->bi == NULL) {
847        return;
848    }
849
850    t->bi->setText(t->dataToBreak);
851    //
852    //  Run the iterator forward
853    //
854    prevBP = -1;
855    for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
856        if (prevBP ==  bp) {
857            // Fail for lack of forward progress.
858            errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
859                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
860            break;
861        }
862
863        // Check that there were we didn't miss an expected break between the last one
864        //  and this one.
865        for (i=prevBP+1; i<bp; i++) {
866            if (t->expectedBreaks->elementAti(i) != 0) {
867                int expected[] = {0, i};
868                printStringBreaks(t->dataToBreak, expected, 2);
869                errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
870                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
871            }
872        }
873
874        // Check that the break we did find was expected
875        if (t->expectedBreaks->elementAti(bp) == 0) {
876            int expected[] = {0, bp};
877            printStringBreaks(t->dataToBreak, expected, 2);
878            errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
879                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
880        } else {
881            // The break was expected.
882            //   Check that the {nnn} tag value is correct.
883            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
884            if (expectedTagVal == -1) {
885                expectedTagVal = 0;
886            }
887            int32_t line = t->srcLine->elementAti(bp);
888            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
889            if (rs != expectedTagVal) {
890                errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
891                      "          Actual, Expected status = %4d, %4d",
892                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
893            }
894        }
895
896
897        prevBP = bp;
898    }
899
900    // Verify that there were no missed expected breaks after the last one found
901    for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
902        if (t->expectedBreaks->elementAti(i) != 0) {
903            errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
904                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
905        }
906    }
907
908    //
909    //  Run the iterator backwards, verify that the same breaks are found.
910    //
911    prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
912    for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
913        if (prevBP ==  bp) {
914            // Fail for lack of progress.
915            errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
916                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
917            break;
918        }
919
920        // Check that there were we didn't miss an expected break between the last one
921        //  and this one.  (UVector returns zeros for index out of bounds.)
922        for (i=prevBP-1; i>bp; i--) {
923            if (t->expectedBreaks->elementAti(i) != 0) {
924                errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
925                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
926            }
927        }
928
929        // Check that the break we did find was expected
930        if (t->expectedBreaks->elementAti(bp) == 0) {
931            errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
932                   bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
933        } else {
934            // The break was expected.
935            //   Check that the {nnn} tag value is correct.
936            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
937            if (expectedTagVal == -1) {
938                expectedTagVal = 0;
939            }
940            int line = t->srcLine->elementAti(bp);
941            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
942            if (rs != expectedTagVal) {
943                errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
944                      "          Actual, Expected status = %4d, %4d",
945                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
946            }
947        }
948
949        prevBP = bp;
950    }
951
952    // Verify that there were no missed breaks prior to the last one found
953    for (i=prevBP-1; i>=0; i--) {
954        if (t->expectedBreaks->elementAti(i) != 0) {
955            errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
956                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
957        }
958    }
959
960    // Check isBoundary()
961    for (i=0; i<t->expectedBreaks->size(); i++) {
962        UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0);
963        UBool boundaryFound    = t->bi->isBoundary(i);
964        if (boundaryExpected != boundaryFound) {
965            errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
966                  "        Expected, Actual= %s, %s",
967                  i, t->srcLine->elementAti(i), t->srcCol->elementAti(i),
968                  boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
969        }
970    }
971
972    // Check following()
973    for (i=0; i<t->expectedBreaks->size(); i++) {
974        int32_t actualBreak = t->bi->following(i);
975        int32_t expectedBreak = BreakIterator::DONE;
976        for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) {
977            if (t->expectedBreaks->elementAti(j) != 0) {
978                expectedBreak = j;
979                break;
980            }
981        }
982        if (expectedBreak != actualBreak) {
983            errln("following(%d) incorrect. File line,col= %4d,%4d\n"
984                  "        Expected, Actual= %d, %d",
985                  i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
986        }
987    }
988
989    // Check preceding()
990    for (i=t->expectedBreaks->size(); i>=0; i--) {
991        int32_t actualBreak = t->bi->preceding(i);
992        int32_t expectedBreak = BreakIterator::DONE;
993
994        for (int32_t j=i-1; j >= 0; j--) {
995            if (t->expectedBreaks->elementAti(j) != 0) {
996                expectedBreak = j;
997                break;
998            }
999        }
1000        if (expectedBreak != actualBreak) {
1001            errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1002                  "        Expected, Actual= %d, %d",
1003                  i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
1004        }
1005    }
1006}
1007
1008
1009void RBBITest::TestExtended() {
1010#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1011    UErrorCode      status  = U_ZERO_ERROR;
1012    Locale          locale("");
1013
1014    UnicodeString       rules;
1015    TestParams          tp;
1016    tp.bi             = NULL;
1017    tp.expectedBreaks = new UVector32(status);
1018    tp.srcLine        = new UVector32(status);
1019    tp.srcCol         = new UVector32(status);
1020
1021    RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
1022    if (U_FAILURE(status)) {
1023        dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1024    }
1025
1026
1027    //
1028    //  Open and read the test data file.
1029    //
1030    const char *testDataDirectory = IntlTest::getSourceTestData(status);
1031    char testFileName[1000];
1032    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1033        errln("Can't open test data.  Path too long.");
1034        return;
1035    }
1036    strcpy(testFileName, testDataDirectory);
1037    strcat(testFileName, "rbbitst.txt");
1038
1039    int    len;
1040    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1041    if (U_FAILURE(status)) {
1042        return; /* something went wrong, error already output */
1043    }
1044
1045
1046
1047
1048    //
1049    //  Put the test data into a UnicodeString
1050    //
1051    UnicodeString testString(FALSE, testFile, len);
1052
1053    enum EParseState{
1054        PARSE_COMMENT,
1055        PARSE_TAG,
1056        PARSE_DATA,
1057        PARSE_NUM
1058    }
1059    parseState = PARSE_TAG;
1060
1061    EParseState savedState = PARSE_TAG;
1062
1063    static const UChar CH_LF        = 0x0a;
1064    static const UChar CH_CR        = 0x0d;
1065    static const UChar CH_HASH      = 0x23;
1066    /*static const UChar CH_PERIOD    = 0x2e;*/
1067    static const UChar CH_LT        = 0x3c;
1068    static const UChar CH_GT        = 0x3e;
1069    static const UChar CH_BACKSLASH = 0x5c;
1070    static const UChar CH_BULLET    = 0x2022;
1071
1072    int32_t    lineNum  = 1;
1073    int32_t    colStart = 0;
1074    int32_t    column   = 0;
1075    int32_t    charIdx  = 0;
1076
1077    int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1078
1079    for (charIdx = 0; charIdx < len; ) {
1080        status = U_ZERO_ERROR;
1081        UChar  c = testString.charAt(charIdx);
1082        charIdx++;
1083        if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1084            // treat CRLF as a unit
1085            c = CH_LF;
1086            charIdx++;
1087        }
1088        if (c == CH_LF || c == CH_CR) {
1089            lineNum++;
1090            colStart = charIdx;
1091        }
1092        column = charIdx - colStart + 1;
1093
1094        switch (parseState) {
1095        case PARSE_COMMENT:
1096            if (c == 0x0a || c == 0x0d) {
1097                parseState = savedState;
1098            }
1099            break;
1100
1101        case PARSE_TAG:
1102            {
1103            if (c == CH_HASH) {
1104                parseState = PARSE_COMMENT;
1105                savedState = PARSE_TAG;
1106                break;
1107            }
1108            if (u_isUWhiteSpace(c)) {
1109                break;
1110            }
1111            if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1112                delete tp.bi;
1113                tp.bi = BreakIterator::createWordInstance(locale,  status);
1114                charIdx += 5;
1115                break;
1116            }
1117            if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1118                delete tp.bi;
1119                tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1120                charIdx += 5;
1121                break;
1122            }
1123            if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1124                delete tp.bi;
1125                tp.bi = BreakIterator::createLineInstance(locale,  status);
1126                charIdx += 5;
1127                break;
1128            }
1129            if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1130                delete tp.bi;
1131                tp.bi = NULL;
1132                tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1133                charIdx += 5;
1134                break;
1135            }
1136            if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1137                delete tp.bi;
1138                tp.bi = BreakIterator::createTitleInstance(locale,  status);
1139                charIdx += 6;
1140                break;
1141            }
1142
1143            // <locale  loc_name>
1144            localeMatcher.reset(testString);
1145            if (localeMatcher.lookingAt(charIdx-1, status)) {
1146                UnicodeString localeName = localeMatcher.group(1, status);
1147                char localeName8[100];
1148                localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1149                locale = Locale::createFromName(localeName8);
1150                charIdx += localeMatcher.group(0, status).length() - 1;
1151                TEST_ASSERT_SUCCESS(status);
1152                break;
1153            }
1154            if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1155                parseState = PARSE_DATA;
1156                charIdx += 5;
1157                tp.dataToBreak = "";
1158                tp.expectedBreaks->removeAllElements();
1159                tp.srcCol ->removeAllElements();
1160                tp.srcLine->removeAllElements();
1161                break;
1162            }
1163
1164            errln("line %d: Tag expected in test file.", lineNum);
1165            parseState = PARSE_COMMENT;
1166            savedState = PARSE_DATA;
1167            goto end_test; // Stop the test.
1168            }
1169            break;
1170
1171        case PARSE_DATA:
1172            if (c == CH_BULLET) {
1173                int32_t  breakIdx = tp.dataToBreak.length();
1174                tp.expectedBreaks->setSize(breakIdx+1);
1175                tp.expectedBreaks->setElementAt(-1, breakIdx);
1176                tp.srcLine->setSize(breakIdx+1);
1177                tp.srcLine->setElementAt(lineNum, breakIdx);
1178                tp.srcCol ->setSize(breakIdx+1);
1179                tp.srcCol ->setElementAt(column, breakIdx);
1180                break;
1181            }
1182
1183            if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1184                // Add final entry to mappings from break location to source file position.
1185                //  Need one extra because last break position returned is after the
1186                //    last char in the data, not at the last char.
1187                tp.srcLine->addElement(lineNum, status);
1188                tp.srcCol ->addElement(column, status);
1189
1190                parseState = PARSE_TAG;
1191                charIdx += 6;
1192
1193                // RUN THE TEST!
1194                executeTest(&tp);
1195                break;
1196            }
1197
1198            if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1199                // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1200                // Get the code point from the name and insert it into the test data.
1201                //   (Damn, no API takes names in Unicode  !!!
1202                //    we've got to take it back to char *)
1203                int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1204                int32_t nameLength = nameEndIdx - (charIdx+2);
1205                char charNameBuf[200];
1206                UChar32 theChar = -1;
1207                if (nameEndIdx != -1) {
1208                    UErrorCode status = U_ZERO_ERROR;
1209                    testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1210                    charNameBuf[sizeof(charNameBuf)-1] = 0;
1211                    theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1212                    if (U_FAILURE(status)) {
1213                        theChar = -1;
1214                    }
1215                }
1216                if (theChar == -1) {
1217                    errln("Error in named character in test file at line %d, col %d",
1218                        lineNum, column);
1219                } else {
1220                    // Named code point was recognized.  Insert it
1221                    //   into the test data.
1222                    tp.dataToBreak.append(theChar);
1223                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
1224                        tp.srcLine->addElement(lineNum, status);
1225                        tp.srcCol ->addElement(column, status);
1226                    }
1227                }
1228                if (nameEndIdx > charIdx) {
1229                    charIdx = nameEndIdx+1;
1230
1231                }
1232                break;
1233            }
1234
1235
1236
1237
1238            if (testString.compare(charIdx-1, 2, "<>") == 0) {
1239                charIdx++;
1240                int32_t  breakIdx = tp.dataToBreak.length();
1241                tp.expectedBreaks->setSize(breakIdx+1);
1242                tp.expectedBreaks->setElementAt(-1, breakIdx);
1243                tp.srcLine->setSize(breakIdx+1);
1244                tp.srcLine->setElementAt(lineNum, breakIdx);
1245                tp.srcCol ->setSize(breakIdx+1);
1246                tp.srcCol ->setElementAt(column, breakIdx);
1247                break;
1248            }
1249
1250            if (c == CH_LT) {
1251                tagValue   = 0;
1252                parseState = PARSE_NUM;
1253                break;
1254            }
1255
1256            if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1257                parseState = PARSE_COMMENT;
1258                savedState = PARSE_DATA;
1259                break;
1260            }
1261
1262            if (c == CH_BACKSLASH) {
1263                // Check for \ at end of line, a line continuation.
1264                //     Advance over (discard) the newline
1265                UChar32 cp = testString.char32At(charIdx);
1266                if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1267                    // We have a CR LF
1268                    //  Need an extra increment of the input ptr to move over both of them
1269                    charIdx++;
1270                }
1271                if (cp == CH_LF || cp == CH_CR) {
1272                    lineNum++;
1273                    colStart = charIdx;
1274                    charIdx++;
1275                    break;
1276                }
1277
1278                // Let unescape handle the back slash.
1279                cp = testString.unescapeAt(charIdx);
1280                if (cp != -1) {
1281                    // Escape sequence was recognized.  Insert the char
1282                    //   into the test data.
1283                    tp.dataToBreak.append(cp);
1284                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
1285                        tp.srcLine->addElement(lineNum, status);
1286                        tp.srcCol ->addElement(column, status);
1287                    }
1288                    break;
1289                }
1290
1291
1292                // Not a recognized backslash escape sequence.
1293                // Take the next char as a literal.
1294                //  TODO:  Should this be an error?
1295                c = testString.charAt(charIdx);
1296                charIdx = testString.moveIndex32(charIdx, 1);
1297            }
1298
1299            // Normal, non-escaped data char.
1300            tp.dataToBreak.append(c);
1301
1302            // Save the mapping from offset in the data to line/column numbers in
1303            //   the original input file.  Will be used for better error messages only.
1304            //   If there's an expected break before this char, the slot in the mapping
1305            //     vector will already be set for this char; don't overwrite it.
1306            if (tp.dataToBreak.length() > tp.srcLine->size()) {
1307                tp.srcLine->addElement(lineNum, status);
1308                tp.srcCol ->addElement(column, status);
1309            }
1310            break;
1311
1312
1313        case PARSE_NUM:
1314            // We are parsing an expected numeric tag value, like <1234>,
1315            //   within a chunk of data.
1316            if (u_isUWhiteSpace(c)) {
1317                break;
1318            }
1319
1320            if (c == CH_GT) {
1321                // Finished the number.  Add the info to the expected break data,
1322                //   and switch parse state back to doing plain data.
1323                parseState = PARSE_DATA;
1324                if (tagValue == 0) {
1325                    tagValue = -1;
1326                }
1327                int32_t  breakIdx = tp.dataToBreak.length();
1328                tp.expectedBreaks->setSize(breakIdx+1);
1329                tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1330                tp.srcLine->setSize(breakIdx+1);
1331                tp.srcLine->setElementAt(lineNum, breakIdx);
1332                tp.srcCol ->setSize(breakIdx+1);
1333                tp.srcCol ->setElementAt(column, breakIdx);
1334                break;
1335            }
1336
1337            if (u_isdigit(c)) {
1338                tagValue = tagValue*10 + u_charDigitValue(c);
1339                break;
1340            }
1341
1342            errln("Syntax Error in test file at line %d, col %d",
1343                lineNum, column);
1344            parseState = PARSE_COMMENT;
1345            goto end_test; // Stop the test
1346            break;
1347        }
1348
1349
1350        if (U_FAILURE(status)) {
1351            dataerrln("ICU Error %s while parsing test file at line %d.",
1352                u_errorName(status), lineNum);
1353            status = U_ZERO_ERROR;
1354            goto end_test; // Stop the test
1355        }
1356
1357    }
1358
1359end_test:
1360    delete tp.bi;
1361    delete tp.expectedBreaks;
1362    delete tp.srcLine;
1363    delete tp.srcCol;
1364    delete [] testFile;
1365#endif
1366}
1367
1368
1369//-------------------------------------------------------------------------------
1370//
//  TestDictRules   create a break iterator from source rules that includes a
//                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
//                  but the dictionary code, without a type, would loop.
1375//
1376//-------------------------------------------------------------------------------
1377void RBBITest::TestDictRules() {
1378    const char *rules =  "$dictionary = [a-z]; \n"
1379                         "!!forward; \n"
1380                         "$dictionary $dictionary; \n"
1381                         "!!reverse; \n"
1382                         "$dictionary $dictionary; \n";
1383    const char *text = "aa";
1384    UErrorCode status = U_ZERO_ERROR;
1385    UParseError parseError;
1386
1387    RuleBasedBreakIterator bi(rules, parseError, status);
1388    if (U_SUCCESS(status)) {
1389        UnicodeString utext = text;
1390        bi.setText(utext);
1391        int32_t position;
1392        int32_t loops;
1393        for (loops = 0; loops<10; loops++) {
1394            position = bi.next();
1395            if (position == RuleBasedBreakIterator::DONE) {
1396                break;
1397            }
1398        }
1399        TEST_ASSERT(loops == 1);
1400    } else {
1401        dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1402    }
1403}
1404
1405
1406
1407//-------------------------------------------------------------------------------
1408//
//    ReadAndConvertFile   Read a text data file, convert it to UChars, and
//    return the data in one big UChar * buffer, which the caller must delete.
//
//    parameters:
//          fileName:   the path of the file to read.  It is handed to fopen() as given,
//                      so the caller supplies any directory part.
1415//          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1416//          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1417//                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1418//                      Pass NULL for the system default encoding.
1419//          status
1420//    returns:
1421//                      The file data, converted to UChar.
1422//                      The caller must delete this when done with
1423//                           delete [] theBuffer;
1424//
1425//    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1426//           Move this function to some common place.
1427//
1428//--------------------------------------------------------------------------------
1429UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1430    UChar       *retPtr  = NULL;
1431    char        *fileBuf = NULL;
1432    UConverter* conv     = NULL;
1433    FILE        *f       = NULL;
1434
1435    ulen = 0;
1436    if (U_FAILURE(status)) {
1437        return retPtr;
1438    }
1439
1440    //
1441    //  Open the file.
1442    //
1443    f = fopen(fileName, "rb");
1444    if (f == 0) {
1445        dataerrln("Error opening test data file %s\n", fileName);
1446        status = U_FILE_ACCESS_ERROR;
1447        return NULL;
1448    }
1449    //
1450    //  Read it in
1451    //
1452    int   fileSize;
1453    int   amt_read;
1454
1455    fseek( f, 0, SEEK_END);
1456    fileSize = ftell(f);
1457    fileBuf = new char[fileSize];
1458    fseek(f, 0, SEEK_SET);
1459    amt_read = fread(fileBuf, 1, fileSize, f);
1460    if (amt_read != fileSize || fileSize <= 0) {
1461        errln("Error reading test data file.");
1462        goto cleanUpAndReturn;
1463    }
1464
1465    //
1466    // Look for a Unicode Signature (BOM) on the data just read
1467    //
1468    int32_t        signatureLength;
1469    const char *   fileBufC;
1470    const char*    bomEncoding;
1471
1472    fileBufC = fileBuf;
1473    bomEncoding = ucnv_detectUnicodeSignature(
1474        fileBuf, fileSize, &signatureLength, &status);
1475    if(bomEncoding!=NULL ){
1476        fileBufC  += signatureLength;
1477        fileSize  -= signatureLength;
1478        encoding = bomEncoding;
1479    }
1480
1481    //
1482    // Open a converter to take the rule file to UTF-16
1483    //
1484    conv = ucnv_open(encoding, &status);
1485    if (U_FAILURE(status)) {
1486        goto cleanUpAndReturn;
1487    }
1488
1489    //
1490    // Convert the rules to UChar.
1491    //  Preflight first to determine required buffer size.
1492    //
1493    ulen = ucnv_toUChars(conv,
1494        NULL,           //  dest,
1495        0,              //  destCapacity,
1496        fileBufC,
1497        fileSize,
1498        &status);
1499    if (status == U_BUFFER_OVERFLOW_ERROR) {
1500        // Buffer Overflow is expected from the preflight operation.
1501        status = U_ZERO_ERROR;
1502
1503        retPtr = new UChar[ulen+1];
1504        ucnv_toUChars(conv,
1505            retPtr,       //  dest,
1506            ulen+1,
1507            fileBufC,
1508            fileSize,
1509            &status);
1510    }
1511
1512cleanUpAndReturn:
1513    fclose(f);
1514    delete []fileBuf;
1515    ucnv_close(conv);
1516    if (U_FAILURE(status)) {
1517        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1518        delete []retPtr;
1519        retPtr = 0;
1520        ulen   = 0;
1521    };
1522    return retPtr;
1523}
1524
1525
1526
1527//--------------------------------------------------------------------------------------------
1528//
1529//   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1530//
1531//-------------------------------------------------------------------------------------------
1532void RBBITest::TestUnicodeFiles() {
1533    RuleBasedBreakIterator  *bi;
1534    UErrorCode               status = U_ZERO_ERROR;
1535
1536    bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1537    TEST_ASSERT_SUCCESS(status);
1538    if (U_SUCCESS(status)) {
1539        runUnicodeTestData("GraphemeBreakTest.txt", bi);
1540    }
1541    delete bi;
1542
1543    bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1544    TEST_ASSERT_SUCCESS(status);
1545    if (U_SUCCESS(status)) {
1546        runUnicodeTestData("WordBreakTest.txt", bi);
1547    }
1548    delete bi;
1549
1550    bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1551    TEST_ASSERT_SUCCESS(status);
1552    if (U_SUCCESS(status)) {
1553        runUnicodeTestData("SentenceBreakTest.txt", bi);
1554    }
1555    delete bi;
1556
1557    bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1558    TEST_ASSERT_SUCCESS(status);
1559    if (U_SUCCESS(status)) {
1560        runUnicodeTestData("LineBreakTest.txt", bi);
1561    }
1562    delete bi;
1563}
1564
1565
1566//--------------------------------------------------------------------------------------------
1567//
1568//   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1569//
1570//-------------------------------------------------------------------------------------------
1571void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1572#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1573    // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
1574    UBool isTicket7270Fixed = isICUVersionAtLeast(52, 1);
1575    UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
1576    UErrorCode  status = U_ZERO_ERROR;
1577
1578    //
1579    //  Open and read the test data file, put it into a UnicodeString.
1580    //
1581    const char *testDataDirectory = IntlTest::getSourceTestData(status);
1582    char testFileName[1000];
1583    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1584        dataerrln("Can't open test data.  Path too long.");
1585        return;
1586    }
1587    strcpy(testFileName, testDataDirectory);
1588    strcat(testFileName, fileName);
1589
1590    logln("Opening data file %s\n", fileName);
1591
1592    int    len;
1593    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1594    if (status != U_FILE_ACCESS_ERROR) {
1595        TEST_ASSERT_SUCCESS(status);
1596        TEST_ASSERT(testFile != NULL);
1597    }
1598    if (U_FAILURE(status) || testFile == NULL) {
1599        return; /* something went wrong, error already output */
1600    }
1601    UnicodeString testFileAsString(TRUE, testFile, len);
1602
1603    //
1604    //  Parse the test data file using a regular expression.
1605    //  Each kind of token is recognized in its own capture group; what type of item was scanned
1606    //     is identified by which group had a match.
1607    //
1608    //    Caputure Group #                  1          2            3            4           5
1609    //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1610    //
1611    UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1612    RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1613    UnicodeString   testString;
1614    UVector32       breakPositions(status);
1615    int             lineNumber = 1;
1616    TEST_ASSERT_SUCCESS(status);
1617    if (U_FAILURE(status)) {
1618        return;
1619    }
1620
1621    //
1622    //  Scan through each test case, building up the string to be broken in testString,
1623    //   and the positions that should be boundaries in the breakPositions vector.
1624    //
1625    int spin = 0;
1626    while (tokenMatcher.find()) {
1627      	if(tokenMatcher.hitEnd()) {
1628          /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1629             This occurred when the text file was corrupt (wasn't marked as UTF-8)
1630             and caused an infinite loop here on EBCDIC systems!
1631          */
1632          fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1633          //	   return;
1634      	}
1635        if (tokenMatcher.start(1, status) >= 0) {
1636            // Scanned a divide sign, indicating a break position in the test data.
1637            if (testString.length()>0) {
1638                breakPositions.addElement(testString.length(), status);
1639            }
1640        }
1641        else if (tokenMatcher.start(2, status) >= 0) {
1642            // Scanned an 'x', meaning no break at this position in the test data
1643            //   Nothing to be done here.
1644            }
1645        else if (tokenMatcher.start(3, status) >= 0) {
1646            // Scanned Hex digits.  Convert them to binary, append to the character data string.
1647            const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1648            int length = hexNumber.length();
1649            if (length<=8) {
1650                char buf[10];
1651                hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1652                UChar32 c = (UChar32)strtol(buf, NULL, 16);
1653                if (c<=0x10ffff) {
1654                    testString.append(c);
1655                } else {
1656                    errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1657                       fileName, lineNumber);
1658                }
1659            } else {
1660                errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1661                       fileName, lineNumber);
1662             }
1663        }
1664        else if (tokenMatcher.start(4, status) >= 0) {
1665            // Scanned to end of a line, possibly skipping over a comment in the process.
1666            //   If the line from the file contained test data, run the test now.
1667            //
1668            if (testString.length() > 0) {
1669// TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
1670//             Rule 8
1671//                ZW SP* <break>
1672//             is not yet implemented.
1673if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber ||
1674                                            5202 == lineNumber ||
1675                                            5214 == lineNumber ||
1676                                            5246 == lineNumber ||
1677                                            5298 == lineNumber ||
1678                                            5302 == lineNumber ))) {
1679                checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1680}
1681            }
1682
1683            // Clear out this test case.
1684            //    The string and breakPositions vector will be refilled as the next
1685            //       test case is parsed.
1686            testString.remove();
1687            breakPositions.removeAllElements();
1688            lineNumber++;
1689        } else {
1690            // Scanner catchall.  Something unrecognized appeared on the line.
1691            char token[16];
1692            UnicodeString uToken = tokenMatcher.group(0, status);
1693            uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1694            token[sizeof(token)-1] = 0;
1695            errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1696
1697            // Clean up, in preparation for continuing with the next line.
1698            testString.remove();
1699            breakPositions.removeAllElements();
1700            lineNumber++;
1701        }
1702        TEST_ASSERT_SUCCESS(status);
1703        if (U_FAILURE(status)) {
1704            break;
1705        }
1706    }
1707
1708    delete [] testFile;
1709 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1710}
1711
1712//--------------------------------------------------------------------------------------------
1713//
1714//   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1715//                            test data files.  Do only a simple, forward-only check -
1716//                            this test is mostly to check that ICU and the Unicode
1717//                            data agree with each other.
1718//
1719//--------------------------------------------------------------------------------------------
1720void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1721                         const UnicodeString &testString,   // Text data to be broken
1722                         UVector32 *breakPositions,         // Positions where breaks should be found.
1723                         RuleBasedBreakIterator *bi) {
1724    int32_t pos;                 // Break Position in the test string
1725    int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1726    int32_t expectedPos;         // Expected break position (index into test string)
1727
1728    bi->setText(testString);
1729    pos = bi->first();
1730    pos = bi->next();
1731
1732    while (pos != BreakIterator::DONE) {
1733        if (expectedI >= breakPositions->size()) {
1734            errln("Test file \"%s\", line %d, unexpected break found at position %d",
1735                testFileName, lineNumber, pos);
1736            break;
1737        }
1738        expectedPos = breakPositions->elementAti(expectedI);
1739        if (pos < expectedPos) {
1740            errln("Test file \"%s\", line %d, unexpected break found at position %d",
1741                testFileName, lineNumber, pos);
1742            break;
1743        }
1744        if (pos > expectedPos) {
1745            errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1746                testFileName, lineNumber, expectedPos);
1747            break;
1748        }
1749        pos = bi->next();
1750        expectedI++;
1751    }
1752
1753    if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1754        errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1755            testFileName, lineNumber, breakPositions->elementAti(expectedI));
1756    }
1757}
1758
1759
1760
1761#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1762//---------------------------------------------------------------------------------------
1763//
//   class RBBIMonkeyKind
1765//
1766//      Monkey Test for Break Iteration
1767//      Abstract interface class.   Concrete derived classes independently
1768//      implement the break rules for different iterator types.
1769//
//      The Monkey Test itself doesn't know which type of break iterator it is
1771//      testing, but works purely in terms of the interface defined here.
1772//
1773//---------------------------------------------------------------------------------------
1774class RBBIMonkeyKind {
1775public:
1776    // Return a UVector of UnicodeSets, representing the character classes used
1777    //   for this type of iterator.
1778    virtual  UVector  *charClasses() = 0;
1779
1780    // Set the test text on which subsequent calls to next() will operate
1781    virtual  void      setText(const UnicodeString &s) = 0;
1782
1783    // Find the next break postion, starting from the prev break position, or from zero.
1784    // Return -1 after reaching end of string.
1785    virtual  int32_t   next(int32_t i) = 0;
1786
1787    virtual ~RBBIMonkeyKind();
1788    UErrorCode       deferredStatus;
1789
1790
1791protected:
1792    RBBIMonkeyKind();
1793
1794private:
1795};
1796
1797RBBIMonkeyKind::RBBIMonkeyKind() {
1798    deferredStatus = U_ZERO_ERROR;
1799}
1800
1801RBBIMonkeyKind::~RBBIMonkeyKind() {
1802}
1803
1804
1805//----------------------------------------------------------------------------------------
1806//
1807//   Random Numbers.  Similar to standard lib rand() and srand()
1808//                    Not using library to
1809//                      1.  Get same results on all platforms.
1810//                      2.  Get access to current seed, to more easily reproduce failures.
1811//
1812//---------------------------------------------------------------------------------------
// Generator state.  Each call to m_rand() replaces it with the next state.
static uint32_t m_seed = 1;

// Step the linear congruential generator once and return bits 16..30 of the
// new state, i.e. a value in the range 0..32767.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;    // unsigned arithmetic, wraps modulo 2^32
    return (m_seed >> 16) & 0x7FFF;
}
1820
1821
1822//------------------------------------------------------------------------------------------
1823//
1824//   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1825//                             of RBBIMonkeyKind.
1826//
1827//------------------------------------------------------------------------------------------
1828class RBBICharMonkey: public RBBIMonkeyKind {
1829public:
1830    RBBICharMonkey();
1831    virtual          ~RBBICharMonkey();
1832    virtual  UVector *charClasses();
1833    virtual  void     setText(const UnicodeString &s);
1834    virtual  int32_t  next(int32_t i);
1835private:
1836    UVector   *fSets;
1837
1838    UnicodeSet  *fCRLFSet;
1839    UnicodeSet  *fControlSet;
1840    UnicodeSet  *fExtendSet;
1841    UnicodeSet  *fRegionalIndicatorSet;
1842    UnicodeSet  *fPrependSet;
1843    UnicodeSet  *fSpacingSet;
1844    UnicodeSet  *fLSet;
1845    UnicodeSet  *fVSet;
1846    UnicodeSet  *fTSet;
1847    UnicodeSet  *fLVSet;
1848    UnicodeSet  *fLVTSet;
1849    UnicodeSet  *fHangulSet;
1850    UnicodeSet  *fAnySet;
1851
1852    const UnicodeString *fText;
1853};
1854
1855
1856RBBICharMonkey::RBBICharMonkey() {
1857    UErrorCode  status = U_ZERO_ERROR;
1858
1859    fText = NULL;
1860
1861    fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1862    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
1863    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
1864    fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1865    fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1866    fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1867    fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1868    fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1869    fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1870    fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1871    fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1872    fHangulSet  = new UnicodeSet();
1873    fHangulSet->addAll(*fLSet);
1874    fHangulSet->addAll(*fVSet);
1875    fHangulSet->addAll(*fTSet);
1876    fHangulSet->addAll(*fLVSet);
1877    fHangulSet->addAll(*fLVTSet);
1878    fAnySet     = new UnicodeSet(0, 0x10ffff);
1879
1880    fSets       = new UVector(status);
1881    fSets->addElement(fCRLFSet,    status);
1882    fSets->addElement(fControlSet, status);
1883    fSets->addElement(fExtendSet,  status);
1884    fSets->addElement(fRegionalIndicatorSet, status);
1885    if (!fPrependSet->isEmpty()) {
1886        fSets->addElement(fPrependSet, status);
1887    }
1888    fSets->addElement(fSpacingSet, status);
1889    fSets->addElement(fHangulSet,  status);
1890    fSets->addElement(fAnySet,     status);
1891    if (U_FAILURE(status)) {
1892        deferredStatus = status;
1893    }
1894}
1895
1896
1897void RBBICharMonkey::setText(const UnicodeString &s) {
1898    fText = &s;
1899}
1900
1901
1902
1903int32_t RBBICharMonkey::next(int32_t prevPos) {
1904    int    p0, p1, p2, p3;    // Indices of the significant code points around the
1905                              //   break position being tested.  The candidate break
1906                              //   location is before p2.
1907
1908    int     breakPos = -1;
1909
1910    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1911
1912    if (U_FAILURE(deferredStatus)) {
1913        return -1;
1914    }
1915
1916    // Previous break at end of string.  return DONE.
1917    if (prevPos >= fText->length()) {
1918        return -1;
1919    }
1920    p0 = p1 = p2 = p3 = prevPos;
1921    c3 =  fText->char32At(prevPos);
1922    c0 = c1 = c2 = 0;
1923
1924    // Loop runs once per "significant" character position in the input text.
1925    for (;;) {
1926        // Move all of the positions forward in the input string.
1927        p0 = p1;  c0 = c1;
1928        p1 = p2;  c1 = c2;
1929        p2 = p3;  c2 = c3;
1930
1931        // Advancd p3 by one codepoint
1932        p3 = fText->moveIndex32(p3, 1);
1933        c3 = fText->char32At(p3);
1934
1935        if (p1 == p2) {
1936            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1937            continue;
1938        }
1939        if (p2 == fText->length()) {
1940            // Reached end of string.  Always a break position.
1941            break;
1942        }
1943
1944        // Rule  GB3   CR x LF
1945        //     No Extend or Format characters may appear between the CR and LF,
1946        //     which requires the additional check for p2 immediately following p1.
1947        //
1948        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1949            continue;
1950        }
1951
1952        // Rule (GB4).   ( Control | CR | LF ) <break>
1953        if (fControlSet->contains(c1) ||
1954            c1 == 0x0D ||
1955            c1 == 0x0A)  {
1956            break;
1957        }
1958
1959        // Rule (GB5)    <break>  ( Control | CR | LF )
1960        //
1961        if (fControlSet->contains(c2) ||
1962            c2 == 0x0D ||
1963            c2 == 0x0A)  {
1964            break;
1965        }
1966
1967
1968        // Rule (GB6)  L x ( L | V | LV | LVT )
1969        if (fLSet->contains(c1) &&
1970               (fLSet->contains(c2)  ||
1971                fVSet->contains(c2)  ||
1972                fLVSet->contains(c2) ||
1973                fLVTSet->contains(c2))) {
1974            continue;
1975        }
1976
1977        // Rule (GB7)    ( LV | V )  x  ( V | T )
1978        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1979            (fVSet->contains(c2) || fTSet->contains(c2)))  {
1980            continue;
1981        }
1982
1983        // Rule (GB8)    ( LVT | T)  x T
1984        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1985            fTSet->contains(c2))  {
1986            continue;
1987        }
1988
1989        // Just adding extra Apple rule does here not work, behavior depends on arbitrary context
1990
1991        // Rule (GB8a)    Regional_Indicator x Regional_Indicator
1992        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1993            continue;
1994        }
1995
1996        // Rule (GB9)    Numeric x ALetter
1997        if (fExtendSet->contains(c2))  {
1998            continue;
1999        }
2000
2001        // Rule (GB9a)   x  SpacingMark
2002        if (fSpacingSet->contains(c2)) {
2003            continue;
2004        }
2005
2006        // Rule (GB9b)   Prepend x
2007        if (fPrependSet->contains(c1)) {
2008            continue;
2009        }
2010
2011        // Rule (GB10)  Any  <break>  Any
2012        break;
2013    }
2014
2015    breakPos = p2;
2016    return breakPos;
2017}
2018
2019
2020
2021UVector  *RBBICharMonkey::charClasses() {
2022    return fSets;
2023}
2024
2025
2026RBBICharMonkey::~RBBICharMonkey() {
2027    delete fSets;
2028    delete fCRLFSet;
2029    delete fControlSet;
2030    delete fExtendSet;
2031    delete fRegionalIndicatorSet;
2032    delete fPrependSet;
2033    delete fSpacingSet;
2034    delete fLSet;
2035    delete fVSet;
2036    delete fTSet;
2037    delete fLVSet;
2038    delete fLVTSet;
2039    delete fHangulSet;
2040    delete fAnySet;
2041}
2042
2043//------------------------------------------------------------------------------------------
2044//
2045//   class RBBIWordMonkey      Word Break specific implementation
2046//                             of RBBIMonkeyKind.
2047//
2048//------------------------------------------------------------------------------------------
2049class RBBIWordMonkey: public RBBIMonkeyKind {
2050public:
2051    RBBIWordMonkey();
2052    virtual          ~RBBIWordMonkey();
2053    virtual  UVector *charClasses();
2054    virtual  void     setText(const UnicodeString &s);
2055    virtual int32_t   next(int32_t i);
2056private:
2057    UVector      *fSets;
2058
2059    UnicodeSet  *fCRSet;
2060    UnicodeSet  *fLFSet;
2061    UnicodeSet  *fNewlineSet;
2062    UnicodeSet  *fKatakanaSet;
2063    UnicodeSet  *fALetterSet;
2064    // TODO(jungshik): Do we still need this change?
2065    // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
2066    UnicodeSet  *fMidNumLetSet;
2067    UnicodeSet  *fMidLetterSet;
2068    UnicodeSet  *fMidNumSet;
2069    UnicodeSet  *fNumericSet;
2070    UnicodeSet  *fFormatSet;
2071    UnicodeSet  *fOtherSet;
2072    UnicodeSet  *fExtendSet;
2073    UnicodeSet  *fExtendNumLetSet;
2074    UnicodeSet  *fRegionalIndicatorSet;
2075    UnicodeSet  *fDictionaryCjkSet;
2076
2077    RegexMatcher  *fMatcher;
2078
2079    const UnicodeString  *fText;
2080};
2081
2082
2083RBBIWordMonkey::RBBIWordMonkey()
2084{
2085    UErrorCode  status = U_ZERO_ERROR;
2086
2087    fSets            = new UVector(status);
2088
2089    fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2090    fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2091    fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2092    fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2093    // Exclude Hangul syllables from ALetterSet during testing.
2094    // Leave CJK dictionary characters out from the monkey tests!
2095#if 0
2096    fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
2097                                      "[\\p{Line_Break = Complex_Context}"
2098                                      "-\\p{Grapheme_Cluster_Break = Extend}"
2099                                      "-\\p{Grapheme_Cluster_Break = Control}"
2100                                      "]]",
2101                                      status);
2102#endif
2103    fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2104    fALetterSet->removeAll(*fDictionaryCjkSet);
2105    fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2106    fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2107    fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2108    fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2109    // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2110    // we should figure out why
2111    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2112    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2113    fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2114    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2115    fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2116
2117    fOtherSet        = new UnicodeSet();
2118    if(U_FAILURE(status)) {
2119      deferredStatus = status;
2120      return;
2121    }
2122
2123    fOtherSet->complement();
2124    fOtherSet->removeAll(*fCRSet);
2125    fOtherSet->removeAll(*fLFSet);
2126    fOtherSet->removeAll(*fNewlineSet);
2127    fOtherSet->removeAll(*fKatakanaSet);
2128    fOtherSet->removeAll(*fALetterSet);
2129    fOtherSet->removeAll(*fMidLetterSet);
2130    fOtherSet->removeAll(*fMidNumSet);
2131    fOtherSet->removeAll(*fNumericSet);
2132    fOtherSet->removeAll(*fExtendNumLetSet);
2133    fOtherSet->removeAll(*fFormatSet);
2134    fOtherSet->removeAll(*fExtendSet);
2135    fOtherSet->removeAll(*fRegionalIndicatorSet);
2136    // Inhibit dictionary characters from being tested at all.
2137    fOtherSet->removeAll(*fDictionaryCjkSet);
2138    fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2139
2140    fSets->addElement(fCRSet,        status);
2141    fSets->addElement(fLFSet,        status);
2142    fSets->addElement(fNewlineSet,   status);
2143    fSets->addElement(fALetterSet,   status);
2144    //fSets->addElement(fKatakanaSet,  status); //TODO: work out how to test katakana
2145    fSets->addElement(fMidLetterSet, status);
2146    fSets->addElement(fMidNumLetSet, status);
2147    fSets->addElement(fMidNumSet,    status);
2148    fSets->addElement(fNumericSet,   status);
2149    fSets->addElement(fFormatSet,    status);
2150    fSets->addElement(fExtendSet,    status);
2151    fSets->addElement(fOtherSet,     status);
2152    fSets->addElement(fExtendNumLetSet, status);
2153    fSets->addElement(fRegionalIndicatorSet, status);
2154
2155    if (U_FAILURE(status)) {
2156        deferredStatus = status;
2157    }
2158}
2159
2160void RBBIWordMonkey::setText(const UnicodeString &s) {
2161    fText       = &s;
2162}
2163
2164
2165int32_t RBBIWordMonkey::next(int32_t prevPos) {
2166    int    p0, p1, p2, p3;    // Indices of the significant code points around the
2167                              //   break position being tested.  The candidate break
2168                              //   location is before p2.
2169
2170    int     breakPos = -1;
2171
2172    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2173
2174    if (U_FAILURE(deferredStatus)) {
2175        return -1;
2176    }
2177
2178    // Prev break at end of string.  return DONE.
2179    if (prevPos >= fText->length()) {
2180        return -1;
2181    }
2182    p0 = p1 = p2 = p3 = prevPos;
2183    c3 =  fText->char32At(prevPos);
2184    c0 = c1 = c2 = 0;
2185
2186    // Loop runs once per "significant" character position in the input text.
2187    for (;;) {
2188        // Move all of the positions forward in the input string.
2189        p0 = p1;  c0 = c1;
2190        p1 = p2;  c1 = c2;
2191        p2 = p3;  c2 = c3;
2192
2193        // Advancd p3 by    X(Extend | Format)*   Rule 4
2194        //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2195        do {
2196            p3 = fText->moveIndex32(p3, 1);
2197            c3 = fText->char32At(p3);
2198            if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2199               break;
2200            };
2201        }
2202        while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2203
2204
2205        if (p1 == p2) {
2206            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2207            continue;
2208        }
2209        if (p2 == fText->length()) {
2210            // Reached end of string.  Always a break position.
2211            break;
2212        }
2213
2214        // Rule  (3)   CR x LF
2215        //     No Extend or Format characters may appear between the CR and LF,
2216        //     which requires the additional check for p2 immediately following p1.
2217        //
2218        if (c1==0x0D && c2==0x0A) {
2219            continue;
2220        }
2221
2222        // Rule (3a)  Break before and after newlines (including CR and LF)
2223        //
2224        if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2225            break;
2226        };
2227        if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2228            break;
2229        };
2230
2231        // Rule (5).   ALetter x ALetter
2232        if (fALetterSet->contains(c1) &&
2233            fALetterSet->contains(c2))  {
2234            continue;
2235        }
2236
2237        // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
2238        //
2239        if ( fALetterSet->contains(c1)   &&
2240             (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
2241             fALetterSet->contains(c3)) {
2242            continue;
2243        }
2244
2245
2246        // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
2247        if (fALetterSet->contains(c0) &&
2248            (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
2249            fALetterSet->contains(c2)) {
2250            continue;
2251        }
2252
2253        // Rule (8)    Numeric x Numeric
2254        if (fNumericSet->contains(c1) &&
2255            fNumericSet->contains(c2))  {
2256            continue;
2257        }
2258
2259        // Rule (9)    ALetter x Numeric
2260        if (fALetterSet->contains(c1) &&
2261            fNumericSet->contains(c2))  {
2262            continue;
2263        }
2264
2265        // Rule (10)    Numeric x ALetter
2266        if (fNumericSet->contains(c1) &&
2267            fALetterSet->contains(c2))  {
2268            continue;
2269        }
2270
2271        // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
2272        if (fNumericSet->contains(c0) &&
2273            (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
2274            fNumericSet->contains(c2)) {
2275            continue;
2276        }
2277
2278        // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
2279        if (fNumericSet->contains(c1) &&
2280            (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
2281            fNumericSet->contains(c3)) {
2282            continue;
2283        }
2284
2285        // Rule (13)  Katakana x Katakana
2286        if (fKatakanaSet->contains(c1) &&
2287            fKatakanaSet->contains(c2))  {
2288            continue;
2289        }
2290
2291        // Rule 13a
2292        if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2293             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2294             fExtendNumLetSet->contains(c2)) {
2295                continue;
2296        }
2297
2298        // Rule 13b
2299        if (fExtendNumLetSet->contains(c1) &&
2300                (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2301                fKatakanaSet->contains(c2)))  {
2302                continue;
2303        }
2304
2305        // Rule 13c
2306        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2307            continue;
2308        }
2309
2310        // Rule 14.  Break found here.
2311        break;
2312    }
2313
2314    breakPos = p2;
2315    return breakPos;
2316}
2317
2318
2319UVector  *RBBIWordMonkey::charClasses() {
2320    return fSets;
2321}
2322
2323
2324RBBIWordMonkey::~RBBIWordMonkey() {
2325    delete fSets;
2326    delete fCRSet;
2327    delete fLFSet;
2328    delete fNewlineSet;
2329    delete fKatakanaSet;
2330    delete fALetterSet;
2331    delete fMidNumLetSet;
2332    delete fMidLetterSet;
2333    delete fMidNumSet;
2334    delete fNumericSet;
2335    delete fFormatSet;
2336    delete fExtendSet;
2337    delete fExtendNumLetSet;
2338    delete fRegionalIndicatorSet;
2339    delete fDictionaryCjkSet;
2340    delete fOtherSet;
2341}
2342
2343
2344
2345
2346//------------------------------------------------------------------------------------------
2347//
2348//   class RBBISentMonkey      Sentence Break specific implementation
2349//                             of RBBIMonkeyKind.
2350//
2351//------------------------------------------------------------------------------------------
2352class RBBISentMonkey: public RBBIMonkeyKind {
2353public:
2354    RBBISentMonkey();
2355    virtual          ~RBBISentMonkey();
2356    virtual  UVector *charClasses();
2357    virtual  void     setText(const UnicodeString &s);
2358    virtual int32_t   next(int32_t i);
2359private:
2360    int               moveBack(int posFrom);
2361    int               moveForward(int posFrom);
2362    UChar32           cAt(int pos);
2363
2364    UVector      *fSets;
2365
2366    UnicodeSet  *fSepSet;
2367    UnicodeSet  *fFormatSet;
2368    UnicodeSet  *fSpSet;
2369    UnicodeSet  *fLowerSet;
2370    UnicodeSet  *fUpperSet;
2371    UnicodeSet  *fOLetterSet;
2372    UnicodeSet  *fNumericSet;
2373    UnicodeSet  *fATermSet;
2374    UnicodeSet  *fSContinueSet;
2375    UnicodeSet  *fSTermSet;
2376    UnicodeSet  *fCloseSet;
2377    UnicodeSet  *fOtherSet;
2378    UnicodeSet  *fExtendSet;
2379
2380    const UnicodeString  *fText;
2381
2382};
2383
2384RBBISentMonkey::RBBISentMonkey()
2385{
2386    UErrorCode  status = U_ZERO_ERROR;
2387
2388    fSets            = new UVector(status);
2389
2390    //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2391    //                       set and made into character classes of their own.  For the monkey impl,
2392    //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2393    fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2394    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2395    fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2396    fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2397    fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2398    fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2399    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2400    fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2401    fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2402    fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2403    fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2404    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2405    fOtherSet        = new UnicodeSet();
2406
2407    if(U_FAILURE(status)) {
2408      deferredStatus = status;
2409      return;
2410    }
2411
2412    fOtherSet->complement();
2413    fOtherSet->removeAll(*fSepSet);
2414    fOtherSet->removeAll(*fFormatSet);
2415    fOtherSet->removeAll(*fSpSet);
2416    fOtherSet->removeAll(*fLowerSet);
2417    fOtherSet->removeAll(*fUpperSet);
2418    fOtherSet->removeAll(*fOLetterSet);
2419    fOtherSet->removeAll(*fNumericSet);
2420    fOtherSet->removeAll(*fATermSet);
2421    fOtherSet->removeAll(*fSContinueSet);
2422    fOtherSet->removeAll(*fSTermSet);
2423    fOtherSet->removeAll(*fCloseSet);
2424    fOtherSet->removeAll(*fExtendSet);
2425
2426    fSets->addElement(fSepSet,       status);
2427    fSets->addElement(fFormatSet,    status);
2428    fSets->addElement(fSpSet,        status);
2429    fSets->addElement(fLowerSet,     status);
2430    fSets->addElement(fUpperSet,     status);
2431    fSets->addElement(fOLetterSet,   status);
2432    fSets->addElement(fNumericSet,   status);
2433    fSets->addElement(fATermSet,     status);
2434    fSets->addElement(fSContinueSet, status);
2435    fSets->addElement(fSTermSet,     status);
2436    fSets->addElement(fCloseSet,     status);
2437    fSets->addElement(fOtherSet,     status);
2438    fSets->addElement(fExtendSet,    status);
2439
2440    if (U_FAILURE(status)) {
2441        deferredStatus = status;
2442    }
2443}
2444
2445
2446
2447void RBBISentMonkey::setText(const UnicodeString &s) {
2448    fText       = &s;
2449}
2450
2451UVector  *RBBISentMonkey::charClasses() {
2452    return fSets;
2453}
2454
2455
2456//  moveBack()   Find the "significant" code point preceding the index i.
2457//               Skips over ($Extend | $Format)* .
2458//
2459int RBBISentMonkey::moveBack(int i) {
2460    if (i <= 0) {
2461        return -1;
2462    }
2463    UChar32   c;
2464    int32_t   j = i;
2465    do {
2466        j = fText->moveIndex32(j, -1);
2467        c = fText->char32At(j);
2468    }
2469    while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2470    return j;
2471
2472 }
2473
2474
2475int RBBISentMonkey::moveForward(int i) {
2476    if (i>=fText->length()) {
2477        return fText->length();
2478    }
2479    UChar32   c;
2480    int32_t   j = i;
2481    do {
2482        j = fText->moveIndex32(j, 1);
2483        c = cAt(j);
2484    }
2485    while (fFormatSet->contains(c) || fExtendSet->contains(c));
2486    return j;
2487}
2488
2489UChar32 RBBISentMonkey::cAt(int pos) {
2490    if (pos<0 || pos>=fText->length()) {
2491        return -1;
2492    } else {
2493        return fText->char32At(pos);
2494    }
2495}
2496
2497int32_t RBBISentMonkey::next(int32_t prevPos) {
2498    int    p0, p1, p2, p3;    // Indices of the significant code points around the
2499                              //   break position being tested.  The candidate break
2500                              //   location is before p2.
2501
2502    int     breakPos = -1;
2503
2504    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2505    UChar32 c;
2506
2507    if (U_FAILURE(deferredStatus)) {
2508        return -1;
2509    }
2510
2511    // Prev break at end of string.  return DONE.
2512    if (prevPos >= fText->length()) {
2513        return -1;
2514    }
2515    p0 = p1 = p2 = p3 = prevPos;
2516    c3 =  fText->char32At(prevPos);
2517    c0 = c1 = c2 = 0;
2518
2519    // Loop runs once per "significant" character position in the input text.
2520    for (;;) {
2521        // Move all of the positions forward in the input string.
2522        p0 = p1;  c0 = c1;
2523        p1 = p2;  c1 = c2;
2524        p2 = p3;  c2 = c3;
2525
2526        // Advancd p3 by    X(Extend | Format)*   Rule 4
2527        p3 = moveForward(p3);
2528        c3 = cAt(p3);
2529
2530        // Rule (3)  CR x LF
2531        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2532            continue;
2533        }
2534
2535        // Rule (4).   Sep  <break>
2536        if (fSepSet->contains(c1)) {
2537            p2 = p1+1;   // Separators don't combine with Extend or Format.
2538            break;
2539        }
2540
2541        if (p2 >= fText->length()) {
2542            // Reached end of string.  Always a break position.
2543            break;
2544        }
2545
2546        if (p2 == prevPos) {
2547            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2548            continue;
2549        }
2550
2551        // Rule (6).   ATerm x Numeric
2552        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2553            continue;
2554        }
2555
2556        // Rule (7).  Upper ATerm  x  Uppper
2557        if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2558            continue;
2559        }
2560
2561        // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2562        //           Note:  STerm | ATerm are added to the negated part of the expression by a
2563        //                  note to the Unicode 5.0 documents.
2564        int p8 = p1;
2565        while (fSpSet->contains(cAt(p8))) {
2566            p8 = moveBack(p8);
2567        }
2568        while (fCloseSet->contains(cAt(p8))) {
2569            p8 = moveBack(p8);
2570        }
2571        if (fATermSet->contains(cAt(p8))) {
2572            p8=p2;
2573            for (;;) {
2574                c = cAt(p8);
2575                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2576                    fLowerSet->contains(c) || fSepSet->contains(c) ||
2577                    fATermSet->contains(c) || fSTermSet->contains(c))  {
2578                    break;
2579                }
2580                p8 = moveForward(p8);
2581            }
2582            if (fLowerSet->contains(cAt(p8))) {
2583                continue;
2584            }
2585        }
2586
2587        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2588        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2589            p8 = p1;
2590            while (fSpSet->contains(cAt(p8))) {
2591                p8 = moveBack(p8);
2592            }
2593            while (fCloseSet->contains(cAt(p8))) {
2594                p8 = moveBack(p8);
2595            }
2596            c = cAt(p8);
2597            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2598                continue;
2599            }
2600        }
2601
2602        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2603        int p9 = p1;
2604        while (fCloseSet->contains(cAt(p9))) {
2605            p9 = moveBack(p9);
2606        }
2607        c = cAt(p9);
2608        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2609            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2610                continue;
2611            }
2612        }
2613
2614        // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2615        int p10 = p1;
2616        while (fSpSet->contains(cAt(p10))) {
2617            p10 = moveBack(p10);
2618        }
2619        while (fCloseSet->contains(cAt(p10))) {
2620            p10 = moveBack(p10);
2621        }
2622        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2623            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2624                continue;
2625            }
2626        }
2627
2628        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2629        int p11 = p1;
2630        if (fSepSet->contains(cAt(p11))) {
2631            p11 = moveBack(p11);
2632        }
2633        while (fSpSet->contains(cAt(p11))) {
2634            p11 = moveBack(p11);
2635        }
2636        while (fCloseSet->contains(cAt(p11))) {
2637            p11 = moveBack(p11);
2638        }
2639        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2640            break;
2641        }
2642
2643        //  Rule (12)  Any x Any
2644        continue;
2645    }
2646    breakPos = p2;
2647    return breakPos;
2648}
2649
2650RBBISentMonkey::~RBBISentMonkey() {
2651    delete fSets;
2652    delete fSepSet;
2653    delete fFormatSet;
2654    delete fSpSet;
2655    delete fLowerSet;
2656    delete fUpperSet;
2657    delete fOLetterSet;
2658    delete fNumericSet;
2659    delete fATermSet;
2660    delete fSContinueSet;
2661    delete fSTermSet;
2662    delete fCloseSet;
2663    delete fOtherSet;
2664    delete fExtendSet;
2665}
2666
2667
2668
2669//-------------------------------------------------------------------------------------------
2670//
2671//  RBBILineMonkey
2672//
2673//-------------------------------------------------------------------------------------------
2674
2675class RBBILineMonkey: public RBBIMonkeyKind {
2676public:
2677    RBBILineMonkey();
2678    virtual          ~RBBILineMonkey();
2679    virtual  UVector *charClasses();
2680    virtual  void     setText(const UnicodeString &s);
2681    virtual  int32_t  next(int32_t i);
2682    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2683private:
2684    UVector      *fSets;
2685
2686    UnicodeSet  *fBK;
2687    UnicodeSet  *fCR;
2688    UnicodeSet  *fLF;
2689    UnicodeSet  *fCM;
2690    UnicodeSet  *fNL;
2691    UnicodeSet  *fSG;
2692    UnicodeSet  *fWJ;
2693    UnicodeSet  *fZW;
2694    UnicodeSet  *fGL;
2695    UnicodeSet  *fCB;
2696    UnicodeSet  *fSP;
2697    UnicodeSet  *fB2;
2698    UnicodeSet  *fBA;
2699    UnicodeSet  *fBB;
2700    UnicodeSet  *fHY;
2701    UnicodeSet  *fH2;
2702    UnicodeSet  *fH3;
2703    UnicodeSet  *fCL;
2704    UnicodeSet  *fCP;
2705    UnicodeSet  *fEX;
2706    UnicodeSet  *fIN;
2707    UnicodeSet  *fJL;
2708    UnicodeSet  *fJV;
2709    UnicodeSet  *fJT;
2710    UnicodeSet  *fNS;
2711    UnicodeSet  *fOP;
2712    UnicodeSet  *fQU;
2713    UnicodeSet  *fIS;
2714    UnicodeSet  *fNU;
2715    UnicodeSet  *fPO;
2716    UnicodeSet  *fPR;
2717    UnicodeSet  *fSY;
2718    UnicodeSet  *fAI;
2719    UnicodeSet  *fAL;
2720    UnicodeSet  *fCJ;
2721    UnicodeSet  *fHL;
2722    UnicodeSet  *fID;
2723    UnicodeSet  *fRI;
2724    UnicodeSet  *fSA;
2725    UnicodeSet  *fXX;
2726
2727    BreakIterator  *fCharBI;
2728
2729    const UnicodeString  *fText;
2730    int32_t              *fOrigPositions;
2731
2732    RegexMatcher         *fNumberMatcher;
2733    RegexMatcher         *fLB11Matcher;
2734};
2735
2736
2737RBBILineMonkey::RBBILineMonkey()
2738{
2739    UErrorCode  status = U_ZERO_ERROR;
2740
2741    fSets  = new UVector(status);
2742
2743    fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2744    fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2745    fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2746    fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2747    fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2748    fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2749    fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2750    fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2751    fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2752    fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2753    fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2754    fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2755    fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2756    fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2757    fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2758    fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2759    fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2760    fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2761    fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2762    fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2763    fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2764    fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2765    fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2766    fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2767    fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2768    fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2769    fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2770    fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2771    fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2772    fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2773    fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2774    fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2775    fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2776    fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2777    fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2778    fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2779    fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2780    fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2781    fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2782    fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2783
2784    if (U_FAILURE(status)) {
2785        deferredStatus = status;
2786        fCharBI = NULL;
2787        fNumberMatcher = NULL;
2788        return;
2789    }
2790
2791    fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2792    fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2793    fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
2794    fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2795
2796    fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2797
2798    fSets->addElement(fBK, status);
2799    fSets->addElement(fCR, status);
2800    fSets->addElement(fLF, status);
2801    fSets->addElement(fCM, status);
2802    fSets->addElement(fNL, status);
2803    fSets->addElement(fWJ, status);
2804    fSets->addElement(fZW, status);
2805    fSets->addElement(fGL, status);
2806    fSets->addElement(fCB, status);
2807    fSets->addElement(fSP, status);
2808    fSets->addElement(fB2, status);
2809    fSets->addElement(fBA, status);
2810    fSets->addElement(fBB, status);
2811    fSets->addElement(fHY, status);
2812    fSets->addElement(fH2, status);
2813    fSets->addElement(fH3, status);
2814    fSets->addElement(fCL, status);
2815    fSets->addElement(fCP, status);
2816    fSets->addElement(fEX, status);
2817    fSets->addElement(fIN, status);
2818    fSets->addElement(fJL, status);
2819    fSets->addElement(fJT, status);
2820    fSets->addElement(fJV, status);
2821    fSets->addElement(fNS, status);
2822    fSets->addElement(fOP, status);
2823    fSets->addElement(fQU, status);
2824    fSets->addElement(fIS, status);
2825    fSets->addElement(fNU, status);
2826    fSets->addElement(fPO, status);
2827    fSets->addElement(fPR, status);
2828    fSets->addElement(fSY, status);
2829    fSets->addElement(fAI, status);
2830    fSets->addElement(fAL, status);
2831    fSets->addElement(fHL, status);
2832    fSets->addElement(fID, status);
2833    fSets->addElement(fWJ, status);
2834    fSets->addElement(fRI, status);
2835    fSets->addElement(fSA, status);
2836    fSets->addElement(fSG, status);
2837
2838    const char *rules =
2839            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2840            "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2841            "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2842            "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2843            "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
2844            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
2845
2846    fNumberMatcher = new RegexMatcher(
2847        UnicodeString(rules, -1, US_INV), 0, status);
2848
2849    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2850
2851    if (U_FAILURE(status)) {
2852        deferredStatus = status;
2853    }
2854}
2855
2856
2857void RBBILineMonkey::setText(const UnicodeString &s) {
2858    fText       = &s;
2859    fCharBI->setText(s);
2860    fNumberMatcher->reset(s);
2861}
2862
2863//
2864//  rule9Adjust
2865//     Line Break TR rules 9 and 10 implementation.
2866//     This deals with combining marks and other sequences that
2867//     that must be treated as if they were something other than what they actually are.
2868//
2869//     This is factored out into a separate function because it must be applied twice for
2870//     each potential break, once to the chars before the position being checked, then
2871//     again to the text following the possible break.
2872//
2873void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2874    if (pos == -1) {
2875        // Invalid initial position.  Happens during the warmup iteration of the
2876        //   main loop in next().
2877        return;
2878    }
2879
2880    int32_t  nPos = *nextPos;
2881
2882    // LB 9  Keep combining sequences together.
2883    //  advance over any CM class chars.  Note that Line Break CM is different
2884    //  from the normal Grapheme Extend property.
2885    if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2886          *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2887        for (;;) {
2888            *nextChar = fText->char32At(nPos);
2889            if (!fCM->contains(*nextChar)) {
2890                break;
2891            }
2892            nPos = fText->moveIndex32(nPos, 1);
2893        }
2894    }
2895
2896
2897    // LB 9 Treat X CM* as if it were x.
2898    //       No explicit action required.
2899
2900    // LB 10  Treat any remaining combining mark as AL
2901    if (fCM->contains(*posChar)) {
2902        *posChar = 0x41;   // thisChar = 'A';
2903    }
2904
2905    // Push the updated nextPos and nextChar back to our caller.
2906    // This only makes a difference if posChar got bigger by consuming a
2907    // combining sequence.
2908    *nextPos  = nPos;
2909    *nextChar = fText->char32At(nPos);
2910}
2911
2912
2913
2914int32_t RBBILineMonkey::next(int32_t startPos) {
2915    UErrorCode status = U_ZERO_ERROR;
2916    int32_t    pos;       //  Index of the char following a potential break position
2917    UChar32    thisChar;  //  Character at above position "pos"
2918
2919    int32_t    prevPos;   //  Index of the char preceding a potential break position
2920    UChar32    prevChar;  //  Character at above position.  Note that prevChar
2921                          //   and thisChar may not be adjacent because combining
2922                          //   characters between them will be ignored.
2923
2924    int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2925    UChar32    prevCharX2;
2926
2927    int32_t    nextPos;   //  Index of the next character following pos.
2928                          //     Usually skips over combining marks.
2929    int32_t    nextCPPos; //  Index of the code point following "pos."
2930                          //     May point to a combining mark.
2931    int32_t    tPos;      //  temp value.
2932    UChar32    c;
2933
2934    if (U_FAILURE(deferredStatus)) {
2935        return -1;
2936    }
2937
2938    if (startPos >= fText->length()) {
2939        return -1;
2940    }
2941
2942
2943    // Initial values for loop.  Loop will run the first time without finding breaks,
2944    //                           while the invalid values shift out and the "this" and
2945    //                           "prev" positions are filled in with good values.
2946    pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2947    thisChar = prevChar  = prevCharX2 = 0;
2948    nextPos  = nextCPPos = startPos;
2949
2950
2951    // Loop runs once per position in the test text, until a break position
2952    //  is found.
2953    for (;;) {
2954        prevPosX2 = prevPos;
2955        prevCharX2 = prevChar;
2956
2957        prevPos   = pos;
2958        prevChar  = thisChar;
2959
2960        pos       = nextPos;
2961        thisChar  = fText->char32At(pos);
2962
2963        nextCPPos = fText->moveIndex32(pos, 1);
2964        nextPos   = nextCPPos;
2965
2966        // Rule LB2 - Break at end of text.
2967        if (pos >= fText->length()) {
2968            break;
2969        }
2970
2971        // Rule LB 9 - adjust for combining sequences.
2972        //             We do this one out-of-order because the adjustment does not change anything
2973        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2974        //             be applied.
2975        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
2976        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2977        c = fText->char32At(nextPos);
2978        rule9Adjust(pos,     &thisChar, &nextPos, &c);
2979
2980        // If the loop is still warming up - if we haven't shifted the initial
2981        //   -1 positions out of prevPos yet - loop back to advance the
2982        //    position in the input without any further looking for breaks.
2983        if (prevPos == -1) {
2984            continue;
2985        }
2986
2987        // LB 4  Always break after hard line breaks,
2988        if (fBK->contains(prevChar)) {
2989            break;
2990        }
2991
2992        // LB 5  Break after CR, LF, NL, but not inside CR LF
2993        if (prevChar == 0x0d && thisChar == 0x0a) {
2994            continue;
2995        }
2996        if (prevChar == 0x0d ||
2997            prevChar == 0x0a ||
2998            prevChar == 0x85)  {
2999            break;
3000        }
3001
3002        // LB 6  Don't break before hard line breaks
3003        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3004            fBK->contains(thisChar)) {
3005                continue;
3006        }
3007
3008
3009        // LB 7  Don't break before spaces or zero-width space.
3010        if (fSP->contains(thisChar)) {
3011            continue;
3012        }
3013
3014        if (fZW->contains(thisChar)) {
3015            continue;
3016        }
3017
3018        // LB 8  Break after zero width space
3019        if (fZW->contains(prevChar)) {
3020            break;
3021        }
3022
3023        // LB 9, 10  Already done, at top of loop.
3024        //
3025
3026
3027        // LB 11  Do not break before or after WORD JOINER and related characters.
3028        //    x  WJ
3029        //    WJ  x
3030        //
3031        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3032            continue;
3033        }
3034
3035        // LB 12
3036        //    GL  x
3037        if (fGL->contains(prevChar)) {
3038            continue;
3039        }
3040
3041        // LB 12a
3042        //    [^SP BA HY] x GL
3043        if (!(fSP->contains(prevChar) ||
3044              fBA->contains(prevChar) ||
3045              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3046            continue;
3047        }
3048
3049
3050
3051        // LB 13  Don't break before closings.
3052        //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3053        //        fall into LB 17 and the more general number regular expression.
3054        //
3055        if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3056            (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3057                                         fEX->contains(thisChar)  ||
3058            (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3059            (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3060            continue;
3061        }
3062
3063        // LB 14 Don't break after OP SP*
3064        //       Scan backwards, checking for this sequence.
3065        //       The OP char could include combining marks, so we actually check for
3066        //           OP CM* SP*
3067        //       Another Twist: The Rule 67 fixes may have changed a SP CM
3068        //       sequence into a ID char, so before scanning back through spaces,
3069        //       verify that prevChar is indeed a space.  The prevChar variable
3070        //       may differ from fText[prevPos]
3071        tPos = prevPos;
3072        if (fSP->contains(prevChar)) {
3073            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3074                tPos=fText->moveIndex32(tPos, -1);
3075            }
3076        }
3077        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3078            tPos=fText->moveIndex32(tPos, -1);
3079        }
3080        if (fOP->contains(fText->char32At(tPos))) {
3081            continue;
3082        }
3083
3084
3085        // LB 15    QU SP* x OP
3086        if (fOP->contains(thisChar)) {
3087            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3088            int tPos = prevPos;
3089            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3090                tPos = fText->moveIndex32(tPos, -1);
3091            }
3092            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3093                tPos = fText->moveIndex32(tPos, -1);
3094            }
3095            if (fQU->contains(fText->char32At(tPos))) {
3096                continue;
3097            }
3098        }
3099
3100
3101
3102        // LB 16   (CL | CP) SP* x NS
3103        //    Scan backwards for SP* CM* (CL | CP)
3104        if (fNS->contains(thisChar)) {
3105            int tPos = prevPos;
3106            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3107                tPos = fText->moveIndex32(tPos, -1);
3108            }
3109            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3110                tPos = fText->moveIndex32(tPos, -1);
3111            }
3112            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3113                continue;
3114            }
3115        }
3116
3117
3118        // LB 17        B2 SP* x B2
3119        if (fB2->contains(thisChar)) {
3120            //  Scan backwards, checking for the B2 CM* SP* sequence.
3121            tPos = prevPos;
3122            if (fSP->contains(prevChar)) {
3123                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3124                    tPos=fText->moveIndex32(tPos, -1);
3125                }
3126            }
3127            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3128                tPos=fText->moveIndex32(tPos, -1);
3129            }
3130            if (fB2->contains(fText->char32At(tPos))) {
3131                continue;
3132            }
3133        }
3134
3135
3136        // LB 18    break after space
3137        if (fSP->contains(prevChar)) {
3138            break;
3139        }
3140
3141        // LB 19
3142        //    x   QU
3143        //    QU  x
3144        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3145            continue;
3146        }
3147
3148        // LB 20  Break around a CB
3149        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3150            break;
3151        }
3152
3153        // LB 21
3154        if (fBA->contains(thisChar) ||
3155            fHY->contains(thisChar) ||
3156            fNS->contains(thisChar) ||
3157            fBB->contains(prevChar) )   {
3158            continue;
3159        }
3160
3161        // LB 21a
3162        //   HL (HY | BA) x
3163        if (fHL->contains(prevCharX2) &&
3164                (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3165            continue;
3166        }
3167
3168        // LB 21b - Added for Apple 13927604
3169        if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3170            continue;
3171        }
3172
3173        // LB 22
3174        if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3175            (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3176            (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3177            (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3178            (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3179            continue;
3180        }
3181
3182
3183        // LB 23    ID x PO
3184        //          AL x NU
3185        //          HL x NU
3186        //          NU x AL
3187        if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3188            (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3189            (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3190            (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3191            (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
3192            continue;
3193        }
3194
3195        // LB 24  Do not break between prefix and letters or ideographs.
3196        //        PR x ID
3197        //        PR x (AL | HL)
3198        //        PO x (AL | HL)
3199        if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3200            (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3201            (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
3202            continue;
3203        }
3204
3205
3206
3207        // LB 25    Numbers
3208        if (fNumberMatcher->lookingAt(prevPos, status)) {
3209            if (U_FAILURE(status)) {
3210                break;
3211            }
3212            // Matched a number.  But could have been just a single digit, which would
3213            //    not represent a "no break here" between prevChar and thisChar
3214            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3215            if (numEndIdx > pos) {
3216                // Number match includes at least our two chars being checked
3217                if (numEndIdx > nextPos) {
3218                    // Number match includes additional chars.  Update pos and nextPos
3219                    //   so that next loop iteration will continue at the end of the number,
3220                    //   checking for breaks between last char in number & whatever follows.
3221                    pos = nextPos = numEndIdx;
3222                    do {
3223                        pos = fText->moveIndex32(pos, -1);
3224                        thisChar = fText->char32At(pos);
3225                    } while (fCM->contains(thisChar));
3226                }
3227                continue;
3228            }
3229        }
3230
3231
3232        // LB 26 Do not break a Korean syllable.
3233        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3234                                        fJV->contains(thisChar) ||
3235                                        fH2->contains(thisChar) ||
3236                                        fH3->contains(thisChar))) {
3237                                            continue;
3238                                        }
3239
3240        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3241            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3242                continue;
3243        }
3244
3245        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3246            fJT->contains(thisChar)) {
3247                continue;
3248        }
3249
3250        // LB 27 Treat a Korean Syllable Block the same as ID.
3251        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3252            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3253            fIN->contains(thisChar)) {
3254                continue;
3255            }
3256        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3257            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3258            fPO->contains(thisChar)) {
3259                continue;
3260            }
3261        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3262            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3263                continue;
3264            }
3265
3266
3267
3268        // LB 28  Do not break between alphabetics ("at").
3269        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3270            continue;
3271        }
3272
3273        // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3274        if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3275            continue;
3276        }
3277
3278        // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3279        //          (AL | NU) x OP
3280        //          CP x (AL | NU)
3281        if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3282            continue;
3283        }
3284        if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3285            continue;
3286        }
3287
3288        // LB30a  Do not break between regional indicators.
3289        //        RI x RI
3290        if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3291            continue;
3292        }
3293
3294        // LB 31    Break everywhere else
3295        break;
3296
3297    }
3298
3299    return pos;
3300}
3301
3302
3303UVector  *RBBILineMonkey::charClasses() {
3304    return fSets;
3305}
3306
3307
// Destructor.  Releases every heap object held through this monkey's member pointers.
RBBILineMonkey::~RBBILineMonkey() {
    // The vector handed out by charClasses().
    // NOTE(review): whether the vector owns its elements is not visible from here;
    //               the individual members are deleted explicitly below.
    delete fSets;

    // One member per line break class (members are named after the class abbreviations).
    delete fBK;
    delete fCR;
    delete fLF;
    delete fCM;
    delete fNL;
    delete fWJ;
    delete fZW;
    delete fGL;
    delete fCB;
    delete fSP;
    delete fB2;
    delete fBA;
    delete fBB;
    delete fHY;
    delete fH2;
    delete fH3;
    delete fCL;
    delete fCP;
    delete fEX;
    delete fIN;
    delete fJL;
    delete fJV;
    delete fJT;
    delete fNS;
    delete fOP;
    delete fQU;
    delete fIS;
    delete fNU;
    delete fPO;
    delete fPR;
    delete fSY;
    delete fAI;
    delete fAL;
    delete fCJ;
    delete fHL;
    delete fID;
    delete fRI;
    delete fSA;
    delete fSG;
    delete fXX;

    // Helper objects:  fNumberMatcher is the matcher used for the LB 25 (numbers) rule.
    delete fCharBI;
    delete fNumberMatcher;
}
3355
3356
//-------------------------------------------------------------------------------------------
//
//   TestMonkey
//
//     params
//       seed=nnnnn        Random number starting seed.
//                         Setting the seed allows errors to be reproduced.
//       loop=nnn          Looping count.  Controls running time.
//                         -1:  run forever.
//                          0 or greater:  run length.
//
//       type = char | word | line | sent | title
//
//       utext             Supply the test text to the break iterator through a UText.
//
//-------------------------------------------------------------------------------------------
3371
3372static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3373    int32_t val = defaultVal;
3374    name.append(" *= *(-?\\d+)");
3375    UErrorCode status = U_ZERO_ERROR;
3376    RegexMatcher m(name, params, 0, status);
3377    if (m.find()) {
3378        // The param exists.  Convert the string to an int.
3379        char valString[100];
3380        int32_t paramLength = m.end(1, status) - m.start(1, status);
3381        if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3382            paramLength = (int32_t)(sizeof(valString)-2);
3383        }
3384        params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3385        val = strtol(valString,  NULL, 10);
3386
3387        // Delete this parameter from the params string.
3388        m.reset();
3389        params = m.replaceFirst("", status);
3390    }
3391    U_ASSERT(U_SUCCESS(status));
3392    return val;
3393}
3394#endif
3395
3396#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3397static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3398                                    BreakIterator *bi,
3399                                    int expected[],
3400                                    int expectedcount)
3401{
3402    int count = 0;
3403    int i = 0;
3404    int forward[50];
3405    bi->setText(ustr);
3406    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3407        forward[count] = i;
3408        if (count < expectedcount && expected[count] != i) {
3409            test->errln("break forward test failed: expected %d but got %d",
3410                        expected[count], i);
3411            break;
3412        }
3413        count ++;
3414    }
3415    if (count != expectedcount) {
3416        printStringBreaks(ustr, expected, expectedcount);
3417        test->errln("break forward test failed: missed %d match",
3418                    expectedcount - count);
3419        return;
3420    }
3421    // testing boundaries
3422    for (i = 1; i < expectedcount; i ++) {
3423        int j = expected[i - 1];
3424        if (!bi->isBoundary(j)) {
3425            printStringBreaks(ustr, expected, expectedcount);
3426            test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3427            return;
3428        }
3429        for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3430            if (bi->isBoundary(j)) {
3431                printStringBreaks(ustr, expected, expectedcount);
3432                test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3433                return;
3434            }
3435        }
3436    }
3437
3438    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3439        count --;
3440        if (forward[count] != i) {
3441            printStringBreaks(ustr, expected, expectedcount);
3442            test->errln("happy break test previous() failed: expected %d but got %d",
3443                        forward[count], i);
3444            break;
3445        }
3446    }
3447    if (count != 0) {
3448        printStringBreaks(ustr, expected, expectedcount);
3449        test->errln("break test previous() failed: missed a match");
3450        return;
3451    }
3452
3453    // testing preceding
3454    for (i = 0; i < expectedcount - 1; i ++) {
3455        // int j = expected[i] + 1;
3456        int j = ustr.moveIndex32(expected[i], 1);
3457        for (; j <= expected[i + 1]; j ++) {
3458            if (bi->preceding(j) != expected[i]) {
3459                printStringBreaks(ustr, expected, expectedcount);
3460                test->errln("preceding(): Not expecting boundary at position %d", j);
3461                return;
3462            }
3463        }
3464    }
3465}
3466#endif
3467
3468void RBBITest::TestWordBreaks(void)
3469{
3470#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3471
3472    Locale        locale("en");
3473    UErrorCode    status = U_ZERO_ERROR;
3474    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3475    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3476    // Replaced any C+J characters in a row with a random sequence of characters
3477    // of the same length to make our C+J segmentation not get in the way.
3478    static const char *strlist[] =
3479    {
3480    "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3481    "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3482    "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3483    "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3484    "\\uac00\\u3588\\u009c\\u0953\\u194b",
3485    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3486    "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3487    "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3488    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3489    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3490    "\\u2027\\U000e0067\\u0a47\\u00b7",
3491    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3492    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3493    "\\u0589\\U000e006e\\u0a42\\U000104a5",
3494    "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3495    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3496    "\\u0027\\u11af\\U000e0057\\u0602",
3497    "\\U0001d7f2\\U000e007\\u0004\\u0589",
3498    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3499    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3500    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3501    "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3502    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3503    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3504    "\\u0233\\U000e0020\\u0a69\\u0d6a",
3505    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3506    "\\u18f4\\U000e0049\\u20e7\\u2027",
3507    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3508    "\\ua183\\u102d\\u0bec\\u003a",
3509    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3510    "\\u003a\\u0e57\\u0fad\\u002e",
3511    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3512    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3513    "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3514    "\\u003a\\u0664\\u00b7\\u1fba",
3515    "\\u003b\\u0027\\u00b7\\u47a3",
3516    "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3517    "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3518    "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3519    };
3520    int loop;
3521    if (U_FAILURE(status)) {
3522        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3523        return;
3524    }
3525    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3526        // printf("looping %d\n", loop);
3527        UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3528        // RBBICharMonkey monkey;
3529        RBBIWordMonkey monkey;
3530
3531        int expected[50];
3532        int expectedcount = 0;
3533
3534        monkey.setText(ustr);
3535        int i;
3536        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3537            expected[expectedcount ++] = i;
3538        }
3539
3540        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3541    }
3542    delete bi;
3543#endif
3544}
3545
3546void RBBITest::TestWordBoundary(void)
3547{
3548    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3549    Locale        locale("en");
3550    UErrorCode    status = U_ZERO_ERROR;
3551    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3552    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3553    UChar         str[50];
3554    static const char *strlist[] =
3555    {
3556    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3557    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3558    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3559    "\\u2027\\U000e0067\\u0a47\\u00b7",
3560    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3561    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3562    "\\u0589\\U000e006e\\u0a42\\U000104a5",
3563    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3564    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3565    "\\u0027\\u11af\\U000e0057\\u0602",
3566    "\\U0001d7f2\\U000e007\\u0004\\u0589",
3567    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3568    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3569    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3570    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3571    "\\U000e0065\\u302c\\u09ee\\U000e0068",
3572    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3573    "\\u0233\\U000e0020\\u0a69\\u0d6a",
3574    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3575    "\\u58f4\\U000e0049\\u20e7\\u2027",
3576    "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3577    "\\ua183\\u102d\\u0bec\\u003a",
3578    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3579    "\\u003a\\u0e57\\u0fad\\u002e",
3580    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3581    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3582    "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3583    "\\u003a\\u0664\\u00b7\\u1fba",
3584    "\\u003b\\u0027\\u00b7\\u47a3",
3585    };
3586    int loop;
3587    if (U_FAILURE(status)) {
3588        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3589        return;
3590    }
3591    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3592        // printf("looping %d\n", loop);
3593        u_unescape(strlist[loop], str, 20);
3594        UnicodeString ustr(str);
3595        int forward[50];
3596        int count = 0;
3597
3598        bi->setText(ustr);
3599        int prev = 0;
3600        int i;
3601        for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3602            forward[count ++] = i;
3603            if (i > prev) {
3604                int j;
3605                for (j = prev + 1; j < i; j ++) {
3606                    if (bi->isBoundary(j)) {
3607                        printStringBreaks(ustr, forward, count);
3608                        errln("happy boundary test failed: expected %d not a boundary",
3609                               j);
3610                        return;
3611                    }
3612                }
3613            }
3614            if (!bi->isBoundary(i)) {
3615                printStringBreaks(ustr, forward, count);
3616                errln("happy boundary test failed: expected %d a boundary",
3617                       i);
3618                return;
3619            }
3620            prev = i;
3621        }
3622    }
3623    delete bi;
3624}
3625
3626void RBBITest::TestLineBreaks(void)
3627{
3628#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3629    Locale        locale("en");
3630    UErrorCode    status = U_ZERO_ERROR;
3631    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3632    const int32_t  STRSIZE = 50;
3633    UChar         str[STRSIZE];
3634    static const char *strlist[] =
3635    {
3636     "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3637     "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3638             "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3639     "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3640             "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3641     "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3642     "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3643     "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3644     "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3645     "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3646     "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3647     "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3648     "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3649     "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3650     "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3651     "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3652     "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3653     "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3654     "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3655     "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3656     "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3657     "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3658     "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3659     "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3660     "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3661     "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3662     "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3663     "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3664     "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3665     "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3666     "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3667     "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3668     "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3669     "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3670     "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3671     "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3672     "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3673     "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3674     "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3675     "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3676     "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3677     "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3678         "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3679         "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3680         "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3681     "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3682         "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3683    };
3684    int loop;
3685    TEST_ASSERT_SUCCESS(status);
3686    if (U_FAILURE(status)) {
3687        return;
3688    }
3689    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3690        // printf("looping %d\n", loop);
3691        int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3692        if (t >= STRSIZE) {
3693            TEST_ASSERT(FALSE);
3694            continue;
3695        }
3696
3697
3698        UnicodeString ustr(str);
3699        RBBILineMonkey monkey;
3700        if (U_FAILURE(monkey.deferredStatus)) {
3701            continue;
3702        }
3703
3704        const int EXPECTEDSIZE = 50;
3705        int expected[EXPECTEDSIZE];
3706        int expectedcount = 0;
3707
3708        monkey.setText(ustr);
3709        int i;
3710        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3711            if (expectedcount >= EXPECTEDSIZE) {
3712                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3713                return;
3714            }
3715            expected[expectedcount ++] = i;
3716        }
3717
3718        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3719    }
3720    delete bi;
3721#endif
3722}
3723
3724void RBBITest::TestSentBreaks(void)
3725{
3726#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3727    Locale        locale("en");
3728    UErrorCode    status = U_ZERO_ERROR;
3729    BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3730    UChar         str[200];
3731    static const char *strlist[] =
3732    {
3733     "Now\ris\nthe\r\ntime\n\rfor\r\r",
3734     "This\n",
3735     "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3736     "\"Sentence ending with a quote.\" Bye.",
3737     "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3738     "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3739     "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3740     "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3741     "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3742     "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3743     "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3744             "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3745             "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3746             "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3747     "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3748             "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3749             "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3750             "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3751             "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3752             "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3753    };
3754    int loop;
3755    if (U_FAILURE(status)) {
3756        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3757        return;
3758    }
3759    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3760        u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3761        UnicodeString ustr(str);
3762
3763        RBBISentMonkey monkey;
3764        if (U_FAILURE(monkey.deferredStatus)) {
3765            continue;
3766        }
3767
3768        const int EXPECTEDSIZE = 50;
3769        int expected[EXPECTEDSIZE];
3770        int expectedcount = 0;
3771
3772        monkey.setText(ustr);
3773        int i;
3774        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3775            if (expectedcount >= EXPECTEDSIZE) {
3776                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3777                return;
3778            }
3779            expected[expectedcount ++] = i;
3780        }
3781
3782        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3783    }
3784    delete bi;
3785#endif
3786}
3787
3788void RBBITest::TestMonkey(char *params) {
3789#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3790
3791    UErrorCode     status    = U_ZERO_ERROR;
3792    int32_t        loopCount = 500;
3793    int32_t        seed      = 1;
3794    UnicodeString  breakType = "all";
3795    Locale         locale("en");
3796    UBool          useUText  = FALSE;
3797
3798    if (quick == FALSE) {
3799        loopCount = 10000;
3800    }
3801
3802    if (params) {
3803        UnicodeString p(params);
3804        loopCount = getIntParam("loop", p, loopCount);
3805        seed      = getIntParam("seed", p, seed);
3806
3807        RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3808        if (m.find()) {
3809            breakType = m.group(1, status);
3810            m.reset();
3811            p = m.replaceFirst("", status);
3812        }
3813
3814        RegexMatcher u(" *utext", p, 0, status);
3815        if (u.find()) {
3816            useUText = TRUE;
3817            u.reset();
3818            p = u.replaceFirst("", status);
3819        }
3820
3821
3822        // m.reset(p);
3823        if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3824            // Each option is stripped out of the option string as it is processed.
3825            // All options have been checked.  The option string should have been completely emptied..
3826            char buf[100];
3827            p.extract(buf, sizeof(buf), NULL, status);
3828            buf[sizeof(buf)-1] = 0;
3829            errln("Unrecognized or extra parameter:  %s\n", buf);
3830            return;
3831        }
3832
3833    }
3834
3835    if (breakType == "char" || breakType == "all") {
3836        RBBICharMonkey  m;
3837        BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3838        if (U_SUCCESS(status)) {
3839            RunMonkey(bi, m, "char", seed, loopCount, useUText);
3840            if (breakType == "all" && useUText==FALSE) {
3841                // Also run a quick test with UText when "all" is specified
3842                RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3843            }
3844        }
3845        else {
3846            errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3847        }
3848        delete bi;
3849    }
3850
3851    if (breakType == "word" || breakType == "all") {
3852        logln("Word Break Monkey Test");
3853        RBBIWordMonkey  m;
3854        BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3855        if (U_SUCCESS(status)) {
3856            RunMonkey(bi, m, "word", seed, loopCount, useUText);
3857        }
3858        else {
3859            errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3860        }
3861        delete bi;
3862    }
3863
3864    if (breakType == "line" || breakType == "all") {
3865        logln("Line Break Monkey Test");
3866        RBBILineMonkey  m;
3867        BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3868        if (loopCount >= 10) {
3869            loopCount = loopCount / 5;   // Line break runs slower than the others.
3870        }
3871        if (U_SUCCESS(status)) {
3872            RunMonkey(bi, m, "line", seed, loopCount, useUText);
3873        }
3874        else {
3875            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3876        }
3877        delete bi;
3878    }
3879
3880    if (breakType == "sent" || breakType == "all"  ) {
3881        logln("Sentence Break Monkey Test");
3882        RBBISentMonkey  m;
3883        BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3884        if (loopCount >= 10) {
3885            loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3886        }
3887        if (U_SUCCESS(status)) {
3888            RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3889        }
3890        else {
3891            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3892        }
3893        delete bi;
3894    }
3895
3896#endif
3897}
3898
//
//  Run a RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi      - the break iterator to use
//       mk      - MonkeyKind, abstraction for obtaining expected results
//       name    - Name of test (char, word, etc.) for use in error messages
//       seed    - Seed for starting random number generator (parameter from user)
//       numIterations - Number of random test strings to check.  -1 means run forever.
//       useUText      - If TRUE, give the test text to the break iterator through a UText
//                       instead of directly as a UnicodeString.
//
3908void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3909                         int32_t numIterations, UBool useUText) {
3910
3911#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3912
3913    const int32_t    TESTSTRINGLEN = 500;
3914    UnicodeString    testText;
3915    int32_t          numCharClasses;
3916    UVector          *chClasses;
3917    int              expected[TESTSTRINGLEN*2 + 1];
3918    int              expectedCount = 0;
3919    char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3920    char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3921    char             reverseBreaks[TESTSTRINGLEN*2+1];
3922    char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3923    char             followingBreaks[TESTSTRINGLEN*2+1];
3924    char             precedingBreaks[TESTSTRINGLEN*2+1];
3925    int              i;
3926    int              loopCount = 0;
3927
3928    m_seed = seed;
3929
3930    numCharClasses = mk.charClasses()->size();
3931    chClasses      = mk.charClasses();
3932
    // Check for errors that occurred during the construction of the MonkeyKind object.
    //  Can't report them where they occurred because errln() is a method coming from intlTest,
    //  and is not visible outside of RBBITest :-(
3936    if (U_FAILURE(mk.deferredStatus)) {
3937        errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3938        return;
3939    }
3940
3941    // Verify that the character classes all have at least one member.
3942    for (i=0; i<numCharClasses; i++) {
3943        UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3944        if (s == NULL || s->size() == 0) {
3945            errln("Character Class #%d is null or of zero size.", i);
3946            return;
3947        }
3948    }
3949
3950    while (loopCount < numIterations || numIterations == -1) {
3951        if (numIterations == -1 && loopCount % 10 == 0) {
3952            // If test is running in an infinite loop, display a periodic tic so
3953            //   we can tell that it is making progress.
3954            fprintf(stderr, ".");
3955        }
3956        // Save current random number seed, so that we can recreate the random numbers
3957        //   for this loop iteration in event of an error.
3958        seed = m_seed;
3959
3960        // Populate a test string with data.
3961        testText.truncate(0);
3962        for (i=0; i<TESTSTRINGLEN; i++) {
3963            int32_t  aClassNum = m_rand() % numCharClasses;
3964            UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3965            int32_t   charIdx = m_rand() % classSet->size();
3966            UChar32   c = classSet->charAt(charIdx);
3967            if (c < 0) {   // TODO:  deal with sets containing strings.
3968                errln("c < 0");
3969                break;
3970            }
3971            testText.append(c);
3972        }
3973
3974        // Calculate the expected results for this test string.
3975        mk.setText(testText);
3976        memset(expectedBreaks, 0, sizeof(expectedBreaks));
3977        expectedBreaks[0] = 1;
3978        int32_t breakPos = 0;
3979        expectedCount = 0;
3980        for (;;) {
3981            breakPos = mk.next(breakPos);
3982            if (breakPos == -1) {
3983                break;
3984            }
3985            if (breakPos > testText.length()) {
3986                errln("breakPos > testText.length()");
3987            }
3988            expectedBreaks[breakPos] = 1;
3989            U_ASSERT(expectedCount<testText.length());
3990            expected[expectedCount ++] = breakPos;
3991        }
3992
3993        // Find the break positions using forward iteration
3994        memset(forwardBreaks, 0, sizeof(forwardBreaks));
3995        if (useUText) {
3996            UErrorCode status = U_ZERO_ERROR;
3997            UText *testUText = utext_openReplaceable(NULL, &testText, &status);
3998            // testUText = utext_openUnicodeString(testUText, &testText, &status);
3999            bi->setText(testUText, status);
4000            TEST_ASSERT_SUCCESS(status);
4001            utext_close(testUText);   // The break iterator does a shallow clone of the UText
4002                                      //  This UText can be closed immediately, so long as the
4003                                      //  testText string continues to exist.
4004        } else {
4005            bi->setText(testText);
4006        }
4007
4008        for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4009            if (i < 0 || i > testText.length()) {
4010                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4011                break;
4012            }
4013            forwardBreaks[i] = 1;
4014        }
4015
4016        // Find the break positions using reverse iteration
4017        memset(reverseBreaks, 0, sizeof(reverseBreaks));
4018        for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4019            if (i < 0 || i > testText.length()) {
4020                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4021                break;
4022            }
4023            reverseBreaks[i] = 1;
4024        }
4025
4026        // Find the break positions using isBoundary() tests.
4027        memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4028        U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4029        for (i=0; i<=testText.length(); i++) {
4030            isBoundaryBreaks[i] = bi->isBoundary(i);
4031        }
4032
4033
4034        // Find the break positions using the following() function.
4035        // printf(".");
4036        memset(followingBreaks, 0, sizeof(followingBreaks));
4037        int32_t   lastBreakPos = 0;
4038        followingBreaks[0] = 1;
4039        for (i=0; i<testText.length(); i++) {
4040            breakPos = bi->following(i);
4041            if (breakPos <= i ||
4042                breakPos < lastBreakPos ||
4043                breakPos > testText.length() ||
4044                (breakPos > lastBreakPos && lastBreakPos > i)) {
4045                UChar32 brkChar = testText.char32At(lastBreakPos);
4046                if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4047                errln("%s break monkey test: "
4048                    "Out of range value returned by BreakIterator::following().\n"
4049                        "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4050                         name, seed, i, breakPos, lastBreakPos);
4051                }
4052                break;
4053            }
4054            followingBreaks[breakPos] = 1;
4055            lastBreakPos = breakPos;
4056        }
4057
4058        // Find the break positions using the preceding() function.
4059        memset(precedingBreaks, 0, sizeof(precedingBreaks));
4060        lastBreakPos = testText.length();
4061        precedingBreaks[testText.length()] = 1;
4062        for (i=testText.length(); i>0; i--) {
4063            breakPos = bi->preceding(i);
4064            if (breakPos >= i ||
4065                breakPos > lastBreakPos ||
4066                (breakPos < 0 && testText.getChar32Start(i)>0) ||
4067                (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4068                UChar32 brkChar = testText.char32At(breakPos);
4069                if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4070                errln("%s break monkey test: "
4071                    "Out of range value returned by BreakIterator::preceding().\n"
4072                    "index=%d;  prev returned %d; lastBreak=%d" ,
4073                    name,  i, breakPos, lastBreakPos);
4074                if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4075                    precedingBreaks[i] = 2;   // Forces an error.
4076                }
4077                }
4078            } else {
4079                if (breakPos >= 0) {
4080                    precedingBreaks[breakPos] = 1;
4081                }
4082                lastBreakPos = breakPos;
4083            }
4084        }
4085
4086        // Compare the expected and actual results.
4087        for (i=0; i<=testText.length(); i++) {
4088            const char *errorType = NULL;
4089            if  (forwardBreaks[i] != expectedBreaks[i]) {
4090                errorType = "next()";
4091            } else if (reverseBreaks[i] != forwardBreaks[i]) {
4092                errorType = "previous()";
4093            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4094                errorType = "isBoundary()";
4095            } else if (followingBreaks[i] != expectedBreaks[i]) {
4096                errorType = "following()";
4097            } else if (precedingBreaks[i] != expectedBreaks[i]) {
4098                errorType = "preceding()";
4099            }
4100
4101
4102            if (errorType != NULL) {
4103                // Format a range of the test text that includes the failure as
4104                //  a data item that can be included in the rbbi test data file.
4105
4106                // Start of the range is the last point where expected and actual results
4107                //   both agreed that there was a break position.
4108                int startContext = i;
4109                int32_t count = 0;
4110                for (;;) {
4111                    if (startContext==0) { break; }
4112                    startContext --;
4113                    if (expectedBreaks[startContext] != 0) {
4114                        if (count == 2) break;
4115                        count ++;
4116                    }
4117                }
4118
4119                // End of range is two expected breaks past the start position.
4120                int endContext = i + 1;
4121                int ci;
4122                for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4123                    for (;;) {
4124                        if (endContext >= testText.length()) {break;}
4125                        if (expectedBreaks[endContext-1] != 0) {
4126                            if (count == 0) break;
4127                            count --;
4128                        }
4129                        endContext ++;
4130                    }
4131                }
4132
4133                // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4134                UnicodeString errorText = "<data>";
4135                /***if (strcmp(errorType, "next()") == 0) {
4136                    startContext = 0;
4137                    endContext = testText.length();
4138
4139                    printStringBreaks(testText, expected, expectedCount);
4140                }***/
4141
4142                for (ci=startContext; ci<endContext;) {
4143                    UnicodeString hexChars("0123456789abcdef");
4144                    UChar32  c;
4145                    int      bn;
4146                    c = testText.char32At(ci);
4147                    if (ci == i) {
4148                        // This is the location of the error.
4149                        errorText.append("<?>");
4150                    } else if (expectedBreaks[ci] != 0) {
                        // This is a non-error expected break position.
4152                        errorText.append("\\");
4153                    }
4154                    if (c < 0x10000) {
4155                        errorText.append("\\u");
4156                        for (bn=12; bn>=0; bn-=4) {
4157                            errorText.append(hexChars.charAt((c>>bn)&0xf));
4158                        }
4159                    } else {
4160                        errorText.append("\\U");
4161                        for (bn=28; bn>=0; bn-=4) {
4162                            errorText.append(hexChars.charAt((c>>bn)&0xf));
4163                        }
4164                    }
4165                    ci = testText.moveIndex32(ci, 1);
4166                }
4167                errorText.append("\\");
4168                errorText.append("</data>\n");
4169
4170                // Output the error
4171                char  charErrorTxt[500];
4172                UErrorCode status = U_ZERO_ERROR;
4173                errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4174                charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4175                const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4176
4177                UChar32 brkChar = testText.char32At(i);
4178                if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4179                errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4180                    name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4181                    errorType, seed, i, charErrorTxt);
4182                }
4183                break;
4184            }
4185        }
4186
4187        loopCount++;
4188    }
4189#endif
4190}
4191
4192
4193//  Bug 5532.  UTF-8 based UText fails in dictionary code.
4194//             This test checks the initial patch,
4195//             which is to just keep it from crashing.  Correct word boundaries
4196//             await a proper fix to the dictionary code.
4197//
4198void RBBITest::TestBug5532(void)  {
4199   // Text includes a mixture of Thai and Latin.
4200   const unsigned char utf8Data[] = {
4201           0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4202           0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4203           0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4204           0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4205           0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4206           0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4207           0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4208           0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4209           0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4210           0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4211           0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4212
4213    UErrorCode status = U_ZERO_ERROR;
4214    UText utext=UTEXT_INITIALIZER;
4215    utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4216    TEST_ASSERT_SUCCESS(status);
4217
4218    BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4219    TEST_ASSERT_SUCCESS(status);
4220    if (U_SUCCESS(status)) {
4221        bi->setText(&utext, status);
4222        TEST_ASSERT_SUCCESS(status);
4223
4224        int32_t breakCount = 0;
4225        int32_t previousBreak = -1;
4226        for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4227            // For now, just make sure that the break iterator doesn't hang.
4228            TEST_ASSERT(previousBreak < bi->current());
4229            previousBreak = bi->current();
4230        }
4231        TEST_ASSERT(breakCount > 0);
4232    }
4233    delete bi;
4234    utext_close(&utext);
4235}
4236
4237
4238void RBBITest::TestBug9983(void)  {
4239    UnicodeString text = UnicodeString("\\u002A"  // * Other
4240                                       "\\uFF65"  //   Other
4241                                       "\\u309C"  //   Katakana
4242                                       "\\uFF9F"  //   Extend
4243                                       "\\uFF65"  //   Other
4244                                       "\\u0020"  //   Other
4245                                       "\\u0000").unescape();
4246
4247    UErrorCode status = U_ZERO_ERROR;
4248    LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4249        BreakIterator::createWordInstance(Locale::getRoot(), status)));
4250    TEST_ASSERT_SUCCESS(status);
4251    if (U_FAILURE(status)) {
4252        return;
4253    }
4254    brkiter->setText(text);
4255    int32_t offset, rstatus;
4256    brkiter->last();
4257    int32_t iterationCount = 0;
4258    while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4259        iterationCount++;
4260        rstatus = brkiter->getRuleStatus();
4261        // printf(" %d(%d)", offset, rstatus);
4262        if (iterationCount >= 10) {
4263           break;
4264        }
4265    }
4266    TEST_ASSERT(iterationCount == 6);
4267}
4268
4269
4270//
4271//  TestDebug    -  A place-holder test for debugging purposes.
4272//                  For putting in fragments of other tests that can be invoked
4273//                  for tracing  without a lot of unwanted extra stuff happening.
4274//
void RBBITest::TestDebug(void) {
    // The whole body is disabled with "#if 0", so as built this test does nothing.
    // What follows is scratch code kept as a starting point for debugging sessions.
#if 0
    UErrorCode   status = U_ZERO_ERROR;
    int pos = 0;
    int ruleStatus = 0;

    // Pick the kind of iterator to trace by moving the comment markers below.
    RuleBasedBreakIterator* bi =
       // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
       // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
       (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
    UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
    // UnicodeString s("Aaa.  Bcd");
    s = s.unescape();
    bi->setText(s);
    // Print whether offset 8 of the unescaped text is a boundary.
    UBool r = bi->isBoundary(8);
    printf("%s", r?"true":"false");
    // Early exit: the reverse-iteration dump below is unreachable while this return is present.
    return;
    pos = bi->last();
    do {
        // ruleStatus = bi->getRuleStatus();
        // With the line above commented out, ruleStatus keeps its initial value of 0.
        printf("%d\t%d\n", pos, ruleStatus);
        pos = bi->previous();
    } while (pos != BreakIterator::DONE);
#endif
}
4300
4301void RBBITest::TestProperties() {
4302    UErrorCode errorCode = U_ZERO_ERROR;
4303    UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4304    if (!prependSet.isEmpty()) {
4305        errln(
4306            "[:GCB=Prepend:] is not empty any more. "
4307            "Uncomment relevant lines in source/data/brkitr/char.txt and "
4308            "change this test to the opposite condition.");
4309    }
4310}
4311
4312#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4313