1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/************************************************************************
7*   Date        Name        Description
8*   12/15/99    Madhu        Creation.
9*   01/12/2000  Madhu        Updated for changed API and added new tests
10************************************************************************/
11
12#include "utypeinfo.h"  // for 'typeid' to work
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/utypes.h"
19#include "unicode/brkiter.h"
20#include "unicode/rbbi.h"
21#include "unicode/uchar.h"
22#include "unicode/utf16.h"
23#include "unicode/ucnv.h"
24#include "unicode/schriter.h"
25#include "unicode/uniset.h"
26#if !UCONFIG_NO_REGULAR_EXPRESSIONS
27#include "unicode/regex.h"
28#endif
29#include "unicode/ustring.h"
30#include "unicode/utext.h"
31#include "intltest.h"
32#include "rbbitst.h"
33#include <string.h>
34#include "uvector.h"
35#include "uvectr32.h"
36#include <string.h>
37#include <stdio.h>
38#include <stdlib.h>
39#include "unicode/numfmt.h"
40#include "unicode/uscript.h"
41
// TEST_ASSERT(x): if the expression x is false, report a failure through errln(),
//   identifying the source file and line of the assertion.  The macro does not
//   return from the enclosing function; the code following it still runs.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// TEST_ASSERT_SUCCESS(errcode): if errcode is a failing UErrorCode, report it through
//   errcheckln() together with the source file, line and the name of the error.
//   As with TEST_ASSERT, the code following the macro still runs.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
47
48
49//---------------------------------------------
50// runIndexedTest
51//---------------------------------------------
52
53
54//  Note:  Before adding new tests to this file, check whether the desired test data can
55//         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
56//         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
58//         will run there as well, without additional effort.
59
60void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
61{
62    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
63
64    switch (index) {
65#if !UCONFIG_NO_FILE_IO
66        case 0: name = "TestBug4153072";
67            if(exec) TestBug4153072();                         break;
68#else
69        case 0: name = "skip";
70            break;
71#endif
72
73        case 1: name = "skip";
74            break;
75        case 2: name = "TestStatusReturn";
76            if(exec) TestStatusReturn();                       break;
77
78#if !UCONFIG_NO_FILE_IO
79        case 3: name = "TestUnicodeFiles";
80            if(exec) TestUnicodeFiles();                       break;
81        case 4: name = "TestEmptyString";
82            if(exec) TestEmptyString();                        break;
83#else
84        case 3: case 4: name = "skip";
85            break;
86#endif
87
88        case 5: name = "TestGetAvailableLocales";
89            if(exec) TestGetAvailableLocales();                break;
90
91        case 6: name = "TestGetDisplayName";
92            if(exec) TestGetDisplayName();                     break;
93
94#if !UCONFIG_NO_FILE_IO
95        case 7: name = "TestEndBehaviour";
96            if(exec) TestEndBehaviour();                       break;
97        case 8: case 9: case 10: name = "skip";
98             break;
99        case 11: name = "TestWordBreaks";
100             if(exec) TestWordBreaks();                        break;
101        case 12: name = "TestWordBoundary";
102             if(exec) TestWordBoundary();                      break;
103        case 13: name = "TestLineBreaks";
104             if(exec) TestLineBreaks();                        break;
105        case 14: name = "TestSentBreaks";
106             if(exec) TestSentBreaks();                        break;
107        case 15: name = "TestExtended";
108             if(exec) TestExtended();                          break;
109#else
110        case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
111             break;
112#endif
113
114#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
115        case 16:
116            name = "TestMonkey"; if(exec)  TestMonkey(params); break;
117#else
118        case 16:
119             name = "skip";                                    break;
120#endif
121
122#if !UCONFIG_NO_FILE_IO
123        case 17: name = "TestBug3818";
124            if(exec) TestBug3818();                            break;
125#else
126        case 17: name = "skip";
127            break;
128#endif
129
130        case 18: name = "skip";
131            break;
132        case 19: name = "TestDebug";
133            if(exec) TestDebug();                              break;
134        case 20: name = "skip";
135            break;
136
137#if !UCONFIG_NO_FILE_IO
138        case 21: name = "TestBug5775";
139            if (exec) TestBug5775();                           break;
140#else
141        case 21: name = "skip";
142            break;
143#endif
144
145        case 22: name = "TestBug9983";
146            if (exec) TestBug9983();                           break;
147        case 23: name = "TestDictRules";
148            if (exec) TestDictRules();                         break;
149        case 24: name = "TestBug5532";
150            if (exec) TestBug5532();                           break;
151        default: name = ""; break; //needed to end loop
152    }
153}
154
155
156//---------------------------------------------------------------------------
157//
158//   class BITestData   Holds a set of Break iterator test data and results
159//                      Includes
160//                         - the string data to be broken
161//                         - a vector of the expected break positions.
162//                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
164//                         - The expected break tag values.
165//                         - Vectors of actual break positions and tag values.
166//                         - Functions for comparing actual with expected and
167//                            reporting errors.
168//
169//----------------------------------------------------------------------------
class BITestData {
public:
    UnicodeString    fDataToBreak;            // The text to be broken; grows as chunks are added.
    UVector          fExpectedBreakPositions; // Expected break positions, one per added chunk.
    UVector          fExpectedTags;           // Expected tag value for each expected break.
    UVector          fLineNum;                // Source line number recorded for each chunk.
    UVector          fActualBreakPositions;   // Test Results.
    UVector          fActualTags;             // Tag values obtained along with the actual breaks.

    // Constructs all of the vectors with the caller's status.
    BITestData(UErrorCode &status);
    // Appends a chunk of text (may be NULL) and records its end position, tag and line number.
    // Note that status is taken by value here.
    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    // Compares the actual results with the expected ones and reports differences through test.
    void             checkResults(const char *heading, RBBITest *test);
    // Reports a single break position mismatch through test.
    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    // Empties the actual-results vectors; the expected data is left alone.
    void             clearResults();
};
185
186//
187// Constructor.
188//
189BITestData::BITestData(UErrorCode &status)
190: fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
191  fActualTags(status)
192{
193}
194
195//
// addDataChunk.   Add a section (non-breaking) piece of data to the test data.
197//                 The macro form collects the line number, which is helpful
198//                 when tracking down failures.
199//
200//                 A null data item is inserted at the start of each test's data
201//                  to put the starting zero into the data list.  The position saved for
202//                  each non-null item is its ending position.
203//
204#define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
205void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
206    if (U_FAILURE(status)) {return;}
207    if (data != NULL) {
208        fDataToBreak.append(CharsToUnicodeString(data));
209    }
210    fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
211    fExpectedTags.addElement(tag, status);
212    fLineNum.addElement(lineNum, status);
213}
214
215
216//
217//  checkResults.   Compare the actual and expected break positions, report any differences.
218//
219void BITestData::checkResults(const char *heading, RBBITest *test) {
220    int32_t   expectedIndex = 0;
221    int32_t   actualIndex = 0;
222
223    for (;;) {
224        // If we've run through both the expected and actual results vectors, we're done.
225        //   break out of the loop.
226        if (expectedIndex >= fExpectedBreakPositions.size() &&
227            actualIndex   >= fActualBreakPositions.size()) {
228            break;
229        }
230
231
232        if (expectedIndex >= fExpectedBreakPositions.size()) {
233            err(heading, test, expectedIndex-1, actualIndex);
234            actualIndex++;
235            continue;
236        }
237
238        if (actualIndex >= fActualBreakPositions.size()) {
239            err(heading, test, expectedIndex, actualIndex-1);
240            expectedIndex++;
241            continue;
242        }
243
244        if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
245            err(heading, test, expectedIndex, actualIndex);
246            // Try to resync the positions of the indices, to avoid a rash of spurious erros.
247            if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
248                actualIndex++;
249            } else {
250                expectedIndex++;
251            }
252            continue;
253        }
254
255        if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
256            test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
257                heading, fLineNum.elementAt(expectedIndex),
258                fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
259        }
260
261        actualIndex++;
262        expectedIndex++;
263    }
264}
265
266//
267//  err   -  An error was found.  Report it, along with information about where the
268//                                incorrectly broken test data appeared in the source file.
269//
270void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
271{
272    int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
273    int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
274    int32_t   o        = 0;
275    int32_t   line     = fLineNum.elementAti(expectedIdx);
276    if (expectedIdx > 0) {
277        // The line numbers are off by one because a premature break occurs somewhere
278        //    within the previous item, rather than at the start of the current (expected) item.
279        //    We want to report the offset of the unexpected break from the start of
280        //      this previous item.
281        o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
282    }
283    if (actual < expected) {
284        test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
285    } else {
286        test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
287    }
288}
289
290
291void BITestData::clearResults() {
292    fActualBreakPositions.removeAllElements();
293    fActualTags.removeAllElements();
294}
295
296
297//--------------------------------------------------------------------------------------
298//
299//    RBBITest    constructor and destructor
300//
301//--------------------------------------------------------------------------------------
302
303RBBITest::RBBITest() {
304}
305
306
307RBBITest::~RBBITest() {
308}
309
310//-----------------------------------------------------------------------------------
311//
312//   Test for status {tag} return value from break rules.
313//        TODO:  a more thorough test.
314//
315//-----------------------------------------------------------------------------------
316void RBBITest::TestStatusReturn() {
317     UnicodeString rulesString1("$Letters = [:L:];\n"
318                                  "$Numbers = [:N:];\n"
319                                  "$Letters+{1};\n"
320                                  "$Numbers+{2};\n"
321                                  "Help\\ {4}/me\\!;\n"
322                                  "[^$Letters $Numbers];\n"
323                                  "!.*;\n", -1, US_INV);
324     UnicodeString testString1  = "abc123..abc Help me Help me!";
325                                // 01234567890123456789012345678
326     int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
327     int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
328
329     UErrorCode status=U_ZERO_ERROR;
330     UParseError    parseError;
331
332     BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
333     if(U_FAILURE(status)) {
334         dataerrln("FAIL : in construction - %s", u_errorName(status));
335     } else {
336         int32_t  pos;
337         int32_t  i = 0;
338         bi->setText(testString1);
339         for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
340             if (pos != bounds1[i]) {
341                 errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
342                 break;
343             }
344
345             int tag = bi->getRuleStatus();
346             if (tag != brkStatus[i]) {
347                 errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
348                 break;
349             }
350             i++;
351         }
352     }
353     delete bi;
354}
355
356
357static void printStringBreaks(UnicodeString ustr, int expected[],
358                              int expectedcount)
359{
360    UErrorCode status = U_ZERO_ERROR;
361    char name[100];
362    printf("code    alpha extend alphanum type word sent line name\n");
363    int j;
364    for (j = 0; j < ustr.length(); j ++) {
365        if (expectedcount > 0) {
366            int k;
367            for (k = 0; k < expectedcount; k ++) {
368                if (j == expected[k]) {
369                    printf("------------------------------------------------ %d\n",
370                           j);
371                }
372            }
373        }
374        UChar32 c = ustr.char32At(j);
375        if (c > 0xffff) {
376            j ++;
377        }
378        u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
379        printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
380                           u_isUAlphabetic(c),
381                           u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
382                           u_isalnum(c),
383                           u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
384                                                  u_charType(c),
385                                                  U_SHORT_PROPERTY_NAME),
386                           u_getPropertyValueName(UCHAR_WORD_BREAK,
387                                                  u_getIntPropertyValue(c,
388                                                          UCHAR_WORD_BREAK),
389                                                  U_SHORT_PROPERTY_NAME),
390                           u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
391                                   u_getIntPropertyValue(c,
392                                           UCHAR_SENTENCE_BREAK),
393                                   U_SHORT_PROPERTY_NAME),
394                           u_getPropertyValueName(UCHAR_LINE_BREAK,
395                                   u_getIntPropertyValue(c,
396                                           UCHAR_LINE_BREAK),
397                                   U_SHORT_PROPERTY_NAME),
398                           name);
399    }
400}
401
402
403void RBBITest::TestBug3818() {
404    UErrorCode  status = U_ZERO_ERROR;
405
406    // Four Thai words...
407    static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
408                                           0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
409    UnicodeString  thaiStr(thaiWordData);
410
411    BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
412    if (U_FAILURE(status) || bi == NULL) {
413        errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
414        return;
415    }
416    bi->setText(thaiStr);
417
418    int32_t  startOfSecondWord = bi->following(1);
419    if (startOfSecondWord != 4) {
420        errln("Fail at file %s, line %d expected start of word at 4, got %d",
421            __FILE__, __LINE__, startOfSecondWord);
422    }
423    startOfSecondWord = bi->following(0);
424    if (startOfSecondWord != 4) {
425        errln("Fail at file %s, line %d expected start of word at 4, got %d",
426            __FILE__, __LINE__, startOfSecondWord);
427    }
428    delete bi;
429}
430
431//----------------------------------------------------------------------------
432//
433// generalIteratorTest      Given a break iterator and a set of test data,
434//                          Run the tests and report the results.
435//
436//----------------------------------------------------------------------------
void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
{

    // Hand the test text to the iterator, then run each of the individual checks
    //   below in turn against the same test data.
    bi.setText(td.fDataToBreak);

    // Plain forward iteration.
    testFirstAndNext(bi, td);

    // Plain backward iteration.
    testLastAndPrevious(bi, td);

    // following(), preceding() and isBoundary() at every offset, then next(n).
    testFollowing(bi, td);
    testPreceding(bi, td);
    testIsBoundary(bi, td);
    doMultipleSelectionTest(bi, td);
}
451
452
453//
454//   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
455//                       kind of loop.
456//
457void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
458{
459    UErrorCode  status = U_ZERO_ERROR;
460    int32_t     p;
461    int32_t     lastP = -1;
462    int32_t     tag;
463
464    logln("Test first and next");
465    bi.setText(td.fDataToBreak);
466    td.clearResults();
467
468    for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
469        td.fActualBreakPositions.addElement(p, status);  // Save result.
470        tag = bi.getRuleStatus();
471        td.fActualTags.addElement(tag, status);
472        if (p <= lastP) {
473            // If the iterator is not making forward progress, stop.
474            //  No need to raise an error here, it'll be detected in the normal check of results.
475            break;
476        }
477        lastP = p;
478    }
479    td.checkResults("testFirstAndNext", this);
480}
481
482
483//
484//  TestLastAndPrevious.   Run the iterator backwards, starting with last().
485//
486void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
487{
488    UErrorCode  status = U_ZERO_ERROR;
489    int32_t     p;
490    int32_t     lastP  = 0x7ffffffe;
491    int32_t     tag;
492
493    logln("Test last and previous");
494    bi.setText(td.fDataToBreak);
495    td.clearResults();
496
497    for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
498        // Save break position.  Insert it at start of vector of results, shoving
499        //    already-saved results further towards the end.
500        td.fActualBreakPositions.insertElementAt(p, 0, status);
501        // bi.previous();   // TODO:  Why does this fix things up????
502        // bi.next();
503        tag = bi.getRuleStatus();
504        td.fActualTags.insertElementAt(tag, 0, status);
505        if (p >= lastP) {
506            // If the iterator is not making progress, stop.
507            //  No need to raise an error here, it'll be detected in the normal check of results.
508            break;
509        }
510        lastP = p;
511    }
512    td.checkResults("testLastAndPrevious", this);
513}
514
515
516void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
517{
518    UErrorCode  status = U_ZERO_ERROR;
519    int32_t     p;
520    int32_t     tag;
521    int32_t     lastP  = -2;     // A value that will never be returned as a break position.
522                                 //   cannot be -1; that is returned for DONE.
523    int         i;
524
525    logln("testFollowing():");
526    bi.setText(td.fDataToBreak);
527    td.clearResults();
528
529    // Save the starting point, since we won't get that out of following.
530    p = bi.first();
531    td.fActualBreakPositions.addElement(p, status);  // Save result.
532    tag = bi.getRuleStatus();
533    td.fActualTags.addElement(tag, status);
534
535    for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
536        p = bi.following(i);
537        if (p != lastP) {
538            if (p == RuleBasedBreakIterator::DONE) {
539                break;
540            }
541            // We've reached a new break position.  Save it.
542            td.fActualBreakPositions.addElement(p, status);  // Save result.
543            tag = bi.getRuleStatus();
544            td.fActualTags.addElement(tag, status);
545            lastP = p;
546        }
547    }
548    // The loop normally exits by means of the break in the middle.
549    // Make sure that the index was at the correct position for the break iterator to have
550    //   returned DONE.
551    if (i != td.fDataToBreak.length()) {
552        errln("testFollowing():  iterator returned DONE prematurely.");
553    }
554
555    // Full check of all results.
556    td.checkResults("testFollowing", this);
557}
558
559
560
561void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
562    UErrorCode  status = U_ZERO_ERROR;
563    int32_t     p;
564    int32_t     tag;
565    int32_t     lastP  = 0x7ffffffe;
566    int         i;
567
568    logln("testPreceding():");
569    bi.setText(td.fDataToBreak);
570    td.clearResults();
571
572    p = bi.last();
573    td.fActualBreakPositions.addElement(p, status);
574    tag = bi.getRuleStatus();
575    td.fActualTags.addElement(tag, status);
576
577    for (i = td.fDataToBreak.length(); i>=-1; i--) {
578        p = bi.preceding(i);
579        if (p != lastP) {
580            if (p == RuleBasedBreakIterator::DONE) {
581                break;
582            }
583            // We've reached a new break position.  Save it.
584            td.fActualBreakPositions.insertElementAt(p, 0, status);
585            lastP = p;
586            tag = bi.getRuleStatus();
587            td.fActualTags.insertElementAt(tag, 0, status);
588        }
589    }
590    // The loop normally exits by means of the break in the middle.
591    // Make sure that the index was at the correct position for the break iterator to have
592    //   returned DONE.
593    if (i != 0) {
594        errln("testPreceding():  iterator returned DONE prematurely.");
595    }
596
597    // Full check of all results.
598    td.checkResults("testPreceding", this);
599}
600
601
602
603void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
604    UErrorCode  status = U_ZERO_ERROR;
605    int         i;
606    int32_t     tag;
607
608    logln("testIsBoundary():");
609    bi.setText(td.fDataToBreak);
610    td.clearResults();
611
612    for (i = 0; i <= td.fDataToBreak.length(); i++) {
613        if (bi.isBoundary(i)) {
614            td.fActualBreakPositions.addElement(i, status);  // Save result.
615            tag = bi.getRuleStatus();
616            td.fActualTags.addElement(tag, status);
617        }
618    }
619    td.checkResults("testIsBoundary: ", this);
620}
621
622
623
624void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
625{
626    iterator.setText(td.fDataToBreak);
627
628    RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
629    int32_t offset = iterator.first();
630    int32_t testOffset;
631    int32_t count = 0;
632
633    logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
634
635    if (*testIterator != iterator)
636        errln("clone() or operator!= failed: two clones compared unequal");
637
638    do {
639        testOffset = testIterator->first();
640        testOffset = testIterator->next(count);
641        if (offset != testOffset)
642            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
643
644        if (offset != RuleBasedBreakIterator::DONE) {
645            count++;
646            offset = iterator.next();
647
648            if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
649                errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
650                if (count > 10000 || offset == -1) {
651                    errln("operator== failed too many times. Stopping test.");
652                    if (offset == -1) {
653                        errln("Does (RuleBasedBreakIterator::DONE == -1)?");
654                    }
655                    return;
656                }
657            }
658        }
659    } while (offset != RuleBasedBreakIterator::DONE);
660
661    // now do it backwards...
662    offset = iterator.last();
663    count = 0;
664
665    do {
666        testOffset = testIterator->last();
667        testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
668        if (offset != testOffset)
669            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
670
671        if (offset != RuleBasedBreakIterator::DONE) {
672            count--;
673            offset = iterator.previous();
674        }
675    } while (offset != RuleBasedBreakIterator::DONE);
676
677    delete testIterator;
678}
679
680
681//---------------------------------------------
682//
683//     other tests
684//
685//---------------------------------------------
686void RBBITest::TestEmptyString()
687{
688    UnicodeString text = "";
689    UErrorCode status = U_ZERO_ERROR;
690
691    BITestData x(status);
692    ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
693    RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
694    if (U_FAILURE(status))
695    {
696        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
697        return;
698    }
699    generalIteratorTest(*bi, x);
700    delete bi;
701}
702
703void RBBITest::TestGetAvailableLocales()
704{
705    int32_t locCount = 0;
706    const Locale* locList = BreakIterator::getAvailableLocales(locCount);
707
708    if (locCount == 0)
709        dataerrln("getAvailableLocales() returned an empty list!");
710    // Just make sure that it's returning good memory.
711    int32_t i;
712    for (i = 0; i < locCount; ++i) {
713        logln(locList[i].getName());
714    }
715}
716
717//Testing the BreakIterator::getDisplayName() function
718void RBBITest::TestGetDisplayName()
719{
720    UnicodeString   result;
721
722    BreakIterator::getDisplayName(Locale::getUS(), result);
723    if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
724        dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
725                + result);
726
727    BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
728    if (result != "French (France)")
729        dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
730                + result);
731}
732/**
733 * Test End Behaviour
734 * @bug 4068137
735 */
736void RBBITest::TestEndBehaviour()
737{
738    UErrorCode status = U_ZERO_ERROR;
739    UnicodeString testString("boo.");
740    BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
741    if (U_FAILURE(status))
742    {
743        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
744        return;
745    }
746    wb->setText(testString);
747
748    if (wb->first() != 0)
749        errln("Didn't get break at beginning of string.");
750    if (wb->next() != 3)
751        errln("Didn't get break before period in \"boo.\"");
752    if (wb->current() != 4 && wb->next() != 4)
753        errln("Didn't get break at end of string.");
754    delete wb;
755}
756/*
757 * @bug 4153072
758 */
759void RBBITest::TestBug4153072() {
760    UErrorCode status = U_ZERO_ERROR;
761    BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
762    if (U_FAILURE(status))
763    {
764        errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
765        return;
766    }
767    UnicodeString str("...Hello, World!...");
768    int32_t begin = 3;
769    int32_t end = str.length() - 3;
770    UBool onBoundary;
771
772    StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
773    iter->adoptText(textIterator);
774    int index;
775    // Note: with the switch to UText, there is no way to restrict the
776    //       iteration range to begin at an index other than zero.
777    //       String character iterators created with a non-zero bound are
778    //         treated by RBBI as being empty.
779    for (index = -1; index < begin + 1; ++index) {
780        onBoundary = iter->isBoundary(index);
781        if (index == 0?  !onBoundary : onBoundary) {
782            errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
783                            " and begin index = " + begin);
784        }
785    }
786    delete iter;
787}
788
789
790//
791// Test for problem reported by Ashok Matoria on 9 July 2007
792//    One.<kSoftHyphen><kSpace>Two.
793//
794//    Sentence break at start (0) and then on calling next() it breaks at
795//   'T' of "Two". Now, at this point if I do next() and
796//    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
797//
798void RBBITest::TestBug5775() {
799    UErrorCode status = U_ZERO_ERROR;
800    BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
801    TEST_ASSERT_SUCCESS(status);
802    if (U_FAILURE(status)) {
803        return;
804    }
805// Check for status first for better handling of no data errors.
806    TEST_ASSERT(bi != NULL);
807    if (bi == NULL) {
808        return;
809    }
810
811    UnicodeString s("One.\\u00ad Two.", -1, US_INV);
812    //               01234      56789
813    s = s.unescape();
814    bi->setText(s);
815    int pos = bi->next();
816    TEST_ASSERT(pos == 6);
817    pos = bi->next();
818    TEST_ASSERT(pos == 10);
819    pos = bi->previous();
820    TEST_ASSERT(pos == 6);
821    delete bi;
822}
823
824
825
826//------------------------------------------------------------------------------
827//
828//   RBBITest::Extended    Run  RBBI Tests from an external test data file
829//
830//------------------------------------------------------------------------------
831
// Bundle of the data that executeTest() works from.
// NOTE(review): nothing visible here shows who owns the pointed-to objects - confirm at the point of creation.
struct TestParams {
    BreakIterator   *bi;                // Iterator under test; executeTest() does nothing when it is NULL.
    UnicodeString    dataToBreak;       // The text handed to the iterator.
    UVector32       *expectedBreaks;    // Indexed by text position: 0 = no break expected there,
                                        //   any other value = the expected tag value, with -1 handled specially.
    UVector32       *srcLine;           // Indexed by text position: test file line, used in failure messages.
    UVector32       *srcCol;            // Indexed by text position: test file column, used in failure messages.
};
839
840void RBBITest::executeTest(TestParams *t) {
841    int32_t    bp;
842    int32_t    prevBP;
843    int32_t    i;
844
845    if (t->bi == NULL) {
846        return;
847    }
848
849    t->bi->setText(t->dataToBreak);
850    //
851    //  Run the iterator forward
852    //
853    prevBP = -1;
854    for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
855        if (prevBP ==  bp) {
856            // Fail for lack of forward progress.
857            errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
858                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
859            break;
860        }
861
862        // Check that there were we didn't miss an expected break between the last one
863        //  and this one.
864        for (i=prevBP+1; i<bp; i++) {
865            if (t->expectedBreaks->elementAti(i) != 0) {
866                int expected[] = {0, i};
867                printStringBreaks(t->dataToBreak, expected, 2);
868                errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
869                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
870            }
871        }
872
873        // Check that the break we did find was expected
874        if (t->expectedBreaks->elementAti(bp) == 0) {
875            int expected[] = {0, bp};
876            printStringBreaks(t->dataToBreak, expected, 2);
877            errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
878                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
879        } else {
880            // The break was expected.
881            //   Check that the {nnn} tag value is correct.
882            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
883            if (expectedTagVal == -1) {
884                expectedTagVal = 0;
885            }
886            int32_t line = t->srcLine->elementAti(bp);
887            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
888            if (rs != expectedTagVal) {
889                errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
890                      "          Actual, Expected status = %4d, %4d",
891                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
892            }
893        }
894
895
896        prevBP = bp;
897    }
898
899    // Verify that there were no missed expected breaks after the last one found
900    for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
901        if (t->expectedBreaks->elementAti(i) != 0) {
902            errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
903                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
904        }
905    }
906
907    //
908    //  Run the iterator backwards, verify that the same breaks are found.
909    //
910    prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
911    for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
912        if (prevBP ==  bp) {
913            // Fail for lack of progress.
914            errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
915                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
916            break;
917        }
918
919        // Check that there were we didn't miss an expected break between the last one
920        //  and this one.  (UVector returns zeros for index out of bounds.)
921        for (i=prevBP-1; i>bp; i--) {
922            if (t->expectedBreaks->elementAti(i) != 0) {
923                errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
924                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
925            }
926        }
927
928        // Check that the break we did find was expected
929        if (t->expectedBreaks->elementAti(bp) == 0) {
930            errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
931                   bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
932        } else {
933            // The break was expected.
934            //   Check that the {nnn} tag value is correct.
935            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
936            if (expectedTagVal == -1) {
937                expectedTagVal = 0;
938            }
939            int line = t->srcLine->elementAti(bp);
940            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
941            if (rs != expectedTagVal) {
942                errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
943                      "          Actual, Expected status = %4d, %4d",
944                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
945            }
946        }
947
948        prevBP = bp;
949    }
950
951    // Verify that there were no missed breaks prior to the last one found
952    for (i=prevBP-1; i>=0; i--) {
953        if (t->expectedBreaks->elementAti(i) != 0) {
954            errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
955                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
956        }
957    }
958
959    // Check isBoundary()
960    for (i=0; i<t->expectedBreaks->size(); i++) {
961        UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0);
962        UBool boundaryFound    = t->bi->isBoundary(i);
963        if (boundaryExpected != boundaryFound) {
964            errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
965                  "        Expected, Actual= %s, %s",
966                  i, t->srcLine->elementAti(i), t->srcCol->elementAti(i),
967                  boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
968        }
969    }
970
971    // Check following()
972    for (i=0; i<t->expectedBreaks->size(); i++) {
973        int32_t actualBreak = t->bi->following(i);
974        int32_t expectedBreak = BreakIterator::DONE;
975        for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) {
976            if (t->expectedBreaks->elementAti(j) != 0) {
977                expectedBreak = j;
978                break;
979            }
980        }
981        if (expectedBreak != actualBreak) {
982            errln("following(%d) incorrect. File line,col= %4d,%4d\n"
983                  "        Expected, Actual= %d, %d",
984                  i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
985        }
986    }
987
988    // Check preceding()
989    for (i=t->expectedBreaks->size(); i>=0; i--) {
990        int32_t actualBreak = t->bi->preceding(i);
991        int32_t expectedBreak = BreakIterator::DONE;
992
993        for (int32_t j=i-1; j >= 0; j--) {
994            if (t->expectedBreaks->elementAti(j) != 0) {
995                expectedBreak = j;
996                break;
997            }
998        }
999        if (expectedBreak != actualBreak) {
1000            errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1001                  "        Expected, Actual= %d, %d",
1002                  i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
1003        }
1004    }
1005}
1006
1007
1008void RBBITest::TestExtended() {
1009#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1010    UErrorCode      status  = U_ZERO_ERROR;
1011    Locale          locale("");
1012
1013    UnicodeString       rules;
1014    TestParams          tp;
1015    tp.bi             = NULL;
1016    tp.expectedBreaks = new UVector32(status);
1017    tp.srcLine        = new UVector32(status);
1018    tp.srcCol         = new UVector32(status);
1019
1020    RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
1021    if (U_FAILURE(status)) {
1022        dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1023    }
1024
1025
1026    //
1027    //  Open and read the test data file.
1028    //
1029    const char *testDataDirectory = IntlTest::getSourceTestData(status);
1030    char testFileName[1000];
1031    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1032        errln("Can't open test data.  Path too long.");
1033        return;
1034    }
1035    strcpy(testFileName, testDataDirectory);
1036    strcat(testFileName, "rbbitst.txt");
1037
1038    int    len;
1039    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1040    if (U_FAILURE(status)) {
1041        return; /* something went wrong, error already output */
1042    }
1043
1044
1045
1046
1047    //
1048    //  Put the test data into a UnicodeString
1049    //
1050    UnicodeString testString(FALSE, testFile, len);
1051
1052    enum EParseState{
1053        PARSE_COMMENT,
1054        PARSE_TAG,
1055        PARSE_DATA,
1056        PARSE_NUM
1057    }
1058    parseState = PARSE_TAG;
1059
1060    EParseState savedState = PARSE_TAG;
1061
1062    static const UChar CH_LF        = 0x0a;
1063    static const UChar CH_CR        = 0x0d;
1064    static const UChar CH_HASH      = 0x23;
1065    /*static const UChar CH_PERIOD    = 0x2e;*/
1066    static const UChar CH_LT        = 0x3c;
1067    static const UChar CH_GT        = 0x3e;
1068    static const UChar CH_BACKSLASH = 0x5c;
1069    static const UChar CH_BULLET    = 0x2022;
1070
1071    int32_t    lineNum  = 1;
1072    int32_t    colStart = 0;
1073    int32_t    column   = 0;
1074    int32_t    charIdx  = 0;
1075
1076    int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1077
1078    for (charIdx = 0; charIdx < len; ) {
1079        status = U_ZERO_ERROR;
1080        UChar  c = testString.charAt(charIdx);
1081        charIdx++;
1082        if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1083            // treat CRLF as a unit
1084            c = CH_LF;
1085            charIdx++;
1086        }
1087        if (c == CH_LF || c == CH_CR) {
1088            lineNum++;
1089            colStart = charIdx;
1090        }
1091        column = charIdx - colStart + 1;
1092
1093        switch (parseState) {
1094        case PARSE_COMMENT:
1095            if (c == 0x0a || c == 0x0d) {
1096                parseState = savedState;
1097            }
1098            break;
1099
1100        case PARSE_TAG:
1101            {
1102            if (c == CH_HASH) {
1103                parseState = PARSE_COMMENT;
1104                savedState = PARSE_TAG;
1105                break;
1106            }
1107            if (u_isUWhiteSpace(c)) {
1108                break;
1109            }
1110            if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1111                delete tp.bi;
1112                tp.bi = BreakIterator::createWordInstance(locale,  status);
1113                charIdx += 5;
1114                break;
1115            }
1116            if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1117                delete tp.bi;
1118                tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1119                charIdx += 5;
1120                break;
1121            }
1122            if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1123                delete tp.bi;
1124                tp.bi = BreakIterator::createLineInstance(locale,  status);
1125                charIdx += 5;
1126                break;
1127            }
1128            if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1129                delete tp.bi;
1130                tp.bi = NULL;
1131                tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1132                charIdx += 5;
1133                break;
1134            }
1135            if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1136                delete tp.bi;
1137                tp.bi = BreakIterator::createTitleInstance(locale,  status);
1138                charIdx += 6;
1139                break;
1140            }
1141
1142            // <locale  loc_name>
1143            localeMatcher.reset(testString);
1144            if (localeMatcher.lookingAt(charIdx-1, status)) {
1145                UnicodeString localeName = localeMatcher.group(1, status);
1146                char localeName8[100];
1147                localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1148                locale = Locale::createFromName(localeName8);
1149                charIdx += localeMatcher.group(0, status).length() - 1;
1150                TEST_ASSERT_SUCCESS(status);
1151                break;
1152            }
1153            if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1154                parseState = PARSE_DATA;
1155                charIdx += 5;
1156                tp.dataToBreak = "";
1157                tp.expectedBreaks->removeAllElements();
1158                tp.srcCol ->removeAllElements();
1159                tp.srcLine->removeAllElements();
1160                break;
1161            }
1162
1163            errln("line %d: Tag expected in test file.", lineNum);
1164            parseState = PARSE_COMMENT;
1165            savedState = PARSE_DATA;
1166            goto end_test; // Stop the test.
1167            }
1168            break;
1169
1170        case PARSE_DATA:
1171            if (c == CH_BULLET) {
1172                int32_t  breakIdx = tp.dataToBreak.length();
1173                tp.expectedBreaks->setSize(breakIdx+1);
1174                tp.expectedBreaks->setElementAt(-1, breakIdx);
1175                tp.srcLine->setSize(breakIdx+1);
1176                tp.srcLine->setElementAt(lineNum, breakIdx);
1177                tp.srcCol ->setSize(breakIdx+1);
1178                tp.srcCol ->setElementAt(column, breakIdx);
1179                break;
1180            }
1181
1182            if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1183                // Add final entry to mappings from break location to source file position.
1184                //  Need one extra because last break position returned is after the
1185                //    last char in the data, not at the last char.
1186                tp.srcLine->addElement(lineNum, status);
1187                tp.srcCol ->addElement(column, status);
1188
1189                parseState = PARSE_TAG;
1190                charIdx += 6;
1191
1192                // RUN THE TEST!
1193                executeTest(&tp);
1194                break;
1195            }
1196
1197            if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1198                // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1199                // Get the code point from the name and insert it into the test data.
1200                //   (Damn, no API takes names in Unicode  !!!
1201                //    we've got to take it back to char *)
1202                int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1203                int32_t nameLength = nameEndIdx - (charIdx+2);
1204                char charNameBuf[200];
1205                UChar32 theChar = -1;
1206                if (nameEndIdx != -1) {
1207                    UErrorCode status = U_ZERO_ERROR;
1208                    testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1209                    charNameBuf[sizeof(charNameBuf)-1] = 0;
1210                    theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1211                    if (U_FAILURE(status)) {
1212                        theChar = -1;
1213                    }
1214                }
1215                if (theChar == -1) {
1216                    errln("Error in named character in test file at line %d, col %d",
1217                        lineNum, column);
1218                } else {
1219                    // Named code point was recognized.  Insert it
1220                    //   into the test data.
1221                    tp.dataToBreak.append(theChar);
1222                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
1223                        tp.srcLine->addElement(lineNum, status);
1224                        tp.srcCol ->addElement(column, status);
1225                    }
1226                }
1227                if (nameEndIdx > charIdx) {
1228                    charIdx = nameEndIdx+1;
1229
1230                }
1231                break;
1232            }
1233
1234
1235
1236
1237            if (testString.compare(charIdx-1, 2, "<>") == 0) {
1238                charIdx++;
1239                int32_t  breakIdx = tp.dataToBreak.length();
1240                tp.expectedBreaks->setSize(breakIdx+1);
1241                tp.expectedBreaks->setElementAt(-1, breakIdx);
1242                tp.srcLine->setSize(breakIdx+1);
1243                tp.srcLine->setElementAt(lineNum, breakIdx);
1244                tp.srcCol ->setSize(breakIdx+1);
1245                tp.srcCol ->setElementAt(column, breakIdx);
1246                break;
1247            }
1248
1249            if (c == CH_LT) {
1250                tagValue   = 0;
1251                parseState = PARSE_NUM;
1252                break;
1253            }
1254
1255            if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1256                parseState = PARSE_COMMENT;
1257                savedState = PARSE_DATA;
1258                break;
1259            }
1260
1261            if (c == CH_BACKSLASH) {
1262                // Check for \ at end of line, a line continuation.
1263                //     Advance over (discard) the newline
1264                UChar32 cp = testString.char32At(charIdx);
1265                if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1266                    // We have a CR LF
1267                    //  Need an extra increment of the input ptr to move over both of them
1268                    charIdx++;
1269                }
1270                if (cp == CH_LF || cp == CH_CR) {
1271                    lineNum++;
1272                    colStart = charIdx;
1273                    charIdx++;
1274                    break;
1275                }
1276
1277                // Let unescape handle the back slash.
1278                cp = testString.unescapeAt(charIdx);
1279                if (cp != -1) {
1280                    // Escape sequence was recognized.  Insert the char
1281                    //   into the test data.
1282                    tp.dataToBreak.append(cp);
1283                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
1284                        tp.srcLine->addElement(lineNum, status);
1285                        tp.srcCol ->addElement(column, status);
1286                    }
1287                    break;
1288                }
1289
1290
1291                // Not a recognized backslash escape sequence.
1292                // Take the next char as a literal.
1293                //  TODO:  Should this be an error?
1294                c = testString.charAt(charIdx);
1295                charIdx = testString.moveIndex32(charIdx, 1);
1296            }
1297
1298            // Normal, non-escaped data char.
1299            tp.dataToBreak.append(c);
1300
1301            // Save the mapping from offset in the data to line/column numbers in
1302            //   the original input file.  Will be used for better error messages only.
1303            //   If there's an expected break before this char, the slot in the mapping
1304            //     vector will already be set for this char; don't overwrite it.
1305            if (tp.dataToBreak.length() > tp.srcLine->size()) {
1306                tp.srcLine->addElement(lineNum, status);
1307                tp.srcCol ->addElement(column, status);
1308            }
1309            break;
1310
1311
1312        case PARSE_NUM:
1313            // We are parsing an expected numeric tag value, like <1234>,
1314            //   within a chunk of data.
1315            if (u_isUWhiteSpace(c)) {
1316                break;
1317            }
1318
1319            if (c == CH_GT) {
1320                // Finished the number.  Add the info to the expected break data,
1321                //   and switch parse state back to doing plain data.
1322                parseState = PARSE_DATA;
1323                if (tagValue == 0) {
1324                    tagValue = -1;
1325                }
1326                int32_t  breakIdx = tp.dataToBreak.length();
1327                tp.expectedBreaks->setSize(breakIdx+1);
1328                tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1329                tp.srcLine->setSize(breakIdx+1);
1330                tp.srcLine->setElementAt(lineNum, breakIdx);
1331                tp.srcCol ->setSize(breakIdx+1);
1332                tp.srcCol ->setElementAt(column, breakIdx);
1333                break;
1334            }
1335
1336            if (u_isdigit(c)) {
1337                tagValue = tagValue*10 + u_charDigitValue(c);
1338                break;
1339            }
1340
1341            errln("Syntax Error in test file at line %d, col %d",
1342                lineNum, column);
1343            parseState = PARSE_COMMENT;
1344            goto end_test; // Stop the test
1345            break;
1346        }
1347
1348
1349        if (U_FAILURE(status)) {
1350            dataerrln("ICU Error %s while parsing test file at line %d.",
1351                u_errorName(status), lineNum);
1352            status = U_ZERO_ERROR;
1353            goto end_test; // Stop the test
1354        }
1355
1356    }
1357
1358end_test:
1359    delete tp.bi;
1360    delete tp.expectedBreaks;
1361    delete tp.srcLine;
1362    delete tp.srcCol;
1363    delete [] testFile;
1364#endif
1365}
1366
1367
1368//-------------------------------------------------------------------------------
1369//
1370//  TestDictRules   create a break iterator from source rules that includes a
1371//                  dictionary range.   Regression for bug #7130.  Source rules
1372//                  do not declare a break iterator type (word, line, sentence, etc.
1373//                  but the dictionary code, without a type, would loop.
1374//
1375//-------------------------------------------------------------------------------
1376void RBBITest::TestDictRules() {
1377    const char *rules =  "$dictionary = [a-z]; \n"
1378                         "!!forward; \n"
1379                         "$dictionary $dictionary; \n"
1380                         "!!reverse; \n"
1381                         "$dictionary $dictionary; \n";
1382    const char *text = "aa";
1383    UErrorCode status = U_ZERO_ERROR;
1384    UParseError parseError;
1385
1386    RuleBasedBreakIterator bi(rules, parseError, status);
1387    if (U_SUCCESS(status)) {
1388        UnicodeString utext = text;
1389        bi.setText(utext);
1390        int32_t position;
1391        int32_t loops;
1392        for (loops = 0; loops<10; loops++) {
1393            position = bi.next();
1394            if (position == RuleBasedBreakIterator::DONE) {
1395                break;
1396            }
1397        }
1398        TEST_ASSERT(loops == 1);
1399    } else {
1400        dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1401    }
1402}
1403
1404
1405
1406//-------------------------------------------------------------------------------
1407//
1408//    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1409//    return the datain one big UChar * buffer, which the caller must delete.
1410//
1411//    parameters:
1412//          fileName:   the name of the file, with no directory part.  The test data directory
1413//                      is assumed.
1414//          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1415//          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1416//                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1417//                      Pass NULL for the system default encoding.
1418//          status
1419//    returns:
1420//                      The file data, converted to UChar.
1421//                      The caller must delete this when done with
1422//                           delete [] theBuffer;
1423//
1424//    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1425//           Move this function to some common place.
1426//
1427//--------------------------------------------------------------------------------
1428UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1429    UChar       *retPtr  = NULL;
1430    char        *fileBuf = NULL;
1431    UConverter* conv     = NULL;
1432    FILE        *f       = NULL;
1433
1434    ulen = 0;
1435    if (U_FAILURE(status)) {
1436        return retPtr;
1437    }
1438
1439    //
1440    //  Open the file.
1441    //
1442    f = fopen(fileName, "rb");
1443    if (f == 0) {
1444        dataerrln("Error opening test data file %s\n", fileName);
1445        status = U_FILE_ACCESS_ERROR;
1446        return NULL;
1447    }
1448    //
1449    //  Read it in
1450    //
1451    int   fileSize;
1452    int   amt_read;
1453
1454    fseek( f, 0, SEEK_END);
1455    fileSize = ftell(f);
1456    fileBuf = new char[fileSize];
1457    fseek(f, 0, SEEK_SET);
1458    amt_read = fread(fileBuf, 1, fileSize, f);
1459    if (amt_read != fileSize || fileSize <= 0) {
1460        errln("Error reading test data file.");
1461        goto cleanUpAndReturn;
1462    }
1463
1464    //
1465    // Look for a Unicode Signature (BOM) on the data just read
1466    //
1467    int32_t        signatureLength;
1468    const char *   fileBufC;
1469    const char*    bomEncoding;
1470
1471    fileBufC = fileBuf;
1472    bomEncoding = ucnv_detectUnicodeSignature(
1473        fileBuf, fileSize, &signatureLength, &status);
1474    if(bomEncoding!=NULL ){
1475        fileBufC  += signatureLength;
1476        fileSize  -= signatureLength;
1477        encoding = bomEncoding;
1478    }
1479
1480    //
1481    // Open a converter to take the rule file to UTF-16
1482    //
1483    conv = ucnv_open(encoding, &status);
1484    if (U_FAILURE(status)) {
1485        goto cleanUpAndReturn;
1486    }
1487
1488    //
1489    // Convert the rules to UChar.
1490    //  Preflight first to determine required buffer size.
1491    //
1492    ulen = ucnv_toUChars(conv,
1493        NULL,           //  dest,
1494        0,              //  destCapacity,
1495        fileBufC,
1496        fileSize,
1497        &status);
1498    if (status == U_BUFFER_OVERFLOW_ERROR) {
1499        // Buffer Overflow is expected from the preflight operation.
1500        status = U_ZERO_ERROR;
1501
1502        retPtr = new UChar[ulen+1];
1503        ucnv_toUChars(conv,
1504            retPtr,       //  dest,
1505            ulen+1,
1506            fileBufC,
1507            fileSize,
1508            &status);
1509    }
1510
1511cleanUpAndReturn:
1512    fclose(f);
1513    delete []fileBuf;
1514    ucnv_close(conv);
1515    if (U_FAILURE(status)) {
1516        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1517        delete []retPtr;
1518        retPtr = 0;
1519        ulen   = 0;
1520    };
1521    return retPtr;
1522}
1523
1524
1525
1526//--------------------------------------------------------------------------------------------
1527//
1528//   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1529//
1530//-------------------------------------------------------------------------------------------
1531void RBBITest::TestUnicodeFiles() {
1532    RuleBasedBreakIterator  *bi;
1533    UErrorCode               status = U_ZERO_ERROR;
1534
1535    bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1536    TEST_ASSERT_SUCCESS(status);
1537    if (U_SUCCESS(status)) {
1538        runUnicodeTestData("GraphemeBreakTest.txt", bi);
1539    }
1540    delete bi;
1541
1542    bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1543    TEST_ASSERT_SUCCESS(status);
1544    if (U_SUCCESS(status)) {
1545        runUnicodeTestData("WordBreakTest.txt", bi);
1546    }
1547    delete bi;
1548
1549    bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1550    TEST_ASSERT_SUCCESS(status);
1551    if (U_SUCCESS(status)) {
1552        runUnicodeTestData("SentenceBreakTest.txt", bi);
1553    }
1554    delete bi;
1555
1556    bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1557    TEST_ASSERT_SUCCESS(status);
1558    if (U_SUCCESS(status)) {
1559        runUnicodeTestData("LineBreakTest.txt", bi);
1560    }
1561    delete bi;
1562}
1563
1564
1565//--------------------------------------------------------------------------------------------
1566//
1567//   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1568//
1569//-------------------------------------------------------------------------------------------
1570void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1571#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1572    // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
1573    UBool isTicket7270Fixed = !logKnownIssue("7270");
1574    UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
1575    UErrorCode  status = U_ZERO_ERROR;
1576
1577    //
1578    //  Open and read the test data file, put it into a UnicodeString.
1579    //
1580    const char *testDataDirectory = IntlTest::getSourceTestData(status);
1581    char testFileName[1000];
1582    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1583        dataerrln("Can't open test data.  Path too long.");
1584        return;
1585    }
1586    strcpy(testFileName, testDataDirectory);
1587    strcat(testFileName, fileName);
1588
1589    logln("Opening data file %s\n", fileName);
1590
1591    int    len;
1592    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1593    if (status != U_FILE_ACCESS_ERROR) {
1594        TEST_ASSERT_SUCCESS(status);
1595        TEST_ASSERT(testFile != NULL);
1596    }
1597    if (U_FAILURE(status) || testFile == NULL) {
1598        return; /* something went wrong, error already output */
1599    }
1600    UnicodeString testFileAsString(TRUE, testFile, len);
1601
1602    //
1603    //  Parse the test data file using a regular expression.
1604    //  Each kind of token is recognized in its own capture group; what type of item was scanned
1605    //     is identified by which group had a match.
1606    //
1607    //    Caputure Group #                  1          2            3            4           5
1608    //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1609    //
1610    UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1611    RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1612    UnicodeString   testString;
1613    UVector32       breakPositions(status);
1614    int             lineNumber = 1;
1615    TEST_ASSERT_SUCCESS(status);
1616    if (U_FAILURE(status)) {
1617        return;
1618    }
1619
1620    //
1621    //  Scan through each test case, building up the string to be broken in testString,
1622    //   and the positions that should be boundaries in the breakPositions vector.
1623    //
1624    int spin = 0;
1625    while (tokenMatcher.find()) {
1626      	if(tokenMatcher.hitEnd()) {
1627          /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1628             This occurred when the text file was corrupt (wasn't marked as UTF-8)
1629             and caused an infinite loop here on EBCDIC systems!
1630          */
1631          fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1632          //	   return;
1633      	}
1634        if (tokenMatcher.start(1, status) >= 0) {
1635            // Scanned a divide sign, indicating a break position in the test data.
1636            if (testString.length()>0) {
1637                breakPositions.addElement(testString.length(), status);
1638            }
1639        }
1640        else if (tokenMatcher.start(2, status) >= 0) {
1641            // Scanned an 'x', meaning no break at this position in the test data
1642            //   Nothing to be done here.
1643            }
1644        else if (tokenMatcher.start(3, status) >= 0) {
1645            // Scanned Hex digits.  Convert them to binary, append to the character data string.
1646            const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1647            int length = hexNumber.length();
1648            if (length<=8) {
1649                char buf[10];
1650                hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1651                UChar32 c = (UChar32)strtol(buf, NULL, 16);
1652                if (c<=0x10ffff) {
1653                    testString.append(c);
1654                } else {
1655                    errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1656                       fileName, lineNumber);
1657                }
1658            } else {
1659                errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1660                       fileName, lineNumber);
1661             }
1662        }
1663        else if (tokenMatcher.start(4, status) >= 0) {
1664            // Scanned to end of a line, possibly skipping over a comment in the process.
1665            //   If the line from the file contained test data, run the test now.
1666            //
1667            if (testString.length() > 0) {
1668// TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
1669//             Rule 8
1670//                ZW SP* <break>
1671//             is not yet implemented.
1672if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber ||
1673                                            5202 == lineNumber ||
1674                                            5214 == lineNumber ||
1675                                            5246 == lineNumber ||
1676                                            5298 == lineNumber ||
1677                                            5302 == lineNumber ))) {
1678                checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1679}
1680            }
1681
1682            // Clear out this test case.
1683            //    The string and breakPositions vector will be refilled as the next
1684            //       test case is parsed.
1685            testString.remove();
1686            breakPositions.removeAllElements();
1687            lineNumber++;
1688        } else {
1689            // Scanner catchall.  Something unrecognized appeared on the line.
1690            char token[16];
1691            UnicodeString uToken = tokenMatcher.group(0, status);
1692            uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1693            token[sizeof(token)-1] = 0;
1694            errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1695
1696            // Clean up, in preparation for continuing with the next line.
1697            testString.remove();
1698            breakPositions.removeAllElements();
1699            lineNumber++;
1700        }
1701        TEST_ASSERT_SUCCESS(status);
1702        if (U_FAILURE(status)) {
1703            break;
1704        }
1705    }
1706
1707    delete [] testFile;
1708 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1709}
1710
1711//--------------------------------------------------------------------------------------------
1712//
1713//   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1714//                            test data files.  Do only a simple, forward-only check -
1715//                            this test is mostly to check that ICU and the Unicode
1716//                            data agree with each other.
1717//
1718//--------------------------------------------------------------------------------------------
1719void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1720                         const UnicodeString &testString,   // Text data to be broken
1721                         UVector32 *breakPositions,         // Positions where breaks should be found.
1722                         RuleBasedBreakIterator *bi) {
1723    int32_t pos;                 // Break Position in the test string
1724    int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1725    int32_t expectedPos;         // Expected break position (index into test string)
1726
1727    bi->setText(testString);
1728    pos = bi->first();
1729    pos = bi->next();
1730
1731    while (pos != BreakIterator::DONE) {
1732        if (expectedI >= breakPositions->size()) {
1733            errln("Test file \"%s\", line %d, unexpected break found at position %d",
1734                testFileName, lineNumber, pos);
1735            break;
1736        }
1737        expectedPos = breakPositions->elementAti(expectedI);
1738        if (pos < expectedPos) {
1739            errln("Test file \"%s\", line %d, unexpected break found at position %d",
1740                testFileName, lineNumber, pos);
1741            break;
1742        }
1743        if (pos > expectedPos) {
1744            errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1745                testFileName, lineNumber, expectedPos);
1746            break;
1747        }
1748        pos = bi->next();
1749        expectedI++;
1750    }
1751
1752    if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1753        errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1754            testFileName, lineNumber, breakPositions->elementAti(expectedI));
1755    }
1756}
1757
1758
1759
1760#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1761//---------------------------------------------------------------------------------------
1762//
//   class RBBIMonkeyKind
1764//
1765//      Monkey Test for Break Iteration
1766//      Abstract interface class.   Concrete derived classes independently
1767//      implement the break rules for different iterator types.
1768//
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
1771//
1772//---------------------------------------------------------------------------------------
1773class RBBIMonkeyKind {
1774public:
1775    // Return a UVector of UnicodeSets, representing the character classes used
1776    //   for this type of iterator.
1777    virtual  UVector  *charClasses() = 0;
1778
1779    // Set the test text on which subsequent calls to next() will operate
1780    virtual  void      setText(const UnicodeString &s) = 0;
1781
1782    // Find the next break postion, starting from the prev break position, or from zero.
1783    // Return -1 after reaching end of string.
1784    virtual  int32_t   next(int32_t i) = 0;
1785
1786    virtual ~RBBIMonkeyKind();
1787    UErrorCode       deferredStatus;
1788
1789
1790protected:
1791    RBBIMonkeyKind();
1792
1793private:
1794};
1795
1796RBBIMonkeyKind::RBBIMonkeyKind() {
1797    deferredStatus = U_ZERO_ERROR;
1798}
1799
1800RBBIMonkeyKind::~RBBIMonkeyKind() {
1801}
1802
1803
1804//----------------------------------------------------------------------------------------
1805//
1806//   Random Numbers.  Similar to standard lib rand() and srand()
1807//                    Not using library to
1808//                      1.  Get same results on all platforms.
1809//                      2.  Get access to current seed, to more easily reproduce failures.
1810//
1811//---------------------------------------------------------------------------------------
// Current generator state.  Assign to it to reseed; read it to record the seed
// needed to reproduce a failure.
static uint32_t m_seed = 1;

// Advance the linear congruential generator by one step and return a value in
// the range 0..32767 taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t advanced = 1103515245u * m_seed + 12345u;   // wraps modulo 2^32
    m_seed = advanced;
    return (advanced >> 16) & 0x7fffu;
}
1819
1820
1821//------------------------------------------------------------------------------------------
1822//
1823//   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1824//                             of RBBIMonkeyKind.
1825//
1826//------------------------------------------------------------------------------------------
1827class RBBICharMonkey: public RBBIMonkeyKind {
1828public:
1829    RBBICharMonkey();
1830    virtual          ~RBBICharMonkey();
1831    virtual  UVector *charClasses();
1832    virtual  void     setText(const UnicodeString &s);
1833    virtual  int32_t  next(int32_t i);
1834private:
1835    UVector   *fSets;
1836
1837    UnicodeSet  *fCRLFSet;
1838    UnicodeSet  *fControlSet;
1839    UnicodeSet  *fExtendSet;
1840    UnicodeSet  *fRegionalIndicatorSet;
1841    UnicodeSet  *fPrependSet;
1842    UnicodeSet  *fSpacingSet;
1843    UnicodeSet  *fLSet;
1844    UnicodeSet  *fVSet;
1845    UnicodeSet  *fTSet;
1846    UnicodeSet  *fLVSet;
1847    UnicodeSet  *fLVTSet;
1848    UnicodeSet  *fHangulSet;
1849    UnicodeSet  *fAnySet;
1850
1851    const UnicodeString *fText;
1852};
1853
1854
1855RBBICharMonkey::RBBICharMonkey() {
1856    UErrorCode  status = U_ZERO_ERROR;
1857
1858    fText = NULL;
1859
1860    fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1861    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
1862    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
1863    fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1864    fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1865    fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1866    fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1867    fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1868    fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1869    fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1870    fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1871    fHangulSet  = new UnicodeSet();
1872    fHangulSet->addAll(*fLSet);
1873    fHangulSet->addAll(*fVSet);
1874    fHangulSet->addAll(*fTSet);
1875    fHangulSet->addAll(*fLVSet);
1876    fHangulSet->addAll(*fLVTSet);
1877    fAnySet     = new UnicodeSet(0, 0x10ffff);
1878
1879    fSets       = new UVector(status);
1880    fSets->addElement(fCRLFSet,    status);
1881    fSets->addElement(fControlSet, status);
1882    fSets->addElement(fExtendSet,  status);
1883    fSets->addElement(fRegionalIndicatorSet, status);
1884    if (!fPrependSet->isEmpty()) {
1885        fSets->addElement(fPrependSet, status);
1886    }
1887    fSets->addElement(fSpacingSet, status);
1888    fSets->addElement(fHangulSet,  status);
1889    fSets->addElement(fAnySet,     status);
1890    if (U_FAILURE(status)) {
1891        deferredStatus = status;
1892    }
1893}
1894
1895
1896void RBBICharMonkey::setText(const UnicodeString &s) {
1897    fText = &s;
1898}
1899
1900
1901
1902int32_t RBBICharMonkey::next(int32_t prevPos) {
1903    int    p0, p1, p2, p3;    // Indices of the significant code points around the
1904                              //   break position being tested.  The candidate break
1905                              //   location is before p2.
1906
1907    int     breakPos = -1;
1908
1909    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1910
1911    if (U_FAILURE(deferredStatus)) {
1912        return -1;
1913    }
1914
1915    // Previous break at end of string.  return DONE.
1916    if (prevPos >= fText->length()) {
1917        return -1;
1918    }
1919    p0 = p1 = p2 = p3 = prevPos;
1920    c3 =  fText->char32At(prevPos);
1921    c0 = c1 = c2 = 0;
1922    (void)p0;   // suppress set but not used warning.
1923    (void)c0;
1924
1925    // Loop runs once per "significant" character position in the input text.
1926    for (;;) {
1927        // Move all of the positions forward in the input string.
1928        p0 = p1;  c0 = c1;
1929        p1 = p2;  c1 = c2;
1930        p2 = p3;  c2 = c3;
1931
1932        // Advancd p3 by one codepoint
1933        p3 = fText->moveIndex32(p3, 1);
1934        c3 = fText->char32At(p3);
1935
1936        if (p1 == p2) {
1937            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1938            continue;
1939        }
1940        if (p2 == fText->length()) {
1941            // Reached end of string.  Always a break position.
1942            break;
1943        }
1944
1945        // Rule  GB3   CR x LF
1946        //     No Extend or Format characters may appear between the CR and LF,
1947        //     which requires the additional check for p2 immediately following p1.
1948        //
1949        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1950            continue;
1951        }
1952
1953        // Rule (GB4).   ( Control | CR | LF ) <break>
1954        if (fControlSet->contains(c1) ||
1955            c1 == 0x0D ||
1956            c1 == 0x0A)  {
1957            break;
1958        }
1959
1960        // Rule (GB5)    <break>  ( Control | CR | LF )
1961        //
1962        if (fControlSet->contains(c2) ||
1963            c2 == 0x0D ||
1964            c2 == 0x0A)  {
1965            break;
1966        }
1967
1968
1969        // Rule (GB6)  L x ( L | V | LV | LVT )
1970        if (fLSet->contains(c1) &&
1971               (fLSet->contains(c2)  ||
1972                fVSet->contains(c2)  ||
1973                fLVSet->contains(c2) ||
1974                fLVTSet->contains(c2))) {
1975            continue;
1976        }
1977
1978        // Rule (GB7)    ( LV | V )  x  ( V | T )
1979        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1980            (fVSet->contains(c2) || fTSet->contains(c2)))  {
1981            continue;
1982        }
1983
1984        // Rule (GB8)    ( LVT | T)  x T
1985        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1986            fTSet->contains(c2))  {
1987            continue;
1988        }
1989
1990        // Just adding extra Apple rule does here not work, behavior depends on arbitrary context
1991
1992        // Rule (GB8a)    Regional_Indicator x Regional_Indicator
1993        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1994            continue;
1995        }
1996
1997        // Rule (GB9)    Numeric x ALetter
1998        if (fExtendSet->contains(c2))  {
1999            continue;
2000        }
2001
2002        // Rule (GB9a)   x  SpacingMark
2003        if (fSpacingSet->contains(c2)) {
2004            continue;
2005        }
2006
2007        // Rule (GB9b)   Prepend x
2008        if (fPrependSet->contains(c1)) {
2009            continue;
2010        }
2011
2012        // Rule (GB10)  Any  <break>  Any
2013        break;
2014    }
2015
2016    breakPos = p2;
2017    return breakPos;
2018}
2019
2020
2021
2022UVector  *RBBICharMonkey::charClasses() {
2023    return fSets;
2024}
2025
2026
2027RBBICharMonkey::~RBBICharMonkey() {
2028    delete fSets;
2029    delete fCRLFSet;
2030    delete fControlSet;
2031    delete fExtendSet;
2032    delete fRegionalIndicatorSet;
2033    delete fPrependSet;
2034    delete fSpacingSet;
2035    delete fLSet;
2036    delete fVSet;
2037    delete fTSet;
2038    delete fLVSet;
2039    delete fLVTSet;
2040    delete fHangulSet;
2041    delete fAnySet;
2042}
2043
2044//------------------------------------------------------------------------------------------
2045//
2046//   class RBBIWordMonkey      Word Break specific implementation
2047//                             of RBBIMonkeyKind.
2048//
2049//------------------------------------------------------------------------------------------
2050class RBBIWordMonkey: public RBBIMonkeyKind {
2051public:
2052    RBBIWordMonkey();
2053    virtual          ~RBBIWordMonkey();
2054    virtual  UVector *charClasses();
2055    virtual  void     setText(const UnicodeString &s);
2056    virtual int32_t   next(int32_t i);
2057private:
2058    UVector      *fSets;
2059
2060    UnicodeSet  *fCRSet;
2061    UnicodeSet  *fLFSet;
2062    UnicodeSet  *fNewlineSet;
2063    UnicodeSet  *fRegionalIndicatorSet;
2064    UnicodeSet  *fKatakanaSet;
2065    UnicodeSet  *fHebrew_LetterSet;
2066    UnicodeSet  *fALetterSet;
2067    // TODO(jungshik): Do we still need this change?
2068    // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
2069    UnicodeSet  *fSingle_QuoteSet;
2070    UnicodeSet  *fDouble_QuoteSet;
2071    UnicodeSet  *fMidNumLetSet;
2072    UnicodeSet  *fMidLetterSet;
2073    UnicodeSet  *fMidNumSet;
2074    UnicodeSet  *fNumericSet;
2075    UnicodeSet  *fFormatSet;
2076    UnicodeSet  *fOtherSet;
2077    UnicodeSet  *fExtendSet;
2078    UnicodeSet  *fExtendNumLetSet;
2079    UnicodeSet  *fDictionaryCjkSet;
2080
2081    const UnicodeString  *fText;
2082};
2083
2084
2085RBBIWordMonkey::RBBIWordMonkey()
2086{
2087    UErrorCode  status = U_ZERO_ERROR;
2088
2089    fSets            = new UVector(status);
2090
2091    fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2092    fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2093    fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2094    fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2095    // Exclude Hangul syllables from ALetterSet during testing.
2096    // Leave CJK dictionary characters out from the monkey tests!
2097#if 0
2098    fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
2099                                      "[\\p{Line_Break = Complex_Context}"
2100                                      "-\\p{Grapheme_Cluster_Break = Extend}"
2101                                      "-\\p{Grapheme_Cluster_Break = Control}"
2102                                      "]]",
2103                                      status);
2104#endif
2105    fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2106    fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2107    fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2108    fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2109    fALetterSet->removeAll(*fDictionaryCjkSet);
2110    fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
2111    fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
2112    fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2113    fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2114    fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2115    // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2116    // we should figure out why
2117    fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2118    fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2119    fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2120    fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2121
2122    fOtherSet        = new UnicodeSet();
2123    if(U_FAILURE(status)) {
2124      deferredStatus = status;
2125      return;
2126    }
2127
2128    fOtherSet->complement();
2129    fOtherSet->removeAll(*fCRSet);
2130    fOtherSet->removeAll(*fLFSet);
2131    fOtherSet->removeAll(*fNewlineSet);
2132    fOtherSet->removeAll(*fKatakanaSet);
2133    fOtherSet->removeAll(*fHebrew_LetterSet);
2134    fOtherSet->removeAll(*fALetterSet);
2135    fOtherSet->removeAll(*fSingle_QuoteSet);
2136    fOtherSet->removeAll(*fDouble_QuoteSet);
2137    fOtherSet->removeAll(*fMidLetterSet);
2138    fOtherSet->removeAll(*fMidNumSet);
2139    fOtherSet->removeAll(*fNumericSet);
2140    fOtherSet->removeAll(*fExtendNumLetSet);
2141    fOtherSet->removeAll(*fFormatSet);
2142    fOtherSet->removeAll(*fExtendSet);
2143    fOtherSet->removeAll(*fRegionalIndicatorSet);
2144    // Inhibit dictionary characters from being tested at all.
2145    fOtherSet->removeAll(*fDictionaryCjkSet);
2146    fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2147
2148    fSets->addElement(fCRSet,                status);
2149    fSets->addElement(fLFSet,                status);
2150    fSets->addElement(fNewlineSet,           status);
2151    fSets->addElement(fRegionalIndicatorSet, status);
2152    fSets->addElement(fHebrew_LetterSet,     status);
2153    fSets->addElement(fALetterSet,           status);
2154    fSets->addElement(fSingle_QuoteSet,      status);
2155    fSets->addElement(fDouble_QuoteSet,      status);
2156    //fSets->addElement(fKatakanaSet,          status); //TODO: work out how to test katakana
2157    fSets->addElement(fMidLetterSet,         status);
2158    fSets->addElement(fMidNumLetSet,         status);
2159    fSets->addElement(fMidNumSet,            status);
2160    fSets->addElement(fNumericSet,           status);
2161    fSets->addElement(fFormatSet,            status);
2162    fSets->addElement(fExtendSet,            status);
2163    fSets->addElement(fOtherSet,             status);
2164    fSets->addElement(fExtendNumLetSet,      status);
2165
2166    if (U_FAILURE(status)) {
2167        deferredStatus = status;
2168    }
2169}
2170
2171void RBBIWordMonkey::setText(const UnicodeString &s) {
2172    fText       = &s;
2173}
2174
2175
2176int32_t RBBIWordMonkey::next(int32_t prevPos) {
2177    int    p0, p1, p2, p3;    // Indices of the significant code points around the
2178                              //   break position being tested.  The candidate break
2179                              //   location is before p2.
2180
2181    int     breakPos = -1;
2182
2183    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2184
2185    if (U_FAILURE(deferredStatus)) {
2186        return -1;
2187    }
2188
2189    // Prev break at end of string.  return DONE.
2190    if (prevPos >= fText->length()) {
2191        return -1;
2192    }
2193    p0 = p1 = p2 = p3 = prevPos;
2194    c3 =  fText->char32At(prevPos);
2195    c0 = c1 = c2 = 0;
2196    (void)p0;       // Suppress set but not used warning.
2197
2198    // Loop runs once per "significant" character position in the input text.
2199    for (;;) {
2200        // Move all of the positions forward in the input string.
2201        p0 = p1;  c0 = c1;
2202        p1 = p2;  c1 = c2;
2203        p2 = p3;  c2 = c3;
2204
2205        // Advancd p3 by    X(Extend | Format)*   Rule 4
2206        //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2207        do {
2208            p3 = fText->moveIndex32(p3, 1);
2209            c3 = fText->char32At(p3);
2210            if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2211               break;
2212            };
2213        }
2214        while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2215
2216
2217        if (p1 == p2) {
2218            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2219            continue;
2220        }
2221        if (p2 == fText->length()) {
2222            // Reached end of string.  Always a break position.
2223            break;
2224        }
2225
2226        // Rule  (3)   CR x LF
2227        //     No Extend or Format characters may appear between the CR and LF,
2228        //     which requires the additional check for p2 immediately following p1.
2229        //
2230        if (c1==0x0D && c2==0x0A) {
2231            continue;
2232        }
2233
2234        // Rule (3a)  Break before and after newlines (including CR and LF)
2235        //
2236        if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2237            break;
2238        };
2239        if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2240            break;
2241        };
2242
2243        // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2244        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2245            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2246            continue;
2247        }
2248
2249        // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2250        //
2251        if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2252             (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2253             (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2254            continue;
2255        }
2256
2257        // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
2258        if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2259            (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2260            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2261            continue;
2262        }
2263
2264        // Rule (7a)     Hebrew_Letter x Single_Quote
2265        if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2266            continue;
2267        }
2268
2269        // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
2270        if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2271            continue;
2272        }
2273
2274        // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
2275        if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2276            continue;
2277        }
2278
2279        // Rule (8)    Numeric x Numeric
2280        if (fNumericSet->contains(c1) &&
2281            fNumericSet->contains(c2))  {
2282            continue;
2283        }
2284
2285        // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
2286        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2287            fNumericSet->contains(c2))  {
2288            continue;
2289        }
2290
2291        // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
2292        if (fNumericSet->contains(c1) &&
2293            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2294            continue;
2295        }
2296
2297        // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
2298        if (fNumericSet->contains(c0) &&
2299            (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2300            fNumericSet->contains(c2)) {
2301            continue;
2302        }
2303
2304        // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2305        if (fNumericSet->contains(c1) &&
2306            (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2307            fNumericSet->contains(c3)) {
2308            continue;
2309        }
2310
2311        // Rule (13)  Katakana x Katakana
2312        if (fKatakanaSet->contains(c1) &&
2313            fKatakanaSet->contains(c2))  {
2314            continue;
2315        }
2316
2317        // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2318        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2319             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2320             fExtendNumLetSet->contains(c2)) {
2321                continue;
2322        }
2323
2324        // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2325        if (fExtendNumLetSet->contains(c1) &&
2326                (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2327                 fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2328            continue;
2329        }
2330
2331        // Rule 13c
2332        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2333            continue;
2334        }
2335
2336        // Rule 14.  Break found here.
2337        break;
2338    }
2339
2340    breakPos = p2;
2341    return breakPos;
2342}
2343
2344
2345UVector  *RBBIWordMonkey::charClasses() {
2346    return fSets;
2347}
2348
2349
2350RBBIWordMonkey::~RBBIWordMonkey() {
2351    delete fSets;
2352    delete fCRSet;
2353    delete fLFSet;
2354    delete fNewlineSet;
2355    delete fKatakanaSet;
2356    delete fHebrew_LetterSet;
2357    delete fALetterSet;
2358    delete fSingle_QuoteSet;
2359    delete fDouble_QuoteSet;
2360    delete fMidNumLetSet;
2361    delete fMidLetterSet;
2362    delete fMidNumSet;
2363    delete fNumericSet;
2364    delete fFormatSet;
2365    delete fExtendSet;
2366    delete fExtendNumLetSet;
2367    delete fRegionalIndicatorSet;
2368    delete fDictionaryCjkSet;
2369    delete fOtherSet;
2370}
2371
2372
2373
2374
2375//------------------------------------------------------------------------------------------
2376//
2377//   class RBBISentMonkey      Sentence Break specific implementation
2378//                             of RBBIMonkeyKind.
2379//
2380//------------------------------------------------------------------------------------------
// Sentence-break flavor of RBBIMonkeyKind.  Holds one UnicodeSet per
// Sentence_Break character class plus the text currently being examined,
// and computes break positions in next().
class RBBISentMonkey: public RBBIMonkeyKind {
public:
    RBBISentMonkey();
    virtual          ~RBBISentMonkey();
    // Returns fSets.  The vector stays owned by this object.
    virtual  UVector *charClasses();
    // Records the address of s in fText; s is not copied.
    virtual  void     setText(const UnicodeString &s);
    // Returns the first break position following index i, or -1 when there is none.
    virtual int32_t   next(int32_t i);
private:
    // Step backward / forward from posFrom to the neighboring code point that
    // is not in fFormatSet or fExtendSet.
    int               moveBack(int posFrom);
    int               moveForward(int posFrom);
    // Code point at pos, or -1 when pos is outside the text.
    UChar32           cAt(int pos);

    // Vector of the set pointers below, as returned by charClasses().
    UVector      *fSets;

    // One set per character class; each is allocated in the constructor and
    // deleted in the destructor.
    UnicodeSet  *fSepSet;
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fSpSet;
    UnicodeSet  *fLowerSet;
    UnicodeSet  *fUpperSet;
    UnicodeSet  *fOLetterSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fATermSet;
    UnicodeSet  *fSContinueSet;
    UnicodeSet  *fSTermSet;
    UnicodeSet  *fCloseSet;
    UnicodeSet  *fOtherSet;
    UnicodeSet  *fExtendSet;

    // Text under test; points at the string most recently passed to setText().
    const UnicodeString  *fText;

};
2412
2413RBBISentMonkey::RBBISentMonkey()
2414{
2415    UErrorCode  status = U_ZERO_ERROR;
2416
2417    fSets            = new UVector(status);
2418
2419    //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2420    //                       set and made into character classes of their own.  For the monkey impl,
2421    //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2422    fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2423    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2424    fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2425    fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2426    fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2427    fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2428    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2429    fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2430    fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2431    fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2432    fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2433    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2434    fOtherSet        = new UnicodeSet();
2435
2436    if(U_FAILURE(status)) {
2437      deferredStatus = status;
2438      return;
2439    }
2440
2441    fOtherSet->complement();
2442    fOtherSet->removeAll(*fSepSet);
2443    fOtherSet->removeAll(*fFormatSet);
2444    fOtherSet->removeAll(*fSpSet);
2445    fOtherSet->removeAll(*fLowerSet);
2446    fOtherSet->removeAll(*fUpperSet);
2447    fOtherSet->removeAll(*fOLetterSet);
2448    fOtherSet->removeAll(*fNumericSet);
2449    fOtherSet->removeAll(*fATermSet);
2450    fOtherSet->removeAll(*fSContinueSet);
2451    fOtherSet->removeAll(*fSTermSet);
2452    fOtherSet->removeAll(*fCloseSet);
2453    fOtherSet->removeAll(*fExtendSet);
2454
2455    fSets->addElement(fSepSet,       status);
2456    fSets->addElement(fFormatSet,    status);
2457    fSets->addElement(fSpSet,        status);
2458    fSets->addElement(fLowerSet,     status);
2459    fSets->addElement(fUpperSet,     status);
2460    fSets->addElement(fOLetterSet,   status);
2461    fSets->addElement(fNumericSet,   status);
2462    fSets->addElement(fATermSet,     status);
2463    fSets->addElement(fSContinueSet, status);
2464    fSets->addElement(fSTermSet,     status);
2465    fSets->addElement(fCloseSet,     status);
2466    fSets->addElement(fOtherSet,     status);
2467    fSets->addElement(fExtendSet,    status);
2468
2469    if (U_FAILURE(status)) {
2470        deferredStatus = status;
2471    }
2472}
2473
2474
2475
2476void RBBISentMonkey::setText(const UnicodeString &s) {
2477    fText       = &s;
2478}
2479
2480UVector  *RBBISentMonkey::charClasses() {
2481    return fSets;
2482}
2483
2484
2485//  moveBack()   Find the "significant" code point preceding the index i.
2486//               Skips over ($Extend | $Format)* .
2487//
2488int RBBISentMonkey::moveBack(int i) {
2489    if (i <= 0) {
2490        return -1;
2491    }
2492    UChar32   c;
2493    int32_t   j = i;
2494    do {
2495        j = fText->moveIndex32(j, -1);
2496        c = fText->char32At(j);
2497    }
2498    while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2499    return j;
2500
2501 }
2502
2503
2504int RBBISentMonkey::moveForward(int i) {
2505    if (i>=fText->length()) {
2506        return fText->length();
2507    }
2508    UChar32   c;
2509    int32_t   j = i;
2510    do {
2511        j = fText->moveIndex32(j, 1);
2512        c = cAt(j);
2513    }
2514    while (fFormatSet->contains(c) || fExtendSet->contains(c));
2515    return j;
2516}
2517
2518UChar32 RBBISentMonkey::cAt(int pos) {
2519    if (pos<0 || pos>=fText->length()) {
2520        return -1;
2521    } else {
2522        return fText->char32At(pos);
2523    }
2524}
2525
//  next()   Rule-by-rule computation of the sentence break that follows prevPos.
//
//           Slides a four-position window (p0..p3, with code points c0..c3)
//           over the "significant" characters of the text, i.e. positions
//           reached with moveForward(), which steps over Format and Extend
//           characters.  The candidate break is always immediately before p2.
//           Each rule either says "no break here" (continue to the next
//           position) or "break here" (leave the loop).
//
//           Returns the break position, or -1 if deferredStatus holds a failure
//           or prevPos is at or beyond the end of the text.
//
int32_t RBBISentMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 c;

    // Construction failed earlier; report "no break" rather than using the sets.
    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    // Seed the window: every position starts at prevPos and only c3 holds a
    // real character; the first loop iterations shift it into c2, c1, c0.
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;
    (void)p0;     // Suppress set but not used warning.

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format)*   Rule 4
        p3 = moveForward(p3);
        c3 = cAt(p3);

        // Rule (3)  CR x LF
        //           The p2==(p1+1) test requires the LF to directly follow the CR,
        //           with no skipped Extend/Format characters in between.
        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
            continue;
        }

        // Rule (4).   Sep  <break>
        if (fSepSet->contains(c1)) {
            p2 = p1+1;   // Separators don't combine with Extend or Format.
            break;
        }

        if (p2 >= fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        if (p2 == prevPos) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }

        // Rule (6).   ATerm x Numeric
        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (7).  Upper ATerm  x  Upper
        if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
            continue;
        }

        // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
        //           Note:  STerm | ATerm are added to the negated part of the expression by a
        //                  note to the Unicode 5.0 documents.
        //           First scan backward from p1 over Sp* then Close* looking for the ATerm;
        //           moveBack() returns -1 at the start of text and cAt(-1) is -1, which ends
        //           both scans.
        int p8 = p1;
        while (fSpSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        while (fCloseSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        if (fATermSet->contains(cAt(p8))) {
            // Then scan forward from p2 to the first character that is in one of the
            // listed classes (or to the end of text, where cAt() yields -1).
            p8=p2;
            for (;;) {
                c = cAt(p8);
                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
                    fLowerSet->contains(c) || fSepSet->contains(c) ||
                    fATermSet->contains(c) || fSTermSet->contains(c))  {
                    break;
                }
                p8 = moveForward(p8);
            }
            if (fLowerSet->contains(cAt(p8))) {
                continue;
            }
        }

        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
            p8 = p1;
            while (fSpSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            while (fCloseSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            c = cAt(p8);
            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
                continue;
            }
        }

        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
        //           CR and LF need no separate test: the constructor puts them in fSepSet.
        int p9 = p1;
        while (fCloseSet->contains(cAt(p9))) {
            p9 = moveBack(p9);
        }
        c = cAt(p9);
        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (10)  (STerm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
        int p10 = p1;
        while (fSpSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        while (fCloseSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
        //            Scanned right-to-left: at most one Sep, then Sp*, then Close*.
        int p11 = p1;
        if (fSepSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fSpSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fCloseSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
            break;
        }

        //  Rule (12)  Any x Any
        continue;
    }
    // Every "break" above leaves p2 at the position of the break.
    breakPos = p2;
    return breakPos;
}
2679
2680RBBISentMonkey::~RBBISentMonkey() {
2681    delete fSets;
2682    delete fSepSet;
2683    delete fFormatSet;
2684    delete fSpSet;
2685    delete fLowerSet;
2686    delete fUpperSet;
2687    delete fOLetterSet;
2688    delete fNumericSet;
2689    delete fATermSet;
2690    delete fSContinueSet;
2691    delete fSTermSet;
2692    delete fCloseSet;
2693    delete fOtherSet;
2694    delete fExtendSet;
2695}
2696
2697
2698
2699//-------------------------------------------------------------------------------------------
2700//
2701//  RBBILineMonkey
2702//
2703//-------------------------------------------------------------------------------------------
2704
// Line-break flavor of RBBIMonkeyKind.  Holds one UnicodeSet per Line_Break
// class (member names follow the two-letter class codes), the text being
// examined, and two helper objects used while computing breaks in next().
class RBBILineMonkey: public RBBIMonkeyKind {
public:
    RBBILineMonkey();
    virtual          ~RBBILineMonkey();
    // Returns fSets.  The vector stays owned by this object.
    virtual  UVector *charClasses();
    // Records the address of s in fText and hands s to fCharBI and fNumberMatcher.
    virtual  void     setText(const UnicodeString &s);
    // Returns the first line-break position following index i, or -1 when there is none.
    virtual  int32_t  next(int32_t i);
    // Applies line-break rules LB 9 / LB 10 (combining-mark handling) around pos;
    // posChar, nextPos and nextChar are in/out values updated for the caller.
    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
private:
    // Vector of set pointers returned by charClasses().
    UVector      *fSets;

    // One set per Line_Break class, allocated in the constructor.
    UnicodeSet  *fBK;
    UnicodeSet  *fCR;
    UnicodeSet  *fLF;
    UnicodeSet  *fCM;
    UnicodeSet  *fNL;
    UnicodeSet  *fSG;   // Built from the surrogate range [\ud800-\udfff], not from a property.
    UnicodeSet  *fWJ;
    UnicodeSet  *fZW;
    UnicodeSet  *fGL;
    UnicodeSet  *fCB;
    UnicodeSet  *fSP;
    UnicodeSet  *fB2;
    UnicodeSet  *fBA;
    UnicodeSet  *fBB;
    UnicodeSet  *fHY;
    UnicodeSet  *fH2;
    UnicodeSet  *fH3;
    UnicodeSet  *fCL;
    UnicodeSet  *fCP;
    UnicodeSet  *fEX;
    UnicodeSet  *fIN;
    UnicodeSet  *fJL;
    UnicodeSet  *fJV;
    UnicodeSet  *fJT;
    UnicodeSet  *fNS;   // The constructor also adds the contents of fCJ.
    UnicodeSet  *fOP;
    UnicodeSet  *fQU;
    UnicodeSet  *fIS;
    UnicodeSet  *fNU;
    UnicodeSet  *fPO;
    UnicodeSet  *fPR;
    UnicodeSet  *fSY;
    UnicodeSet  *fAI;
    UnicodeSet  *fAL;   // The constructor also adds the contents of fXX, fAI, fSA and fSG.
    UnicodeSet  *fCJ;
    UnicodeSet  *fHL;
    UnicodeSet  *fID;
    UnicodeSet  *fRI;
    UnicodeSet  *fSA;
    UnicodeSet  *fXX;

    // Character break iterator created in the constructor; NULL if construction failed.
    BreakIterator        *fCharBI;
    // Text under test; points at the string most recently passed to setText().
    const UnicodeString  *fText;
    // Regular expression matcher for number sequences, used by rule LB 25 in next();
    // NULL if the sets could not be built.
    RegexMatcher         *fNumberMatcher;
};
2761
2762
2763RBBILineMonkey::RBBILineMonkey()
2764{
2765    UErrorCode  status = U_ZERO_ERROR;
2766
2767    fSets  = new UVector(status);
2768
2769    fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2770    fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2771    fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2772    fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2773    fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2774    fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2775    fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2776    fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2777    fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2778    fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2779    fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2780    fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2781    fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2782    fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2783    fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2784    fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2785    fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2786    fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2787    fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2788    fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2789    fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2790    fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2791    fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2792    fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2793    fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2794    fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2795    fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2796    fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2797    fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2798    fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2799    fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2800    fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2801    fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2802    fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2803    fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2804    fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2805    fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2806    fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2807    fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2808    fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2809
2810    if (U_FAILURE(status)) {
2811        deferredStatus = status;
2812        fCharBI = NULL;
2813        fNumberMatcher = NULL;
2814        return;
2815    }
2816
2817    fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2818    fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2819    fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
2820    fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2821
2822    fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2823
2824    fSets->addElement(fBK, status);
2825    fSets->addElement(fCR, status);
2826    fSets->addElement(fLF, status);
2827    fSets->addElement(fCM, status);
2828    fSets->addElement(fNL, status);
2829    fSets->addElement(fWJ, status);
2830    fSets->addElement(fZW, status);
2831    fSets->addElement(fGL, status);
2832    fSets->addElement(fCB, status);
2833    fSets->addElement(fSP, status);
2834    fSets->addElement(fB2, status);
2835    fSets->addElement(fBA, status);
2836    fSets->addElement(fBB, status);
2837    fSets->addElement(fHY, status);
2838    fSets->addElement(fH2, status);
2839    fSets->addElement(fH3, status);
2840    fSets->addElement(fCL, status);
2841    fSets->addElement(fCP, status);
2842    fSets->addElement(fEX, status);
2843    fSets->addElement(fIN, status);
2844    fSets->addElement(fJL, status);
2845    fSets->addElement(fJT, status);
2846    fSets->addElement(fJV, status);
2847    fSets->addElement(fNS, status);
2848    fSets->addElement(fOP, status);
2849    fSets->addElement(fQU, status);
2850    fSets->addElement(fIS, status);
2851    fSets->addElement(fNU, status);
2852    fSets->addElement(fPO, status);
2853    fSets->addElement(fPR, status);
2854    fSets->addElement(fSY, status);
2855    fSets->addElement(fAI, status);
2856    fSets->addElement(fAL, status);
2857    fSets->addElement(fHL, status);
2858    fSets->addElement(fID, status);
2859    fSets->addElement(fWJ, status);
2860    fSets->addElement(fRI, status);
2861    fSets->addElement(fSA, status);
2862    fSets->addElement(fSG, status);
2863
2864    const char *rules =
2865            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2866            "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2867            "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2868            "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2869            "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
2870            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
2871
2872    fNumberMatcher = new RegexMatcher(
2873        UnicodeString(rules, -1, US_INV), 0, status);
2874
2875    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2876
2877    if (U_FAILURE(status)) {
2878        deferredStatus = status;
2879    }
2880}
2881
2882
2883void RBBILineMonkey::setText(const UnicodeString &s) {
2884    fText       = &s;
2885    fCharBI->setText(s);
2886    fNumberMatcher->reset(s);
2887}
2888
2889//
2890//  rule9Adjust
2891//     Line Break TR rules 9 and 10 implementation.
2892//     This deals with combining marks and other sequences that
2893//     that must be treated as if they were something other than what they actually are.
2894//
2895//     This is factored out into a separate function because it must be applied twice for
2896//     each potential break, once to the chars before the position being checked, then
2897//     again to the text following the possible break.
2898//
2899void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2900    if (pos == -1) {
2901        // Invalid initial position.  Happens during the warmup iteration of the
2902        //   main loop in next().
2903        return;
2904    }
2905
2906    int32_t  nPos = *nextPos;
2907
2908    // LB 9  Keep combining sequences together.
2909    //  advance over any CM class chars.  Note that Line Break CM is different
2910    //  from the normal Grapheme Extend property.
2911    if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2912          *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2913        for (;;) {
2914            *nextChar = fText->char32At(nPos);
2915            if (!fCM->contains(*nextChar)) {
2916                break;
2917            }
2918            nPos = fText->moveIndex32(nPos, 1);
2919        }
2920    }
2921
2922
2923    // LB 9 Treat X CM* as if it were x.
2924    //       No explicit action required.
2925
2926    // LB 10  Treat any remaining combining mark as AL
2927    if (fCM->contains(*posChar)) {
2928        *posChar = 0x41;   // thisChar = 'A';
2929    }
2930
2931    // Push the updated nextPos and nextChar back to our caller.
2932    // This only makes a difference if posChar got bigger by consuming a
2933    // combining sequence.
2934    *nextPos  = nPos;
2935    *nextChar = fText->char32At(nPos);
2936}
2937
2938
2939
2940int32_t RBBILineMonkey::next(int32_t startPos) {
2941    UErrorCode status = U_ZERO_ERROR;
2942    int32_t    pos;       //  Index of the char following a potential break position
2943    UChar32    thisChar;  //  Character at above position "pos"
2944
2945    int32_t    prevPos;   //  Index of the char preceding a potential break position
2946    UChar32    prevChar;  //  Character at above position.  Note that prevChar
2947                          //   and thisChar may not be adjacent because combining
2948                          //   characters between them will be ignored.
2949
2950    int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2951    UChar32    prevCharX2;
2952
2953    int32_t    nextPos;   //  Index of the next character following pos.
2954                          //     Usually skips over combining marks.
2955    int32_t    nextCPPos; //  Index of the code point following "pos."
2956                          //     May point to a combining mark.
2957    int32_t    tPos;      //  temp value.
2958    UChar32    c;
2959
2960    if (U_FAILURE(deferredStatus)) {
2961        return -1;
2962    }
2963
2964    if (startPos >= fText->length()) {
2965        return -1;
2966    }
2967
2968
2969    // Initial values for loop.  Loop will run the first time without finding breaks,
2970    //                           while the invalid values shift out and the "this" and
2971    //                           "prev" positions are filled in with good values.
2972    pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2973    thisChar = prevChar  = prevCharX2 = 0;
2974    nextPos  = nextCPPos = startPos;
2975
2976
2977    // Loop runs once per position in the test text, until a break position
2978    //  is found.
2979    for (;;) {
2980        prevPosX2 = prevPos;
2981        prevCharX2 = prevChar;
2982
2983        prevPos   = pos;
2984        prevChar  = thisChar;
2985
2986        pos       = nextPos;
2987        thisChar  = fText->char32At(pos);
2988
2989        nextCPPos = fText->moveIndex32(pos, 1);
2990        nextPos   = nextCPPos;
2991
2992        // Rule LB2 - Break at end of text.
2993        if (pos >= fText->length()) {
2994            break;
2995        }
2996
2997        // Rule LB 9 - adjust for combining sequences.
2998        //             We do this one out-of-order because the adjustment does not change anything
2999        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3000        //             be applied.
3001        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3002        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3003        c = fText->char32At(nextPos);
3004        rule9Adjust(pos,     &thisChar, &nextPos, &c);
3005
3006        // If the loop is still warming up - if we haven't shifted the initial
3007        //   -1 positions out of prevPos yet - loop back to advance the
3008        //    position in the input without any further looking for breaks.
3009        if (prevPos == -1) {
3010            continue;
3011        }
3012
3013        // LB 4  Always break after hard line breaks,
3014        if (fBK->contains(prevChar)) {
3015            break;
3016        }
3017
3018        // LB 5  Break after CR, LF, NL, but not inside CR LF
3019        if (prevChar == 0x0d && thisChar == 0x0a) {
3020            continue;
3021        }
3022        if (prevChar == 0x0d ||
3023            prevChar == 0x0a ||
3024            prevChar == 0x85)  {
3025            break;
3026        }
3027
3028        // LB 6  Don't break before hard line breaks
3029        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3030            fBK->contains(thisChar)) {
3031                continue;
3032        }
3033
3034
3035        // LB 7  Don't break before spaces or zero-width space.
3036        if (fSP->contains(thisChar)) {
3037            continue;
3038        }
3039
3040        if (fZW->contains(thisChar)) {
3041            continue;
3042        }
3043
3044        // LB 8  Break after zero width space
3045        if (fZW->contains(prevChar)) {
3046            break;
3047        }
3048
3049        // LB 9, 10  Already done, at top of loop.
3050        //
3051
3052
3053        // LB 11  Do not break before or after WORD JOINER and related characters.
3054        //    x  WJ
3055        //    WJ  x
3056        //
3057        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3058            continue;
3059        }
3060
3061        // LB 12
3062        //    GL  x
3063        if (fGL->contains(prevChar)) {
3064            continue;
3065        }
3066
3067        // LB 12a
3068        //    [^SP BA HY] x GL
3069        if (!(fSP->contains(prevChar) ||
3070              fBA->contains(prevChar) ||
3071              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3072            continue;
3073        }
3074
3075
3076
3077        // LB 13  Don't break before closings.
3078        //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3079        //        fall into LB 17 and the more general number regular expression.
3080        //
3081        if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3082            (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3083                                         fEX->contains(thisChar)  ||
3084            (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3085            (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3086            continue;
3087        }
3088
3089        // LB 14 Don't break after OP SP*
3090        //       Scan backwards, checking for this sequence.
3091        //       The OP char could include combining marks, so we actually check for
3092        //           OP CM* SP*
3093        //       Another Twist: The Rule 67 fixes may have changed a SP CM
3094        //       sequence into a ID char, so before scanning back through spaces,
3095        //       verify that prevChar is indeed a space.  The prevChar variable
3096        //       may differ from fText[prevPos]
3097        tPos = prevPos;
3098        if (fSP->contains(prevChar)) {
3099            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3100                tPos=fText->moveIndex32(tPos, -1);
3101            }
3102        }
3103        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3104            tPos=fText->moveIndex32(tPos, -1);
3105        }
3106        if (fOP->contains(fText->char32At(tPos))) {
3107            continue;
3108        }
3109
3110
3111        // LB 15    QU SP* x OP
3112        if (fOP->contains(thisChar)) {
3113            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3114            int tPos = prevPos;
3115            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3116                tPos = fText->moveIndex32(tPos, -1);
3117            }
3118            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3119                tPos = fText->moveIndex32(tPos, -1);
3120            }
3121            if (fQU->contains(fText->char32At(tPos))) {
3122                continue;
3123            }
3124        }
3125
3126
3127
3128        // LB 16   (CL | CP) SP* x NS
3129        //    Scan backwards for SP* CM* (CL | CP)
3130        if (fNS->contains(thisChar)) {
3131            int tPos = prevPos;
3132            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3133                tPos = fText->moveIndex32(tPos, -1);
3134            }
3135            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3136                tPos = fText->moveIndex32(tPos, -1);
3137            }
3138            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3139                continue;
3140            }
3141        }
3142
3143
3144        // LB 17        B2 SP* x B2
3145        if (fB2->contains(thisChar)) {
3146            //  Scan backwards, checking for the B2 CM* SP* sequence.
3147            tPos = prevPos;
3148            if (fSP->contains(prevChar)) {
3149                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3150                    tPos=fText->moveIndex32(tPos, -1);
3151                }
3152            }
3153            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3154                tPos=fText->moveIndex32(tPos, -1);
3155            }
3156            if (fB2->contains(fText->char32At(tPos))) {
3157                continue;
3158            }
3159        }
3160
3161
3162        // LB 18    break after space
3163        if (fSP->contains(prevChar)) {
3164            break;
3165        }
3166
3167        // LB 19
3168        //    x   QU
3169        //    QU  x
3170        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3171            continue;
3172        }
3173
3174        // LB 20  Break around a CB
3175        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3176            break;
3177        }
3178
3179        // LB 21
3180        if (fBA->contains(thisChar) ||
3181            fHY->contains(thisChar) ||
3182            fNS->contains(thisChar) ||
3183            fBB->contains(prevChar) )   {
3184            continue;
3185        }
3186
3187        // LB 21a
3188        //   HL (HY | BA) x
3189        if (fHL->contains(prevCharX2) &&
3190                (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3191            continue;
3192        }
3193
3194        // LB 21b
3195        //   SY x HL
3196        if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3197            continue;
3198        }
3199
3200        // LB 22
3201        if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3202            (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3203            (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3204            (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3205            (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3206            continue;
3207        }
3208
3209
3210        // LB 23    ID x PO
3211        //          AL x NU
3212        //          HL x NU
3213        //          NU x AL
3214        if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3215            (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3216            (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3217            (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3218            (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
3219            continue;
3220        }
3221
3222        // LB 24  Do not break between prefix and letters or ideographs.
3223        //        PR x ID
3224        //        PR x (AL | HL)
3225        //        PO x (AL | HL)
3226        if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3227            (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3228            (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
3229            continue;
3230        }
3231
3232
3233
3234        // LB 25    Numbers
3235        if (fNumberMatcher->lookingAt(prevPos, status)) {
3236            if (U_FAILURE(status)) {
3237                break;
3238            }
3239            // Matched a number.  But could have been just a single digit, which would
3240            //    not represent a "no break here" between prevChar and thisChar
3241            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3242            if (numEndIdx > pos) {
3243                // Number match includes at least our two chars being checked
3244                if (numEndIdx > nextPos) {
3245                    // Number match includes additional chars.  Update pos and nextPos
3246                    //   so that next loop iteration will continue at the end of the number,
3247                    //   checking for breaks between last char in number & whatever follows.
3248                    pos = nextPos = numEndIdx;
3249                    do {
3250                        pos = fText->moveIndex32(pos, -1);
3251                        thisChar = fText->char32At(pos);
3252                    } while (fCM->contains(thisChar));
3253                }
3254                continue;
3255            }
3256        }
3257
3258
3259        // LB 26 Do not break a Korean syllable.
3260        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3261                                        fJV->contains(thisChar) ||
3262                                        fH2->contains(thisChar) ||
3263                                        fH3->contains(thisChar))) {
3264                                            continue;
3265                                        }
3266
3267        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3268            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3269                continue;
3270        }
3271
3272        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3273            fJT->contains(thisChar)) {
3274                continue;
3275        }
3276
3277        // LB 27 Treat a Korean Syllable Block the same as ID.
3278        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3279            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3280            fIN->contains(thisChar)) {
3281                continue;
3282            }
3283        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3284            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3285            fPO->contains(thisChar)) {
3286                continue;
3287            }
3288        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3289            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3290                continue;
3291            }
3292
3293
3294
3295        // LB 28  Do not break between alphabetics ("at").
3296        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3297            continue;
3298        }
3299
3300        // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3301        if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3302            continue;
3303        }
3304
3305        // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3306        //          (AL | NU) x OP
3307        //          CP x (AL | NU)
3308        if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3309            continue;
3310        }
3311        if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3312            continue;
3313        }
3314
3315        // LB30a  Do not break between regional indicators.
3316        //        RI x RI
3317        if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3318            continue;
3319        }
3320
3321        // LB 31    Break everywhere else
3322        break;
3323
3324    }
3325
3326    return pos;
3327}
3328
3329
3330UVector  *RBBILineMonkey::charClasses() {
3331    return fSets;
3332}
3333
3334
3335RBBILineMonkey::~RBBILineMonkey() {
3336    delete fSets;
3337
3338    delete fBK;
3339    delete fCR;
3340    delete fLF;
3341    delete fCM;
3342    delete fNL;
3343    delete fWJ;
3344    delete fZW;
3345    delete fGL;
3346    delete fCB;
3347    delete fSP;
3348    delete fB2;
3349    delete fBA;
3350    delete fBB;
3351    delete fHY;
3352    delete fH2;
3353    delete fH3;
3354    delete fCL;
3355    delete fCP;
3356    delete fEX;
3357    delete fIN;
3358    delete fJL;
3359    delete fJV;
3360    delete fJT;
3361    delete fNS;
3362    delete fOP;
3363    delete fQU;
3364    delete fIS;
3365    delete fNU;
3366    delete fPO;
3367    delete fPR;
3368    delete fSY;
3369    delete fAI;
3370    delete fAL;
3371    delete fCJ;
3372    delete fHL;
3373    delete fID;
3374    delete fRI;
3375    delete fSA;
3376    delete fSG;
3377    delete fXX;
3378
3379    delete fCharBI;
3380    delete fNumberMatcher;
3381}
3382
3383
3384//-------------------------------------------------------------------------------------------
3385//
3386//   TestMonkey
3387//
3388//     params
3389//       seed=nnnnn        Random number starting seed.
3390//                         Setting the seed allows errors to be reproduced.
3391//       loop=nnn          Looping count.  Controls running time.
3392//                         -1:  run forever.
3393//                          0 or greater:  run length.
3394//
3395//       type = char | word | line | sent | title
3396//
3397//-------------------------------------------------------------------------------------------
3398
3399static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3400    int32_t val = defaultVal;
3401    name.append(" *= *(-?\\d+)");
3402    UErrorCode status = U_ZERO_ERROR;
3403    RegexMatcher m(name, params, 0, status);
3404    if (m.find()) {
3405        // The param exists.  Convert the string to an int.
3406        char valString[100];
3407        int32_t paramLength = m.end(1, status) - m.start(1, status);
3408        if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3409            paramLength = (int32_t)(sizeof(valString)-2);
3410        }
3411        params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3412        val = strtol(valString,  NULL, 10);
3413
3414        // Delete this parameter from the params string.
3415        m.reset();
3416        params = m.replaceFirst("", status);
3417    }
3418    U_ASSERT(U_SUCCESS(status));
3419    return val;
3420}
3421#endif
3422
3423#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3424static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3425                                    BreakIterator *bi,
3426                                    int expected[],
3427                                    int expectedcount)
3428{
3429    int count = 0;
3430    int i = 0;
3431    int forward[50];
3432    bi->setText(ustr);
3433    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3434        forward[count] = i;
3435        if (count < expectedcount && expected[count] != i) {
3436            test->errln("break forward test failed: expected %d but got %d",
3437                        expected[count], i);
3438            break;
3439        }
3440        count ++;
3441    }
3442    if (count != expectedcount) {
3443        printStringBreaks(ustr, expected, expectedcount);
3444        test->errln("break forward test failed: missed %d match",
3445                    expectedcount - count);
3446        return;
3447    }
3448    // testing boundaries
3449    for (i = 1; i < expectedcount; i ++) {
3450        int j = expected[i - 1];
3451        if (!bi->isBoundary(j)) {
3452            printStringBreaks(ustr, expected, expectedcount);
3453            test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3454            return;
3455        }
3456        for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3457            if (bi->isBoundary(j)) {
3458                printStringBreaks(ustr, expected, expectedcount);
3459                test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3460                return;
3461            }
3462        }
3463    }
3464
3465    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3466        count --;
3467        if (forward[count] != i) {
3468            printStringBreaks(ustr, expected, expectedcount);
3469            test->errln("happy break test previous() failed: expected %d but got %d",
3470                        forward[count], i);
3471            break;
3472        }
3473    }
3474    if (count != 0) {
3475        printStringBreaks(ustr, expected, expectedcount);
3476        test->errln("break test previous() failed: missed a match");
3477        return;
3478    }
3479
3480    // testing preceding
3481    for (i = 0; i < expectedcount - 1; i ++) {
3482        // int j = expected[i] + 1;
3483        int j = ustr.moveIndex32(expected[i], 1);
3484        for (; j <= expected[i + 1]; j ++) {
3485            if (bi->preceding(j) != expected[i]) {
3486                printStringBreaks(ustr, expected, expectedcount);
3487                test->errln("preceding(): Not expecting boundary at position %d", j);
3488                return;
3489            }
3490        }
3491    }
3492}
3493#endif
3494
3495void RBBITest::TestWordBreaks(void)
3496{
3497#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3498
3499    Locale        locale("en");
3500    UErrorCode    status = U_ZERO_ERROR;
3501    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3502    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3503    // Replaced any C+J characters in a row with a random sequence of characters
3504    // of the same length to make our C+J segmentation not get in the way.
3505    static const char *strlist[] =
3506    {
3507    "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3508    "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3509    "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3510    "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3511    "\\uac00\\u3588\\u009c\\u0953\\u194b",
3512    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3513    "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3514    "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3515    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3516    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3517    "\\u2027\\U000e0067\\u0a47\\u00b7",
3518    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3519    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3520    "\\u0589\\U000e006e\\u0a42\\U000104a5",
3521    "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3522    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3523    "\\u0027\\u11af\\U000e0057\\u0602",
3524    "\\U0001d7f2\\U000e007\\u0004\\u0589",
3525    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3526    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3527    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3528    "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3529    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3530    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3531    "\\u0233\\U000e0020\\u0a69\\u0d6a",
3532    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3533    "\\u18f4\\U000e0049\\u20e7\\u2027",
3534    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3535    "\\ua183\\u102d\\u0bec\\u003a",
3536    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3537    "\\u003a\\u0e57\\u0fad\\u002e",
3538    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3539    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3540    "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3541    "\\u003a\\u0664\\u00b7\\u1fba",
3542    "\\u003b\\u0027\\u00b7\\u47a3",
3543    "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3544    "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3545    "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3546    };
3547    int loop;
3548    if (U_FAILURE(status)) {
3549        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3550        return;
3551    }
3552    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3553        // printf("looping %d\n", loop);
3554        UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3555        // RBBICharMonkey monkey;
3556        RBBIWordMonkey monkey;
3557
3558        int expected[50];
3559        int expectedcount = 0;
3560
3561        monkey.setText(ustr);
3562        int i;
3563        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3564            expected[expectedcount ++] = i;
3565        }
3566
3567        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3568    }
3569    delete bi;
3570#endif
3571}
3572
3573void RBBITest::TestWordBoundary(void)
3574{
3575    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3576    Locale        locale("en");
3577    UErrorCode    status = U_ZERO_ERROR;
3578    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3579    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3580    UChar         str[50];
3581    static const char *strlist[] =
3582    {
3583    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3584    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3585    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3586    "\\u2027\\U000e0067\\u0a47\\u00b7",
3587    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3588    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3589    "\\u0589\\U000e006e\\u0a42\\U000104a5",
3590    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3591    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3592    "\\u0027\\u11af\\U000e0057\\u0602",
3593    "\\U0001d7f2\\U000e007\\u0004\\u0589",
3594    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3595    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3596    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3597    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3598    "\\U000e0065\\u302c\\u09ee\\U000e0068",
3599    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3600    "\\u0233\\U000e0020\\u0a69\\u0d6a",
3601    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3602    "\\u58f4\\U000e0049\\u20e7\\u2027",
3603    "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3604    "\\ua183\\u102d\\u0bec\\u003a",
3605    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3606    "\\u003a\\u0e57\\u0fad\\u002e",
3607    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3608    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3609    "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3610    "\\u003a\\u0664\\u00b7\\u1fba",
3611    "\\u003b\\u0027\\u00b7\\u47a3",
3612    };
3613    int loop;
3614    if (U_FAILURE(status)) {
3615        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3616        return;
3617    }
3618    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3619        // printf("looping %d\n", loop);
3620        u_unescape(strlist[loop], str, 20);
3621        UnicodeString ustr(str);
3622        int forward[50];
3623        int count = 0;
3624
3625        bi->setText(ustr);
3626        int prev = 0;
3627        int i;
3628        for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3629            forward[count ++] = i;
3630            if (i > prev) {
3631                int j;
3632                for (j = prev + 1; j < i; j ++) {
3633                    if (bi->isBoundary(j)) {
3634                        printStringBreaks(ustr, forward, count);
3635                        errln("happy boundary test failed: expected %d not a boundary",
3636                               j);
3637                        return;
3638                    }
3639                }
3640            }
3641            if (!bi->isBoundary(i)) {
3642                printStringBreaks(ustr, forward, count);
3643                errln("happy boundary test failed: expected %d a boundary",
3644                       i);
3645                return;
3646            }
3647            prev = i;
3648        }
3649    }
3650    delete bi;
3651}
3652
3653void RBBITest::TestLineBreaks(void)
3654{
3655#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3656    Locale        locale("en");
3657    UErrorCode    status = U_ZERO_ERROR;
3658    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3659    const int32_t  STRSIZE = 50;
3660    UChar         str[STRSIZE];
3661    static const char *strlist[] =
3662    {
3663     "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3664     "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3665             "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3666     "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3667             "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3668     "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3669     "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3670     "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3671     "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3672     "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3673     "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3674     "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3675     "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3676     "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3677     "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3678     "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3679     "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3680     "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3681     "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3682     "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3683     "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3684     "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3685     "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3686     "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3687     "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3688     "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3689     "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3690     "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3691     "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3692     "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3693     "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3694     "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3695     "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3696     "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3697     "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3698     "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3699     "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3700     "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3701     "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3702     "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3703     "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3704     "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3705         "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3706         "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3707         "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3708     "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3709         "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3710    };
3711    int loop;
3712    TEST_ASSERT_SUCCESS(status);
3713    if (U_FAILURE(status)) {
3714        return;
3715    }
3716    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3717        // printf("looping %d\n", loop);
3718        int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3719        if (t >= STRSIZE) {
3720            TEST_ASSERT(FALSE);
3721            continue;
3722        }
3723
3724
3725        UnicodeString ustr(str);
3726        RBBILineMonkey monkey;
3727        if (U_FAILURE(monkey.deferredStatus)) {
3728            continue;
3729        }
3730
3731        const int EXPECTEDSIZE = 50;
3732        int expected[EXPECTEDSIZE];
3733        int expectedcount = 0;
3734
3735        monkey.setText(ustr);
3736        int i;
3737        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3738            if (expectedcount >= EXPECTEDSIZE) {
3739                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3740                return;
3741            }
3742            expected[expectedcount ++] = i;
3743        }
3744
3745        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3746    }
3747    delete bi;
3748#endif
3749}
3750
3751void RBBITest::TestSentBreaks(void)
3752{
3753#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3754    Locale        locale("en");
3755    UErrorCode    status = U_ZERO_ERROR;
3756    BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3757    UChar         str[200];
3758    static const char *strlist[] =
3759    {
3760     "Now\ris\nthe\r\ntime\n\rfor\r\r",
3761     "This\n",
3762     "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3763     "\"Sentence ending with a quote.\" Bye.",
3764     "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3765     "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3766     "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3767     "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3768     "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3769     "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3770     "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3771             "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3772             "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3773             "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3774     "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3775             "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3776             "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3777             "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3778             "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3779             "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3780    };
3781    int loop;
3782    if (U_FAILURE(status)) {
3783        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3784        return;
3785    }
3786    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3787        u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3788        UnicodeString ustr(str);
3789
3790        RBBISentMonkey monkey;
3791        if (U_FAILURE(monkey.deferredStatus)) {
3792            continue;
3793        }
3794
3795        const int EXPECTEDSIZE = 50;
3796        int expected[EXPECTEDSIZE];
3797        int expectedcount = 0;
3798
3799        monkey.setText(ustr);
3800        int i;
3801        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3802            if (expectedcount >= EXPECTEDSIZE) {
3803                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3804                return;
3805            }
3806            expected[expectedcount ++] = i;
3807        }
3808
3809        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3810    }
3811    delete bi;
3812#endif
3813}
3814
3815void RBBITest::TestMonkey(char *params) {
3816#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3817
3818    UErrorCode     status    = U_ZERO_ERROR;
3819    int32_t        loopCount = 500;
3820    int32_t        seed      = 1;
3821    UnicodeString  breakType = "all";
3822    Locale         locale("en");
3823    UBool          useUText  = FALSE;
3824
3825    if (quick == FALSE) {
3826        loopCount = 10000;
3827    }
3828
3829    if (params) {
3830        UnicodeString p(params);
3831        loopCount = getIntParam("loop", p, loopCount);
3832        seed      = getIntParam("seed", p, seed);
3833
3834        RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3835        if (m.find()) {
3836            breakType = m.group(1, status);
3837            m.reset();
3838            p = m.replaceFirst("", status);
3839        }
3840
3841        RegexMatcher u(" *utext", p, 0, status);
3842        if (u.find()) {
3843            useUText = TRUE;
3844            u.reset();
3845            p = u.replaceFirst("", status);
3846        }
3847
3848
3849        // m.reset(p);
3850        if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3851            // Each option is stripped out of the option string as it is processed.
3852            // All options have been checked.  The option string should have been completely emptied..
3853            char buf[100];
3854            p.extract(buf, sizeof(buf), NULL, status);
3855            buf[sizeof(buf)-1] = 0;
3856            errln("Unrecognized or extra parameter:  %s\n", buf);
3857            return;
3858        }
3859
3860    }
3861
3862    if (breakType == "char" || breakType == "all") {
3863        RBBICharMonkey  m;
3864        BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3865        if (U_SUCCESS(status)) {
3866            RunMonkey(bi, m, "char", seed, loopCount, useUText);
3867            if (breakType == "all" && useUText==FALSE) {
3868                // Also run a quick test with UText when "all" is specified
3869                RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3870            }
3871        }
3872        else {
3873            errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3874        }
3875        delete bi;
3876    }
3877
3878    if (breakType == "word" || breakType == "all") {
3879        logln("Word Break Monkey Test");
3880        RBBIWordMonkey  m;
3881        BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3882        if (U_SUCCESS(status)) {
3883            RunMonkey(bi, m, "word", seed, loopCount, useUText);
3884        }
3885        else {
3886            errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3887        }
3888        delete bi;
3889    }
3890
3891    if (breakType == "line" || breakType == "all") {
3892        logln("Line Break Monkey Test");
3893        RBBILineMonkey  m;
3894        BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3895        if (loopCount >= 10) {
3896            loopCount = loopCount / 5;   // Line break runs slower than the others.
3897        }
3898        if (U_SUCCESS(status)) {
3899            RunMonkey(bi, m, "line", seed, loopCount, useUText);
3900        }
3901        else {
3902            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3903        }
3904        delete bi;
3905    }
3906
3907    if (breakType == "sent" || breakType == "all"  ) {
3908        logln("Sentence Break Monkey Test");
3909        RBBISentMonkey  m;
3910        BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3911        if (loopCount >= 10) {
3912            loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3913        }
3914        if (U_SUCCESS(status)) {
3915            RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3916        }
3917        else {
3918            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3919        }
3920        delete bi;
3921    }
3922
3923#endif
3924}
3925
//
//  Run a RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi            - the break iterator to use
//       mk            - MonkeyKind, abstraction for obtaining expected results
//       name          - Name of test (char, word, etc.) for use in error messages
//       seed          - Seed for starting random number generator (parameter from user)
//       numIterations - Number of randomly generated test strings to run.
//                       -1 means run forever.
//       useUText      - TRUE when the UText variant of the test was requested
//                       (the "utext" option of TestMonkey).
//
3935void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3936                         int32_t numIterations, UBool useUText) {
3937
3938#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3939
3940    const int32_t    TESTSTRINGLEN = 500;
3941    UnicodeString    testText;
3942    int32_t          numCharClasses;
3943    UVector          *chClasses;
3944    int              expected[TESTSTRINGLEN*2 + 1];
3945    int              expectedCount = 0;
3946    char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3947    char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3948    char             reverseBreaks[TESTSTRINGLEN*2+1];
3949    char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3950    char             followingBreaks[TESTSTRINGLEN*2+1];
3951    char             precedingBreaks[TESTSTRINGLEN*2+1];
3952    int              i;
3953    int              loopCount = 0;
3954
3955    m_seed = seed;
3956
3957    numCharClasses = mk.charClasses()->size();
3958    chClasses      = mk.charClasses();
3959
3960    // Check for errors that occured during the construction of the MonkeyKind object.
3961    //  Can't report them where they occured because errln() is a method coming from intlTest,
3962    //  and is not visible outside of RBBITest :-(
3963    if (U_FAILURE(mk.deferredStatus)) {
3964        errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3965        return;
3966    }
3967
3968    // Verify that the character classes all have at least one member.
3969    for (i=0; i<numCharClasses; i++) {
3970        UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3971        if (s == NULL || s->size() == 0) {
3972            errln("Character Class #%d is null or of zero size.", i);
3973            return;
3974        }
3975    }
3976
3977    while (loopCount < numIterations || numIterations == -1) {
3978        if (numIterations == -1 && loopCount % 10 == 0) {
3979            // If test is running in an infinite loop, display a periodic tic so
3980            //   we can tell that it is making progress.
3981            fprintf(stderr, ".");
3982        }
3983        // Save current random number seed, so that we can recreate the random numbers
3984        //   for this loop iteration in event of an error.
3985        seed = m_seed;
3986
3987        // Populate a test string with data.
3988        testText.truncate(0);
3989        for (i=0; i<TESTSTRINGLEN; i++) {
3990            int32_t  aClassNum = m_rand() % numCharClasses;
3991            UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3992            int32_t   charIdx = m_rand() % classSet->size();
3993            UChar32   c = classSet->charAt(charIdx);
3994            if (c < 0) {   // TODO:  deal with sets containing strings.
3995                errln("c < 0");
3996                break;
3997            }
3998            testText.append(c);
3999        }
4000
4001        // Calculate the expected results for this test string.
4002        mk.setText(testText);
4003        memset(expectedBreaks, 0, sizeof(expectedBreaks));
4004        expectedBreaks[0] = 1;
4005        int32_t breakPos = 0;
4006        expectedCount = 0;
4007        for (;;) {
4008            breakPos = mk.next(breakPos);
4009            if (breakPos == -1) {
4010                break;
4011            }
4012            if (breakPos > testText.length()) {
4013                errln("breakPos > testText.length()");
4014            }
4015            expectedBreaks[breakPos] = 1;
4016            U_ASSERT(expectedCount<testText.length());
4017            expected[expectedCount ++] = breakPos;
4018            (void)expected;   // Set but not used warning.
4019                              // TODO (andy): check it out.
4020        }
4021
4022        // Find the break positions using forward iteration
4023        memset(forwardBreaks, 0, sizeof(forwardBreaks));
4024        if (useUText) {
4025            UErrorCode status = U_ZERO_ERROR;
4026            UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4027            // testUText = utext_openUnicodeString(testUText, &testText, &status);
4028            bi->setText(testUText, status);
4029            TEST_ASSERT_SUCCESS(status);
4030            utext_close(testUText);   // The break iterator does a shallow clone of the UText
4031                                      //  This UText can be closed immediately, so long as the
4032                                      //  testText string continues to exist.
4033        } else {
4034            bi->setText(testText);
4035        }
4036
4037        for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4038            if (i < 0 || i > testText.length()) {
4039                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4040                break;
4041            }
4042            forwardBreaks[i] = 1;
4043        }
4044
4045        // Find the break positions using reverse iteration
4046        memset(reverseBreaks, 0, sizeof(reverseBreaks));
4047        for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4048            if (i < 0 || i > testText.length()) {
4049                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4050                break;
4051            }
4052            reverseBreaks[i] = 1;
4053        }
4054
4055        // Find the break positions using isBoundary() tests.
4056        memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4057        U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4058        for (i=0; i<=testText.length(); i++) {
4059            isBoundaryBreaks[i] = bi->isBoundary(i);
4060        }
4061
4062
4063        // Find the break positions using the following() function.
4064        // printf(".");
4065        memset(followingBreaks, 0, sizeof(followingBreaks));
4066        int32_t   lastBreakPos = 0;
4067        followingBreaks[0] = 1;
4068        for (i=0; i<testText.length(); i++) {
4069            breakPos = bi->following(i);
4070            if (breakPos <= i ||
4071                breakPos < lastBreakPos ||
4072                breakPos > testText.length() ||
4073                (breakPos > lastBreakPos && lastBreakPos > i)) {
4074                UChar32 brkChar = testText.char32At(lastBreakPos);
4075                if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4076                errln("%s break monkey test: "
4077                    "Out of range value returned by BreakIterator::following().\n"
4078                        "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4079                         name, seed, i, breakPos, lastBreakPos);
4080                }
4081                break;
4082            }
4083            followingBreaks[breakPos] = 1;
4084            lastBreakPos = breakPos;
4085        }
4086
4087        // Find the break positions using the preceding() function.
4088        memset(precedingBreaks, 0, sizeof(precedingBreaks));
4089        lastBreakPos = testText.length();
4090        precedingBreaks[testText.length()] = 1;
4091        for (i=testText.length(); i>0; i--) {
4092            breakPos = bi->preceding(i);
4093            if (breakPos >= i ||
4094                breakPos > lastBreakPos ||
4095                (breakPos < 0 && testText.getChar32Start(i)>0) ||
4096                (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4097                UChar32 brkChar = testText.char32At(breakPos);
4098                if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4099                errln("%s break monkey test: "
4100                    "Out of range value returned by BreakIterator::preceding().\n"
4101                    "index=%d;  prev returned %d; lastBreak=%d" ,
4102                    name,  i, breakPos, lastBreakPos);
4103                if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4104                    precedingBreaks[i] = 2;   // Forces an error.
4105                }
4106                }
4107            } else {
4108                if (breakPos >= 0) {
4109                    precedingBreaks[breakPos] = 1;
4110                }
4111                lastBreakPos = breakPos;
4112            }
4113        }
4114
4115        // Compare the expected and actual results.
4116        for (i=0; i<=testText.length(); i++) {
4117            const char *errorType = NULL;
4118            if  (forwardBreaks[i] != expectedBreaks[i]) {
4119                errorType = "next()";
4120            } else if (reverseBreaks[i] != forwardBreaks[i]) {
4121                errorType = "previous()";
4122            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4123                errorType = "isBoundary()";
4124            } else if (followingBreaks[i] != expectedBreaks[i]) {
4125                errorType = "following()";
4126            } else if (precedingBreaks[i] != expectedBreaks[i]) {
4127                errorType = "preceding()";
4128            }
4129
4130
4131            if (errorType != NULL) {
4132                // Format a range of the test text that includes the failure as
4133                //  a data item that can be included in the rbbi test data file.
4134
4135                // Start of the range is the last point where expected and actual results
4136                //   both agreed that there was a break position.
4137                int startContext = i;
4138                int32_t count = 0;
4139                for (;;) {
4140                    if (startContext==0) { break; }
4141                    startContext --;
4142                    if (expectedBreaks[startContext] != 0) {
4143                        if (count == 2) break;
4144                        count ++;
4145                    }
4146                }
4147
4148                // End of range is two expected breaks past the start position.
4149                int endContext = i + 1;
4150                int ci;
4151                for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4152                    for (;;) {
4153                        if (endContext >= testText.length()) {break;}
4154                        if (expectedBreaks[endContext-1] != 0) {
4155                            if (count == 0) break;
4156                            count --;
4157                        }
4158                        endContext ++;
4159                    }
4160                }
4161
4162                // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4163                UnicodeString errorText = "<data>";
4164                /***if (strcmp(errorType, "next()") == 0) {
4165                    startContext = 0;
4166                    endContext = testText.length();
4167
4168                    printStringBreaks(testText, expected, expectedCount);
4169                }***/
4170
4171                for (ci=startContext; ci<endContext;) {
4172                    UnicodeString hexChars("0123456789abcdef");
4173                    UChar32  c;
4174                    int      bn;
4175                    c = testText.char32At(ci);
4176                    if (ci == i) {
4177                        // This is the location of the error.
4178                        errorText.append("<?>");
4179                    } else if (expectedBreaks[ci] != 0) {
4180                        // This a non-error expected break position.
4181                        errorText.append("\\");
4182                    }
4183                    if (c < 0x10000) {
4184                        errorText.append("\\u");
4185                        for (bn=12; bn>=0; bn-=4) {
4186                            errorText.append(hexChars.charAt((c>>bn)&0xf));
4187                        }
4188                    } else {
4189                        errorText.append("\\U");
4190                        for (bn=28; bn>=0; bn-=4) {
4191                            errorText.append(hexChars.charAt((c>>bn)&0xf));
4192                        }
4193                    }
4194                    ci = testText.moveIndex32(ci, 1);
4195                }
4196                errorText.append("\\");
4197                errorText.append("</data>\n");
4198
4199                // Output the error
4200                char  charErrorTxt[500];
4201                UErrorCode status = U_ZERO_ERROR;
4202                errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4203                charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4204                const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4205
4206                UChar32 brkChar = testText.char32At(i);
4207                if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4208                errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4209                    name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4210                    errorType, seed, i, charErrorTxt);
4211                }
4212                break;
4213            }
4214        }
4215
4216        loopCount++;
4217    }
4218#endif
4219}
4220
4221
4222//  Bug 5532.  UTF-8 based UText fails in dictionary code.
4223//             This test checks the initial patch,
4224//             which is to just keep it from crashing.  Correct word boundaries
4225//             await a proper fix to the dictionary code.
4226//
4227void RBBITest::TestBug5532(void)  {
4228   // Text includes a mixture of Thai and Latin.
4229   const unsigned char utf8Data[] = {
4230           0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4231           0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4232           0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4233           0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4234           0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4235           0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4236           0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4237           0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4238           0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4239           0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4240           0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4241
4242    UErrorCode status = U_ZERO_ERROR;
4243    UText utext=UTEXT_INITIALIZER;
4244    utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4245    TEST_ASSERT_SUCCESS(status);
4246
4247    BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4248    TEST_ASSERT_SUCCESS(status);
4249    if (U_SUCCESS(status)) {
4250        bi->setText(&utext, status);
4251        TEST_ASSERT_SUCCESS(status);
4252
4253        int32_t breakCount = 0;
4254        int32_t previousBreak = -1;
4255        for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4256            // For now, just make sure that the break iterator doesn't hang.
4257            TEST_ASSERT(previousBreak < bi->current());
4258            previousBreak = bi->current();
4259        }
4260        TEST_ASSERT(breakCount > 0);
4261    }
4262    delete bi;
4263    utext_close(&utext);
4264}
4265
4266
4267void RBBITest::TestBug9983(void)  {
4268    UnicodeString text = UnicodeString("\\u002A"  // * Other
4269                                       "\\uFF65"  //   Other
4270                                       "\\u309C"  //   Katakana
4271                                       "\\uFF9F"  //   Extend
4272                                       "\\uFF65"  //   Other
4273                                       "\\u0020"  //   Other
4274                                       "\\u0000").unescape();
4275
4276    UErrorCode status = U_ZERO_ERROR;
4277    LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4278        BreakIterator::createWordInstance(Locale::getRoot(), status)));
4279    TEST_ASSERT_SUCCESS(status);
4280    LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4281        BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4282    TEST_ASSERT_SUCCESS(status);
4283    if (U_FAILURE(status)) {
4284        return;
4285    }
4286    int32_t offset, rstatus, iterationCount;
4287
4288    brkiter->setText(text);
4289    brkiter->last();
4290    iterationCount = 0;
4291    while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4292        iterationCount++;
4293        rstatus = brkiter->getRuleStatus();
4294        (void)rstatus;     // Suppress set but not used warning.
4295        if (iterationCount >= 10) {
4296           break;
4297        }
4298    }
4299    TEST_ASSERT(iterationCount == 6);
4300
4301    brkiterPOSIX->setText(text);
4302    brkiterPOSIX->last();
4303    iterationCount = 0;
4304    while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4305        iterationCount++;
4306        rstatus = brkiterPOSIX->getRuleStatus();
4307        (void)rstatus;     // Suppress set but not used warning.
4308        if (iterationCount >= 10) {
4309           break;
4310        }
4311    }
4312    TEST_ASSERT(iterationCount == 6);
4313}
4314
4315
//
//  TestDebug    -  A place-holder test for debugging purposes.
//                  For putting in fragments of other tests that can be invoked
//                  for tracing  without a lot of unwanted extra stuff happening.
//
//                  The whole body is currently compiled out with #if 0, so the
//                  test does nothing when it runs.
//
void RBBITest::TestDebug(void) {
#if 0
    // Disabled scratch code: builds a sentence break iterator for the default
    // locale and prints the result of isBoundary(8) for a fixed test string.
    UErrorCode   status = U_ZERO_ERROR;
    int pos = 0;
    int ruleStatus = 0;

    RuleBasedBreakIterator* bi =
       // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
       // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
       (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
    UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
    // UnicodeString s("Aaa.  Bcd");
    s = s.unescape();
    bi->setText(s);
    UBool r = bi->isBoundary(8);
    printf("%s", r?"true":"false");
    return;
    // Everything below the return above is unreachable even when the block is
    // enabled; it is a reverse walk that prints each position and ruleStatus.
    pos = bi->last();
    do {
        // ruleStatus = bi->getRuleStatus();
        printf("%d\t%d\n", pos, ruleStatus);
        pos = bi->previous();
    } while (pos != BreakIterator::DONE);
#endif
}
4346
4347void RBBITest::TestProperties() {
4348    UErrorCode errorCode = U_ZERO_ERROR;
4349    UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4350    if (!prependSet.isEmpty()) {
4351        errln(
4352            "[:GCB=Prepend:] is not empty any more. "
4353            "Uncomment relevant lines in source/data/brkitr/char.txt and "
4354            "change this test to the opposite condition.");
4355    }
4356}
4357
4358#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4359