1/******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1999-2013, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6/************************************************************************ 7* Date Name Description 8* 12/15/99 Madhu Creation. 9* 01/12/2000 Madhu Updated for changed API and added new tests 10************************************************************************/ 11 12#include "utypeinfo.h" // for 'typeid' to work 13 14#include "unicode/utypes.h" 15 16#if !UCONFIG_NO_BREAK_ITERATION 17 18#include "unicode/utypes.h" 19#include "unicode/brkiter.h" 20#include "unicode/rbbi.h" 21#include "unicode/uchar.h" 22#include "unicode/utf16.h" 23#include "unicode/ucnv.h" 24#include "unicode/schriter.h" 25#include "unicode/uniset.h" 26#if !UCONFIG_NO_REGULAR_EXPRESSIONS 27#include "unicode/regex.h" 28#endif 29#include "unicode/ustring.h" 30#include "unicode/utext.h" 31#include "intltest.h" 32#include "rbbitst.h" 33#include <string.h> 34#include "uvector.h" 35#include "uvectr32.h" 36#include <string.h> 37#include <stdio.h> 38#include <stdlib.h> 39#include "unicode/numfmt.h" 40#include "unicode/uscript.h" 41 42#define TEST_ASSERT(x) {if (!(x)) { \ 43 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 44 45#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 46 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 47 48 49//--------------------------------------------- 50// runIndexedTest 51//--------------------------------------------- 52 53 54// Note: Before adding new tests to this file, check whether the desired test data can 55// simply be added to the file testdata/rbbitest.txt. 
In most cases it can, 56// it's much less work than writing a new test, diagnostic output in the event of failures 57// is good, and the test data file will is shared with ICU4J, so eventually the test 58// will run there as well, without additional effort. 59 60void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 61{ 62 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 63 64 switch (index) { 65#if !UCONFIG_NO_FILE_IO 66 case 0: name = "TestBug4153072"; 67 if(exec) TestBug4153072(); break; 68#else 69 case 0: name = "skip"; 70 break; 71#endif 72 73 case 1: name = "skip"; 74 break; 75 case 2: name = "TestStatusReturn"; 76 if(exec) TestStatusReturn(); break; 77 78#if !UCONFIG_NO_FILE_IO 79 case 3: name = "TestUnicodeFiles"; 80 if(exec) TestUnicodeFiles(); break; 81 case 4: name = "TestEmptyString"; 82 if(exec) TestEmptyString(); break; 83#else 84 case 3: case 4: name = "skip"; 85 break; 86#endif 87 88 case 5: name = "TestGetAvailableLocales"; 89 if(exec) TestGetAvailableLocales(); break; 90 91 case 6: name = "TestGetDisplayName"; 92 if(exec) TestGetDisplayName(); break; 93 94#if !UCONFIG_NO_FILE_IO 95 case 7: name = "TestEndBehaviour"; 96 if(exec) TestEndBehaviour(); break; 97 case 8: case 9: case 10: name = "skip"; 98 break; 99 case 11: name = "TestWordBreaks"; 100 if(exec) TestWordBreaks(); break; 101 case 12: name = "TestWordBoundary"; 102 if(exec) TestWordBoundary(); break; 103 case 13: name = "TestLineBreaks"; 104 if(exec) TestLineBreaks(); break; 105 case 14: name = "TestSentBreaks"; 106 if(exec) TestSentBreaks(); break; 107 case 15: name = "TestExtended"; 108 if(exec) TestExtended(); break; 109#else 110 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip"; 111 break; 112#endif 113 114#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 115 case 16: 116 name = "TestMonkey"; if(exec) TestMonkey(params); break; 117#else 118 case 16: 119 name = "skip"; break; 120#endif 121 
122#if !UCONFIG_NO_FILE_IO 123 case 17: name = "TestBug3818"; 124 if(exec) TestBug3818(); break; 125#else 126 case 17: name = "skip"; 127 break; 128#endif 129 130 case 18: name = "skip"; 131 break; 132 case 19: name = "TestDebug"; 133 if(exec) TestDebug(); break; 134 case 20: name = "skip"; 135 break; 136 137#if !UCONFIG_NO_FILE_IO 138 case 21: name = "TestBug5775"; 139 if (exec) TestBug5775(); break; 140#else 141 case 21: name = "skip"; 142 break; 143#endif 144 145 case 22: name = "TestBug9983"; 146 if (exec) TestBug9983(); break; 147 case 23: name = "TestDictRules"; 148 if (exec) TestDictRules(); break; 149 case 24: name = "TestBug5532"; 150 if (exec) TestBug5532(); break; 151 default: name = ""; break; //needed to end loop 152 } 153} 154 155 156//--------------------------------------------------------------------------- 157// 158// class BITestData Holds a set of Break iterator test data and results 159// Includes 160// - the string data to be broken 161// - a vector of the expected break positions. 162// - a vector of source line numbers for the data, 163// (to help see where errors occured.) 164// - The expected break tag values. 165// - Vectors of actual break positions and tag values. 166// - Functions for comparing actual with expected and 167// reporting errors. 168// 169//---------------------------------------------------------------------------- 170class BITestData { 171public: 172 UnicodeString fDataToBreak; 173 UVector fExpectedBreakPositions; 174 UVector fExpectedTags; 175 UVector fLineNum; 176 UVector fActualBreakPositions; // Test Results. 177 UVector fActualTags; 178 179 BITestData(UErrorCode &status); 180 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); 181 void checkResults(const char *heading, RBBITest *test); 182 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); 183 void clearResults(); 184}; 185 186// 187// Constructor. 
188// 189BITestData::BITestData(UErrorCode &status) 190: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), 191 fActualTags(status) 192{ 193} 194 195// 196// addDataChunk. Add a section (non-breaking) piece if data to the test data. 197// The macro form collects the line number, which is helpful 198// when tracking down failures. 199// 200// A null data item is inserted at the start of each test's data 201// to put the starting zero into the data list. The position saved for 202// each non-null item is its ending position. 203// 204#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); 205void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 206 if (U_FAILURE(status)) {return;} 207 if (data != NULL) { 208 fDataToBreak.append(CharsToUnicodeString(data)); 209 } 210 fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 211 fExpectedTags.addElement(tag, status); 212 fLineNum.addElement(lineNum, status); 213} 214 215 216// 217// checkResults. Compare the actual and expected break positions, report any differences. 218// 219void BITestData::checkResults(const char *heading, RBBITest *test) { 220 int32_t expectedIndex = 0; 221 int32_t actualIndex = 0; 222 223 for (;;) { 224 // If we've run through both the expected and actual results vectors, we're done. 225 // break out of the loop. 
226 if (expectedIndex >= fExpectedBreakPositions.size() && 227 actualIndex >= fActualBreakPositions.size()) { 228 break; 229 } 230 231 232 if (expectedIndex >= fExpectedBreakPositions.size()) { 233 err(heading, test, expectedIndex-1, actualIndex); 234 actualIndex++; 235 continue; 236 } 237 238 if (actualIndex >= fActualBreakPositions.size()) { 239 err(heading, test, expectedIndex, actualIndex-1); 240 expectedIndex++; 241 continue; 242 } 243 244 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 245 err(heading, test, expectedIndex, actualIndex); 246 // Try to resync the positions of the indices, to avoid a rash of spurious erros. 247 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 248 actualIndex++; 249 } else { 250 expectedIndex++; 251 } 252 continue; 253 } 254 255 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 256 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d", 257 heading, fLineNum.elementAt(expectedIndex), 258 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 259 } 260 261 actualIndex++; 262 expectedIndex++; 263 } 264} 265 266// 267// err - An error was found. Report it, along with information about where the 268// incorrectly broken test data appeared in the source file. 269// 270void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 271{ 272 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 273 int32_t actual = fActualBreakPositions.elementAti(actualIdx); 274 int32_t o = 0; 275 int32_t line = fLineNum.elementAti(expectedIdx); 276 if (expectedIdx > 0) { 277 // The line numbers are off by one because a premature break occurs somewhere 278 // within the previous item, rather than at the start of the current (expected) item. 
279 // We want to report the offset of the unexpected break from the start of 280 // this previous item. 281 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 282 } 283 if (actual < expected) { 284 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); 285 } else { 286 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected); 287 } 288} 289 290 291void BITestData::clearResults() { 292 fActualBreakPositions.removeAllElements(); 293 fActualTags.removeAllElements(); 294} 295 296 297//-------------------------------------------------------------------------------------- 298// 299// RBBITest constructor and destructor 300// 301//-------------------------------------------------------------------------------------- 302 303RBBITest::RBBITest() { 304} 305 306 307RBBITest::~RBBITest() { 308} 309 310//----------------------------------------------------------------------------------- 311// 312// Test for status {tag} return value from break rules. 313// TODO: a more thorough test. 
314// 315//----------------------------------------------------------------------------------- 316void RBBITest::TestStatusReturn() { 317 UnicodeString rulesString1("$Letters = [:L:];\n" 318 "$Numbers = [:N:];\n" 319 "$Letters+{1};\n" 320 "$Numbers+{2};\n" 321 "Help\\ {4}/me\\!;\n" 322 "[^$Letters $Numbers];\n" 323 "!.*;\n", -1, US_INV); 324 UnicodeString testString1 = "abc123..abc Help me Help me!"; 325 // 01234567890123456789012345678 326 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 327 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; 328 329 UErrorCode status=U_ZERO_ERROR; 330 UParseError parseError; 331 332 BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 333 if(U_FAILURE(status)) { 334 dataerrln("FAIL : in construction - %s", u_errorName(status)); 335 } else { 336 int32_t pos; 337 int32_t i = 0; 338 bi->setText(testString1); 339 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { 340 if (pos != bounds1[i]) { 341 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); 342 break; 343 } 344 345 int tag = bi->getRuleStatus(); 346 if (tag != brkStatus[i]) { 347 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); 348 break; 349 } 350 i++; 351 } 352 } 353 delete bi; 354} 355 356 357static void printStringBreaks(UnicodeString ustr, int expected[], 358 int expectedcount) 359{ 360 UErrorCode status = U_ZERO_ERROR; 361 char name[100]; 362 printf("code alpha extend alphanum type word sent line name\n"); 363 int j; 364 for (j = 0; j < ustr.length(); j ++) { 365 if (expectedcount > 0) { 366 int k; 367 for (k = 0; k < expectedcount; k ++) { 368 if (j == expected[k]) { 369 printf("------------------------------------------------ %d\n", 370 j); 371 } 372 } 373 } 374 UChar32 c = ustr.char32At(j); 375 if (c > 0xffff) { 376 j ++; 377 } 378 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 379 printf("%7x %5d %6d %8d %4s %4s %4s %4s 
%s\n", (int)c, 380 u_isUAlphabetic(c), 381 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 382 u_isalnum(c), 383 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 384 u_charType(c), 385 U_SHORT_PROPERTY_NAME), 386 u_getPropertyValueName(UCHAR_WORD_BREAK, 387 u_getIntPropertyValue(c, 388 UCHAR_WORD_BREAK), 389 U_SHORT_PROPERTY_NAME), 390 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 391 u_getIntPropertyValue(c, 392 UCHAR_SENTENCE_BREAK), 393 U_SHORT_PROPERTY_NAME), 394 u_getPropertyValueName(UCHAR_LINE_BREAK, 395 u_getIntPropertyValue(c, 396 UCHAR_LINE_BREAK), 397 U_SHORT_PROPERTY_NAME), 398 name); 399 } 400} 401 402 403void RBBITest::TestBug3818() { 404 UErrorCode status = U_ZERO_ERROR; 405 406 // Four Thai words... 407 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 408 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 409 UnicodeString thaiStr(thaiWordData); 410 411 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status); 412 if (U_FAILURE(status) || bi == NULL) { 413 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 414 return; 415 } 416 bi->setText(thaiStr); 417 418 int32_t startOfSecondWord = bi->following(1); 419 if (startOfSecondWord != 4) { 420 errln("Fail at file %s, line %d expected start of word at 4, got %d", 421 __FILE__, __LINE__, startOfSecondWord); 422 } 423 startOfSecondWord = bi->following(0); 424 if (startOfSecondWord != 4) { 425 errln("Fail at file %s, line %d expected start of word at 4, got %d", 426 __FILE__, __LINE__, startOfSecondWord); 427 } 428 delete bi; 429} 430 431//---------------------------------------------------------------------------- 432// 433// generalIteratorTest Given a break iterator and a set of test data, 434// Run the tests and report the results. 
435// 436//---------------------------------------------------------------------------- 437void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 438{ 439 440 bi.setText(td.fDataToBreak); 441 442 testFirstAndNext(bi, td); 443 444 testLastAndPrevious(bi, td); 445 446 testFollowing(bi, td); 447 testPreceding(bi, td); 448 testIsBoundary(bi, td); 449 doMultipleSelectionTest(bi, td); 450} 451 452 453// 454// testFirstAndNext. Run the iterator forwards in the obvious first(), next() 455// kind of loop. 456// 457void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 458{ 459 UErrorCode status = U_ZERO_ERROR; 460 int32_t p; 461 int32_t lastP = -1; 462 int32_t tag; 463 464 logln("Test first and next"); 465 bi.setText(td.fDataToBreak); 466 td.clearResults(); 467 468 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 469 td.fActualBreakPositions.addElement(p, status); // Save result. 470 tag = bi.getRuleStatus(); 471 td.fActualTags.addElement(tag, status); 472 if (p <= lastP) { 473 // If the iterator is not making forward progress, stop. 474 // No need to raise an error here, it'll be detected in the normal check of results. 475 break; 476 } 477 lastP = p; 478 } 479 td.checkResults("testFirstAndNext", this); 480} 481 482 483// 484// TestLastAndPrevious. Run the iterator backwards, starting with last(). 485// 486void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 487{ 488 UErrorCode status = U_ZERO_ERROR; 489 int32_t p; 490 int32_t lastP = 0x7ffffffe; 491 int32_t tag; 492 493 logln("Test last and previous"); 494 bi.setText(td.fDataToBreak); 495 td.clearResults(); 496 497 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 498 // Save break position. Insert it at start of vector of results, shoving 499 // already-saved results further towards the end. 500 td.fActualBreakPositions.insertElementAt(p, 0, status); 501 // bi.previous(); // TODO: Why does this fix things up???? 
502 // bi.next(); 503 tag = bi.getRuleStatus(); 504 td.fActualTags.insertElementAt(tag, 0, status); 505 if (p >= lastP) { 506 // If the iterator is not making progress, stop. 507 // No need to raise an error here, it'll be detected in the normal check of results. 508 break; 509 } 510 lastP = p; 511 } 512 td.checkResults("testLastAndPrevious", this); 513} 514 515 516void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 517{ 518 UErrorCode status = U_ZERO_ERROR; 519 int32_t p; 520 int32_t tag; 521 int32_t lastP = -2; // A value that will never be returned as a break position. 522 // cannot be -1; that is returned for DONE. 523 int i; 524 525 logln("testFollowing():"); 526 bi.setText(td.fDataToBreak); 527 td.clearResults(); 528 529 // Save the starting point, since we won't get that out of following. 530 p = bi.first(); 531 td.fActualBreakPositions.addElement(p, status); // Save result. 532 tag = bi.getRuleStatus(); 533 td.fActualTags.addElement(tag, status); 534 535 for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 536 p = bi.following(i); 537 if (p != lastP) { 538 if (p == RuleBasedBreakIterator::DONE) { 539 break; 540 } 541 // We've reached a new break position. Save it. 542 td.fActualBreakPositions.addElement(p, status); // Save result. 543 tag = bi.getRuleStatus(); 544 td.fActualTags.addElement(tag, status); 545 lastP = p; 546 } 547 } 548 // The loop normally exits by means of the break in the middle. 549 // Make sure that the index was at the correct position for the break iterator to have 550 // returned DONE. 551 if (i != td.fDataToBreak.length()) { 552 errln("testFollowing(): iterator returned DONE prematurely."); 553 } 554 555 // Full check of all results. 
556 td.checkResults("testFollowing", this); 557} 558 559 560 561void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 562 UErrorCode status = U_ZERO_ERROR; 563 int32_t p; 564 int32_t tag; 565 int32_t lastP = 0x7ffffffe; 566 int i; 567 568 logln("testPreceding():"); 569 bi.setText(td.fDataToBreak); 570 td.clearResults(); 571 572 p = bi.last(); 573 td.fActualBreakPositions.addElement(p, status); 574 tag = bi.getRuleStatus(); 575 td.fActualTags.addElement(tag, status); 576 577 for (i = td.fDataToBreak.length(); i>=-1; i--) { 578 p = bi.preceding(i); 579 if (p != lastP) { 580 if (p == RuleBasedBreakIterator::DONE) { 581 break; 582 } 583 // We've reached a new break position. Save it. 584 td.fActualBreakPositions.insertElementAt(p, 0, status); 585 lastP = p; 586 tag = bi.getRuleStatus(); 587 td.fActualTags.insertElementAt(tag, 0, status); 588 } 589 } 590 // The loop normally exits by means of the break in the middle. 591 // Make sure that the index was at the correct position for the break iterator to have 592 // returned DONE. 593 if (i != 0) { 594 errln("testPreceding(): iterator returned DONE prematurely."); 595 } 596 597 // Full check of all results. 598 td.checkResults("testPreceding", this); 599} 600 601 602 603void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 604 UErrorCode status = U_ZERO_ERROR; 605 int i; 606 int32_t tag; 607 608 logln("testIsBoundary():"); 609 bi.setText(td.fDataToBreak); 610 td.clearResults(); 611 612 for (i = 0; i <= td.fDataToBreak.length(); i++) { 613 if (bi.isBoundary(i)) { 614 td.fActualBreakPositions.addElement(i, status); // Save result. 
615 tag = bi.getRuleStatus(); 616 td.fActualTags.addElement(tag, status); 617 } 618 } 619 td.checkResults("testIsBoundary: ", this); 620} 621 622 623 624void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 625{ 626 iterator.setText(td.fDataToBreak); 627 628 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 629 int32_t offset = iterator.first(); 630 int32_t testOffset; 631 int32_t count = 0; 632 633 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 634 635 if (*testIterator != iterator) 636 errln("clone() or operator!= failed: two clones compared unequal"); 637 638 do { 639 testOffset = testIterator->first(); 640 testOffset = testIterator->next(count); 641 if (offset != testOffset) 642 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 643 644 if (offset != RuleBasedBreakIterator::DONE) { 645 count++; 646 offset = iterator.next(); 647 648 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 649 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 650 if (count > 10000 || offset == -1) { 651 errln("operator== failed too many times. Stopping test."); 652 if (offset == -1) { 653 errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 654 } 655 return; 656 } 657 } 658 } 659 } while (offset != RuleBasedBreakIterator::DONE); 660 661 // now do it backwards... 
662 offset = iterator.last(); 663 count = 0; 664 665 do { 666 testOffset = testIterator->last(); 667 testOffset = testIterator->next(count); // next() with a negative arg is same as previous 668 if (offset != testOffset) 669 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 670 671 if (offset != RuleBasedBreakIterator::DONE) { 672 count--; 673 offset = iterator.previous(); 674 } 675 } while (offset != RuleBasedBreakIterator::DONE); 676 677 delete testIterator; 678} 679 680 681//--------------------------------------------- 682// 683// other tests 684// 685//--------------------------------------------- 686void RBBITest::TestEmptyString() 687{ 688 UnicodeString text = ""; 689 UErrorCode status = U_ZERO_ERROR; 690 691 BITestData x(status); 692 ADD_DATACHUNK(x, "", 0, status); // Break at start of data 693 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 694 if (U_FAILURE(status)) 695 { 696 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 697 return; 698 } 699 generalIteratorTest(*bi, x); 700 delete bi; 701} 702 703void RBBITest::TestGetAvailableLocales() 704{ 705 int32_t locCount = 0; 706 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 707 708 if (locCount == 0) 709 dataerrln("getAvailableLocales() returned an empty list!"); 710 // Just make sure that it's returning good memory. 
711 int32_t i; 712 for (i = 0; i < locCount; ++i) { 713 logln(locList[i].getName()); 714 } 715} 716 717//Testing the BreakIterator::getDisplayName() function 718void RBBITest::TestGetDisplayName() 719{ 720 UnicodeString result; 721 722 BreakIterator::getDisplayName(Locale::getUS(), result); 723 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 724 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 725 + result); 726 727 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 728 if (result != "French (France)") 729 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 730 + result); 731} 732/** 733 * Test End Behaviour 734 * @bug 4068137 735 */ 736void RBBITest::TestEndBehaviour() 737{ 738 UErrorCode status = U_ZERO_ERROR; 739 UnicodeString testString("boo."); 740 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 741 if (U_FAILURE(status)) 742 { 743 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. 
- %s", u_errorName(status)); 744 return; 745 } 746 wb->setText(testString); 747 748 if (wb->first() != 0) 749 errln("Didn't get break at beginning of string."); 750 if (wb->next() != 3) 751 errln("Didn't get break before period in \"boo.\""); 752 if (wb->current() != 4 && wb->next() != 4) 753 errln("Didn't get break at end of string."); 754 delete wb; 755} 756/* 757 * @bug 4153072 758 */ 759void RBBITest::TestBug4153072() { 760 UErrorCode status = U_ZERO_ERROR; 761 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 762 if (U_FAILURE(status)) 763 { 764 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 765 return; 766 } 767 UnicodeString str("...Hello, World!..."); 768 int32_t begin = 3; 769 int32_t end = str.length() - 3; 770 UBool onBoundary; 771 772 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 773 iter->adoptText(textIterator); 774 int index; 775 // Note: with the switch to UText, there is no way to restrict the 776 // iteration range to begin at an index other than zero. 777 // String character iterators created with a non-zero bound are 778 // treated by RBBI as being empty. 779 for (index = -1; index < begin + 1; ++index) { 780 onBoundary = iter->isBoundary(index); 781 if (index == 0? !onBoundary : onBoundary) { 782 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 783 " and begin index = " + begin); 784 } 785 } 786 delete iter; 787} 788 789 790// 791// Test for problem reported by Ashok Matoria on 9 July 2007 792// One.<kSoftHyphen><kSpace>Two. 793// 794// Sentence break at start (0) and then on calling next() it breaks at 795// 'T' of "Two". Now, at this point if I do next() and 796// then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 
797// 798void RBBITest::TestBug5775() { 799 UErrorCode status = U_ZERO_ERROR; 800 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 801 TEST_ASSERT_SUCCESS(status); 802 if (U_FAILURE(status)) { 803 return; 804 } 805// Check for status first for better handling of no data errors. 806 TEST_ASSERT(bi != NULL); 807 if (bi == NULL) { 808 return; 809 } 810 811 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 812 // 01234 56789 813 s = s.unescape(); 814 bi->setText(s); 815 int pos = bi->next(); 816 TEST_ASSERT(pos == 6); 817 pos = bi->next(); 818 TEST_ASSERT(pos == 10); 819 pos = bi->previous(); 820 TEST_ASSERT(pos == 6); 821 delete bi; 822} 823 824 825 826//------------------------------------------------------------------------------ 827// 828// RBBITest::Extended Run RBBI Tests from an external test data file 829// 830//------------------------------------------------------------------------------ 831 832struct TestParams { 833 BreakIterator *bi; 834 UnicodeString dataToBreak; 835 UVector32 *expectedBreaks; 836 UVector32 *srcLine; 837 UVector32 *srcCol; 838}; 839 840void RBBITest::executeTest(TestParams *t) { 841 int32_t bp; 842 int32_t prevBP; 843 int32_t i; 844 845 if (t->bi == NULL) { 846 return; 847 } 848 849 t->bi->setText(t->dataToBreak); 850 // 851 // Run the iterator forward 852 // 853 prevBP = -1; 854 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 855 if (prevBP == bp) { 856 // Fail for lack of forward progress. 857 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 858 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 859 break; 860 } 861 862 // Check that there were we didn't miss an expected break between the last one 863 // and this one. 
864 for (i=prevBP+1; i<bp; i++) { 865 if (t->expectedBreaks->elementAti(i) != 0) { 866 int expected[] = {0, i}; 867 printStringBreaks(t->dataToBreak, expected, 2); 868 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 869 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 870 } 871 } 872 873 // Check that the break we did find was expected 874 if (t->expectedBreaks->elementAti(bp) == 0) { 875 int expected[] = {0, bp}; 876 printStringBreaks(t->dataToBreak, expected, 2); 877 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 878 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 879 } else { 880 // The break was expected. 881 // Check that the {nnn} tag value is correct. 882 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 883 if (expectedTagVal == -1) { 884 expectedTagVal = 0; 885 } 886 int32_t line = t->srcLine->elementAti(bp); 887 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 888 if (rs != expectedTagVal) { 889 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 890 " Actual, Expected status = %4d, %4d", 891 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 892 } 893 } 894 895 896 prevBP = bp; 897 } 898 899 // Verify that there were no missed expected breaks after the last one found 900 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) { 901 if (t->expectedBreaks->elementAti(i) != 0) { 902 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 903 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 904 } 905 } 906 907 // 908 // Run the iterator backwards, verify that the same breaks are found. 909 // 910 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen. 911 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { 912 if (prevBP == bp) { 913 // Fail for lack of progress. 
914 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d", 915 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 916 break; 917 } 918 919 // Check that there were we didn't miss an expected break between the last one 920 // and this one. (UVector returns zeros for index out of bounds.) 921 for (i=prevBP-1; i>bp; i--) { 922 if (t->expectedBreaks->elementAti(i) != 0) { 923 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 924 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 925 } 926 } 927 928 // Check that the break we did find was expected 929 if (t->expectedBreaks->elementAti(bp) == 0) { 930 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 931 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 932 } else { 933 // The break was expected. 934 // Check that the {nnn} tag value is correct. 935 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 936 if (expectedTagVal == -1) { 937 expectedTagVal = 0; 938 } 939 int line = t->srcLine->elementAti(bp); 940 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 941 if (rs != expectedTagVal) { 942 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 943 " Actual, Expected status = %4d, %4d", 944 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 945 } 946 } 947 948 prevBP = bp; 949 } 950 951 // Verify that there were no missed breaks prior to the last one found 952 for (i=prevBP-1; i>=0; i--) { 953 if (t->expectedBreaks->elementAti(i) != 0) { 954 errln("Forward Itertion, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 955 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 956 } 957 } 958 959 // Check isBoundary() 960 for (i=0; i<t->expectedBreaks->size(); i++) { 961 UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0); 962 UBool boundaryFound = t->bi->isBoundary(i); 963 if (boundaryExpected != boundaryFound) { 964 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n" 965 " Expected, Actual= %s, %s", 966 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), 967 boundaryExpected ? "true":"false", boundaryFound? "true" : "false"); 968 } 969 } 970 971 // Check following() 972 for (i=0; i<t->expectedBreaks->size(); i++) { 973 int32_t actualBreak = t->bi->following(i); 974 int32_t expectedBreak = BreakIterator::DONE; 975 for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) { 976 if (t->expectedBreaks->elementAti(j) != 0) { 977 expectedBreak = j; 978 break; 979 } 980 } 981 if (expectedBreak != actualBreak) { 982 errln("following(%d) incorrect. File line,col= %4d,%4d\n" 983 " Expected, Actual= %d, %d", 984 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak); 985 } 986 } 987 988 // Check preceding() 989 for (i=t->expectedBreaks->size(); i>=0; i--) { 990 int32_t actualBreak = t->bi->preceding(i); 991 int32_t expectedBreak = BreakIterator::DONE; 992 993 for (int32_t j=i-1; j >= 0; j--) { 994 if (t->expectedBreaks->elementAti(j) != 0) { 995 expectedBreak = j; 996 break; 997 } 998 } 999 if (expectedBreak != actualBreak) { 1000 errln("preceding(%d) incorrect. 
File line,col= %4d,%4d\n" 1001 " Expected, Actual= %d, %d", 1002 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak); 1003 } 1004 } 1005} 1006 1007 1008void RBBITest::TestExtended() { 1009#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1010 UErrorCode status = U_ZERO_ERROR; 1011 Locale locale(""); 1012 1013 UnicodeString rules; 1014 TestParams tp; 1015 tp.bi = NULL; 1016 tp.expectedBreaks = new UVector32(status); 1017 tp.srcLine = new UVector32(status); 1018 tp.srcCol = new UVector32(status); 1019 1020 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status); 1021 if (U_FAILURE(status)) { 1022 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 1023 } 1024 1025 1026 // 1027 // Open and read the test data file. 1028 // 1029 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1030 char testFileName[1000]; 1031 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1032 errln("Can't open test data. 
Path too long."); 1033 return; 1034 } 1035 strcpy(testFileName, testDataDirectory); 1036 strcat(testFileName, "rbbitst.txt"); 1037 1038 int len; 1039 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1040 if (U_FAILURE(status)) { 1041 return; /* something went wrong, error already output */ 1042 } 1043 1044 1045 1046 1047 // 1048 // Put the test data into a UnicodeString 1049 // 1050 UnicodeString testString(FALSE, testFile, len); 1051 1052 enum EParseState{ 1053 PARSE_COMMENT, 1054 PARSE_TAG, 1055 PARSE_DATA, 1056 PARSE_NUM 1057 } 1058 parseState = PARSE_TAG; 1059 1060 EParseState savedState = PARSE_TAG; 1061 1062 static const UChar CH_LF = 0x0a; 1063 static const UChar CH_CR = 0x0d; 1064 static const UChar CH_HASH = 0x23; 1065 /*static const UChar CH_PERIOD = 0x2e;*/ 1066 static const UChar CH_LT = 0x3c; 1067 static const UChar CH_GT = 0x3e; 1068 static const UChar CH_BACKSLASH = 0x5c; 1069 static const UChar CH_BULLET = 0x2022; 1070 1071 int32_t lineNum = 1; 1072 int32_t colStart = 0; 1073 int32_t column = 0; 1074 int32_t charIdx = 0; 1075 1076 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 
1077 1078 for (charIdx = 0; charIdx < len; ) { 1079 status = U_ZERO_ERROR; 1080 UChar c = testString.charAt(charIdx); 1081 charIdx++; 1082 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) { 1083 // treat CRLF as a unit 1084 c = CH_LF; 1085 charIdx++; 1086 } 1087 if (c == CH_LF || c == CH_CR) { 1088 lineNum++; 1089 colStart = charIdx; 1090 } 1091 column = charIdx - colStart + 1; 1092 1093 switch (parseState) { 1094 case PARSE_COMMENT: 1095 if (c == 0x0a || c == 0x0d) { 1096 parseState = savedState; 1097 } 1098 break; 1099 1100 case PARSE_TAG: 1101 { 1102 if (c == CH_HASH) { 1103 parseState = PARSE_COMMENT; 1104 savedState = PARSE_TAG; 1105 break; 1106 } 1107 if (u_isUWhiteSpace(c)) { 1108 break; 1109 } 1110 if (testString.compare(charIdx-1, 6, "<word>") == 0) { 1111 delete tp.bi; 1112 tp.bi = BreakIterator::createWordInstance(locale, status); 1113 charIdx += 5; 1114 break; 1115 } 1116 if (testString.compare(charIdx-1, 6, "<char>") == 0) { 1117 delete tp.bi; 1118 tp.bi = BreakIterator::createCharacterInstance(locale, status); 1119 charIdx += 5; 1120 break; 1121 } 1122 if (testString.compare(charIdx-1, 6, "<line>") == 0) { 1123 delete tp.bi; 1124 tp.bi = BreakIterator::createLineInstance(locale, status); 1125 charIdx += 5; 1126 break; 1127 } 1128 if (testString.compare(charIdx-1, 6, "<sent>") == 0) { 1129 delete tp.bi; 1130 tp.bi = NULL; 1131 tp.bi = BreakIterator::createSentenceInstance(locale, status); 1132 charIdx += 5; 1133 break; 1134 } 1135 if (testString.compare(charIdx-1, 7, "<title>") == 0) { 1136 delete tp.bi; 1137 tp.bi = BreakIterator::createTitleInstance(locale, status); 1138 charIdx += 6; 1139 break; 1140 } 1141 1142 // <locale loc_name> 1143 localeMatcher.reset(testString); 1144 if (localeMatcher.lookingAt(charIdx-1, status)) { 1145 UnicodeString localeName = localeMatcher.group(1, status); 1146 char localeName8[100]; 1147 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1148 locale = 
Locale::createFromName(localeName8); 1149 charIdx += localeMatcher.group(0, status).length() - 1; 1150 TEST_ASSERT_SUCCESS(status); 1151 break; 1152 } 1153 if (testString.compare(charIdx-1, 6, "<data>") == 0) { 1154 parseState = PARSE_DATA; 1155 charIdx += 5; 1156 tp.dataToBreak = ""; 1157 tp.expectedBreaks->removeAllElements(); 1158 tp.srcCol ->removeAllElements(); 1159 tp.srcLine->removeAllElements(); 1160 break; 1161 } 1162 1163 errln("line %d: Tag expected in test file.", lineNum); 1164 parseState = PARSE_COMMENT; 1165 savedState = PARSE_DATA; 1166 goto end_test; // Stop the test. 1167 } 1168 break; 1169 1170 case PARSE_DATA: 1171 if (c == CH_BULLET) { 1172 int32_t breakIdx = tp.dataToBreak.length(); 1173 tp.expectedBreaks->setSize(breakIdx+1); 1174 tp.expectedBreaks->setElementAt(-1, breakIdx); 1175 tp.srcLine->setSize(breakIdx+1); 1176 tp.srcLine->setElementAt(lineNum, breakIdx); 1177 tp.srcCol ->setSize(breakIdx+1); 1178 tp.srcCol ->setElementAt(column, breakIdx); 1179 break; 1180 } 1181 1182 if (testString.compare(charIdx-1, 7, "</data>") == 0) { 1183 // Add final entry to mappings from break location to source file position. 1184 // Need one extra because last break position returned is after the 1185 // last char in the data, not at the last char. 1186 tp.srcLine->addElement(lineNum, status); 1187 tp.srcCol ->addElement(column, status); 1188 1189 parseState = PARSE_TAG; 1190 charIdx += 6; 1191 1192 // RUN THE TEST! 1193 executeTest(&tp); 1194 break; 1195 } 1196 1197 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 1198 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 1199 // Get the code point from the name and insert it into the test data. 1200 // (Damn, no API takes names in Unicode !!! 
1201 // we've got to take it back to char *) 1202 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx); 1203 int32_t nameLength = nameEndIdx - (charIdx+2); 1204 char charNameBuf[200]; 1205 UChar32 theChar = -1; 1206 if (nameEndIdx != -1) { 1207 UErrorCode status = U_ZERO_ERROR; 1208 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 1209 charNameBuf[sizeof(charNameBuf)-1] = 0; 1210 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 1211 if (U_FAILURE(status)) { 1212 theChar = -1; 1213 } 1214 } 1215 if (theChar == -1) { 1216 errln("Error in named character in test file at line %d, col %d", 1217 lineNum, column); 1218 } else { 1219 // Named code point was recognized. Insert it 1220 // into the test data. 1221 tp.dataToBreak.append(theChar); 1222 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1223 tp.srcLine->addElement(lineNum, status); 1224 tp.srcCol ->addElement(column, status); 1225 } 1226 } 1227 if (nameEndIdx > charIdx) { 1228 charIdx = nameEndIdx+1; 1229 1230 } 1231 break; 1232 } 1233 1234 1235 1236 1237 if (testString.compare(charIdx-1, 2, "<>") == 0) { 1238 charIdx++; 1239 int32_t breakIdx = tp.dataToBreak.length(); 1240 tp.expectedBreaks->setSize(breakIdx+1); 1241 tp.expectedBreaks->setElementAt(-1, breakIdx); 1242 tp.srcLine->setSize(breakIdx+1); 1243 tp.srcLine->setElementAt(lineNum, breakIdx); 1244 tp.srcCol ->setSize(breakIdx+1); 1245 tp.srcCol ->setElementAt(column, breakIdx); 1246 break; 1247 } 1248 1249 if (c == CH_LT) { 1250 tagValue = 0; 1251 parseState = PARSE_NUM; 1252 break; 1253 } 1254 1255 if (c == CH_HASH && column==3) { // TODO: why is column off so far? 1256 parseState = PARSE_COMMENT; 1257 savedState = PARSE_DATA; 1258 break; 1259 } 1260 1261 if (c == CH_BACKSLASH) { 1262 // Check for \ at end of line, a line continuation. 
1263 // Advance over (discard) the newline 1264 UChar32 cp = testString.char32At(charIdx); 1265 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) { 1266 // We have a CR LF 1267 // Need an extra increment of the input ptr to move over both of them 1268 charIdx++; 1269 } 1270 if (cp == CH_LF || cp == CH_CR) { 1271 lineNum++; 1272 colStart = charIdx; 1273 charIdx++; 1274 break; 1275 } 1276 1277 // Let unescape handle the back slash. 1278 cp = testString.unescapeAt(charIdx); 1279 if (cp != -1) { 1280 // Escape sequence was recognized. Insert the char 1281 // into the test data. 1282 tp.dataToBreak.append(cp); 1283 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1284 tp.srcLine->addElement(lineNum, status); 1285 tp.srcCol ->addElement(column, status); 1286 } 1287 break; 1288 } 1289 1290 1291 // Not a recognized backslash escape sequence. 1292 // Take the next char as a literal. 1293 // TODO: Should this be an error? 1294 c = testString.charAt(charIdx); 1295 charIdx = testString.moveIndex32(charIdx, 1); 1296 } 1297 1298 // Normal, non-escaped data char. 1299 tp.dataToBreak.append(c); 1300 1301 // Save the mapping from offset in the data to line/column numbers in 1302 // the original input file. Will be used for better error messages only. 1303 // If there's an expected break before this char, the slot in the mapping 1304 // vector will already be set for this char; don't overwrite it. 1305 if (tp.dataToBreak.length() > tp.srcLine->size()) { 1306 tp.srcLine->addElement(lineNum, status); 1307 tp.srcCol ->addElement(column, status); 1308 } 1309 break; 1310 1311 1312 case PARSE_NUM: 1313 // We are parsing an expected numeric tag value, like <1234>, 1314 // within a chunk of data. 1315 if (u_isUWhiteSpace(c)) { 1316 break; 1317 } 1318 1319 if (c == CH_GT) { 1320 // Finished the number. Add the info to the expected break data, 1321 // and switch parse state back to doing plain data. 
1322 parseState = PARSE_DATA; 1323 if (tagValue == 0) { 1324 tagValue = -1; 1325 } 1326 int32_t breakIdx = tp.dataToBreak.length(); 1327 tp.expectedBreaks->setSize(breakIdx+1); 1328 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 1329 tp.srcLine->setSize(breakIdx+1); 1330 tp.srcLine->setElementAt(lineNum, breakIdx); 1331 tp.srcCol ->setSize(breakIdx+1); 1332 tp.srcCol ->setElementAt(column, breakIdx); 1333 break; 1334 } 1335 1336 if (u_isdigit(c)) { 1337 tagValue = tagValue*10 + u_charDigitValue(c); 1338 break; 1339 } 1340 1341 errln("Syntax Error in test file at line %d, col %d", 1342 lineNum, column); 1343 parseState = PARSE_COMMENT; 1344 goto end_test; // Stop the test 1345 break; 1346 } 1347 1348 1349 if (U_FAILURE(status)) { 1350 dataerrln("ICU Error %s while parsing test file at line %d.", 1351 u_errorName(status), lineNum); 1352 status = U_ZERO_ERROR; 1353 goto end_test; // Stop the test 1354 } 1355 1356 } 1357 1358end_test: 1359 delete tp.bi; 1360 delete tp.expectedBreaks; 1361 delete tp.srcLine; 1362 delete tp.srcCol; 1363 delete [] testFile; 1364#endif 1365} 1366 1367 1368//------------------------------------------------------------------------------- 1369// 1370// TestDictRules create a break iterator from source rules that includes a 1371// dictionary range. Regression for bug #7130. Source rules 1372// do not declare a break iterator type (word, line, sentence, etc. 1373// but the dictionary code, without a type, would loop. 
1374// 1375//------------------------------------------------------------------------------- 1376void RBBITest::TestDictRules() { 1377 const char *rules = "$dictionary = [a-z]; \n" 1378 "!!forward; \n" 1379 "$dictionary $dictionary; \n" 1380 "!!reverse; \n" 1381 "$dictionary $dictionary; \n"; 1382 const char *text = "aa"; 1383 UErrorCode status = U_ZERO_ERROR; 1384 UParseError parseError; 1385 1386 RuleBasedBreakIterator bi(rules, parseError, status); 1387 if (U_SUCCESS(status)) { 1388 UnicodeString utext = text; 1389 bi.setText(utext); 1390 int32_t position; 1391 int32_t loops; 1392 for (loops = 0; loops<10; loops++) { 1393 position = bi.next(); 1394 if (position == RuleBasedBreakIterator::DONE) { 1395 break; 1396 } 1397 } 1398 TEST_ASSERT(loops == 1); 1399 } else { 1400 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status)); 1401 } 1402} 1403 1404 1405 1406//------------------------------------------------------------------------------- 1407// 1408// ReadAndConvertFile Read a text data file, convert it to UChars, and 1409// return the datain one big UChar * buffer, which the caller must delete. 1410// 1411// parameters: 1412// fileName: the name of the file, with no directory part. The test data directory 1413// is assumed. 1414// ulen an out parameter, receives the actual length (in UChars) of the file data. 1415// encoding The file encoding. If the file contains a BOM, that will override the encoding 1416// specified here. The BOM, if it exists, will be stripped from the returned data. 1417// Pass NULL for the system default encoding. 1418// status 1419// returns: 1420// The file data, converted to UChar. 1421// The caller must delete this when done with 1422// delete [] theBuffer; 1423// 1424// TODO: This is a clone of RegexTest::ReadAndConvertFile. 1425// Move this function to some common place. 
1426// 1427//-------------------------------------------------------------------------------- 1428UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 1429 UChar *retPtr = NULL; 1430 char *fileBuf = NULL; 1431 UConverter* conv = NULL; 1432 FILE *f = NULL; 1433 1434 ulen = 0; 1435 if (U_FAILURE(status)) { 1436 return retPtr; 1437 } 1438 1439 // 1440 // Open the file. 1441 // 1442 f = fopen(fileName, "rb"); 1443 if (f == 0) { 1444 dataerrln("Error opening test data file %s\n", fileName); 1445 status = U_FILE_ACCESS_ERROR; 1446 return NULL; 1447 } 1448 // 1449 // Read it in 1450 // 1451 int fileSize; 1452 int amt_read; 1453 1454 fseek( f, 0, SEEK_END); 1455 fileSize = ftell(f); 1456 fileBuf = new char[fileSize]; 1457 fseek(f, 0, SEEK_SET); 1458 amt_read = fread(fileBuf, 1, fileSize, f); 1459 if (amt_read != fileSize || fileSize <= 0) { 1460 errln("Error reading test data file."); 1461 goto cleanUpAndReturn; 1462 } 1463 1464 // 1465 // Look for a Unicode Signature (BOM) on the data just read 1466 // 1467 int32_t signatureLength; 1468 const char * fileBufC; 1469 const char* bomEncoding; 1470 1471 fileBufC = fileBuf; 1472 bomEncoding = ucnv_detectUnicodeSignature( 1473 fileBuf, fileSize, &signatureLength, &status); 1474 if(bomEncoding!=NULL ){ 1475 fileBufC += signatureLength; 1476 fileSize -= signatureLength; 1477 encoding = bomEncoding; 1478 } 1479 1480 // 1481 // Open a converter to take the rule file to UTF-16 1482 // 1483 conv = ucnv_open(encoding, &status); 1484 if (U_FAILURE(status)) { 1485 goto cleanUpAndReturn; 1486 } 1487 1488 // 1489 // Convert the rules to UChar. 1490 // Preflight first to determine required buffer size. 1491 // 1492 ulen = ucnv_toUChars(conv, 1493 NULL, // dest, 1494 0, // destCapacity, 1495 fileBufC, 1496 fileSize, 1497 &status); 1498 if (status == U_BUFFER_OVERFLOW_ERROR) { 1499 // Buffer Overflow is expected from the preflight operation. 
1500 status = U_ZERO_ERROR; 1501 1502 retPtr = new UChar[ulen+1]; 1503 ucnv_toUChars(conv, 1504 retPtr, // dest, 1505 ulen+1, 1506 fileBufC, 1507 fileSize, 1508 &status); 1509 } 1510 1511cleanUpAndReturn: 1512 fclose(f); 1513 delete []fileBuf; 1514 ucnv_close(conv); 1515 if (U_FAILURE(status)) { 1516 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 1517 delete []retPtr; 1518 retPtr = 0; 1519 ulen = 0; 1520 }; 1521 return retPtr; 1522} 1523 1524 1525 1526//-------------------------------------------------------------------------------------------- 1527// 1528// Run tests from each of the boundary test data files distributed by the Unicode Consortium 1529// 1530//------------------------------------------------------------------------------------------- 1531void RBBITest::TestUnicodeFiles() { 1532 RuleBasedBreakIterator *bi; 1533 UErrorCode status = U_ZERO_ERROR; 1534 1535 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 1536 TEST_ASSERT_SUCCESS(status); 1537 if (U_SUCCESS(status)) { 1538 runUnicodeTestData("GraphemeBreakTest.txt", bi); 1539 } 1540 delete bi; 1541 1542 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); 1543 TEST_ASSERT_SUCCESS(status); 1544 if (U_SUCCESS(status)) { 1545 runUnicodeTestData("WordBreakTest.txt", bi); 1546 } 1547 delete bi; 1548 1549 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1550 TEST_ASSERT_SUCCESS(status); 1551 if (U_SUCCESS(status)) { 1552 runUnicodeTestData("SentenceBreakTest.txt", bi); 1553 } 1554 delete bi; 1555 1556 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); 1557 TEST_ASSERT_SUCCESS(status); 1558 if (U_SUCCESS(status)) { 1559 runUnicodeTestData("LineBreakTest.txt", bi); 1560 } 1561 delete bi; 1562} 1563 1564 1565//-------------------------------------------------------------------------------------------- 
1566// 1567// Run tests from one of the boundary test data files distributed by the Unicode Consortium 1568// 1569//------------------------------------------------------------------------------------------- 1570void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 1571#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1572 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270 1573 UBool isTicket7270Fixed = !logKnownIssue("7270"); 1574 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt"); 1575 UErrorCode status = U_ZERO_ERROR; 1576 1577 // 1578 // Open and read the test data file, put it into a UnicodeString. 1579 // 1580 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1581 char testFileName[1000]; 1582 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1583 dataerrln("Can't open test data. Path too long."); 1584 return; 1585 } 1586 strcpy(testFileName, testDataDirectory); 1587 strcat(testFileName, fileName); 1588 1589 logln("Opening data file %s\n", fileName); 1590 1591 int len; 1592 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1593 if (status != U_FILE_ACCESS_ERROR) { 1594 TEST_ASSERT_SUCCESS(status); 1595 TEST_ASSERT(testFile != NULL); 1596 } 1597 if (U_FAILURE(status) || testFile == NULL) { 1598 return; /* something went wrong, error already output */ 1599 } 1600 UnicodeString testFileAsString(TRUE, testFile, len); 1601 1602 // 1603 // Parse the test data file using a regular expression. 1604 // Each kind of token is recognized in its own capture group; what type of item was scanned 1605 // is identified by which group had a match. 
1606 // 1607 // Caputure Group # 1 2 3 4 5 1608 // Parses this item: divide x hex digits comment \n unrecognized \n 1609 // 1610 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 1611 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 1612 UnicodeString testString; 1613 UVector32 breakPositions(status); 1614 int lineNumber = 1; 1615 TEST_ASSERT_SUCCESS(status); 1616 if (U_FAILURE(status)) { 1617 return; 1618 } 1619 1620 // 1621 // Scan through each test case, building up the string to be broken in testString, 1622 // and the positions that should be boundaries in the breakPositions vector. 1623 // 1624 int spin = 0; 1625 while (tokenMatcher.find()) { 1626 if(tokenMatcher.hitEnd()) { 1627 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for. 1628 This occurred when the text file was corrupt (wasn't marked as UTF-8) 1629 and caused an infinite loop here on EBCDIC systems! 1630 */ 1631 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin); 1632 // return; 1633 } 1634 if (tokenMatcher.start(1, status) >= 0) { 1635 // Scanned a divide sign, indicating a break position in the test data. 1636 if (testString.length()>0) { 1637 breakPositions.addElement(testString.length(), status); 1638 } 1639 } 1640 else if (tokenMatcher.start(2, status) >= 0) { 1641 // Scanned an 'x', meaning no break at this position in the test data 1642 // Nothing to be done here. 1643 } 1644 else if (tokenMatcher.start(3, status) >= 0) { 1645 // Scanned Hex digits. Convert them to binary, append to the character data string. 
1646 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 1647 int length = hexNumber.length(); 1648 if (length<=8) { 1649 char buf[10]; 1650 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 1651 UChar32 c = (UChar32)strtol(buf, NULL, 16); 1652 if (c<=0x10ffff) { 1653 testString.append(c); 1654 } else { 1655 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n", 1656 fileName, lineNumber); 1657 } 1658 } else { 1659 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 1660 fileName, lineNumber); 1661 } 1662 } 1663 else if (tokenMatcher.start(4, status) >= 0) { 1664 // Scanned to end of a line, possibly skipping over a comment in the process. 1665 // If the line from the file contained test data, run the test now. 1666 // 1667 if (testString.length() > 0) { 1668// TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data. 1669// Rule 8 1670// ZW SP* <break> 1671// is not yet implemented. 1672if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber || 1673 5202 == lineNumber || 1674 5214 == lineNumber || 1675 5246 == lineNumber || 1676 5298 == lineNumber || 1677 5302 == lineNumber ))) { 1678 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 1679} 1680 } 1681 1682 // Clear out this test case. 1683 // The string and breakPositions vector will be refilled as the next 1684 // test case is parsed. 1685 testString.remove(); 1686 breakPositions.removeAllElements(); 1687 lineNumber++; 1688 } else { 1689 // Scanner catchall. Something unrecognized appeared on the line. 1690 char token[16]; 1691 UnicodeString uToken = tokenMatcher.group(0, status); 1692 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 1693 token[sizeof(token)-1] = 0; 1694 errln("Syntax error in test data file \'%s\', line %d. 
Scanning \"%s\"\n", fileName, lineNumber, token); 1695 1696 // Clean up, in preparation for continuing with the next line. 1697 testString.remove(); 1698 breakPositions.removeAllElements(); 1699 lineNumber++; 1700 } 1701 TEST_ASSERT_SUCCESS(status); 1702 if (U_FAILURE(status)) { 1703 break; 1704 } 1705 } 1706 1707 delete [] testFile; 1708 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1709} 1710 1711//-------------------------------------------------------------------------------------------- 1712// 1713// checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 1714// test data files. Do only a simple, forward-only check - 1715// this test is mostly to check that ICU and the Unicode 1716// data agree with each other. 1717// 1718//-------------------------------------------------------------------------------------------- 1719void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 1720 const UnicodeString &testString, // Text data to be broken 1721 UVector32 *breakPositions, // Positions where breaks should be found. 1722 RuleBasedBreakIterator *bi) { 1723 int32_t pos; // Break Position in the test string 1724 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 
1725 int32_t expectedPos; // Expected break position (index into test string) 1726 1727 bi->setText(testString); 1728 pos = bi->first(); 1729 pos = bi->next(); 1730 1731 while (pos != BreakIterator::DONE) { 1732 if (expectedI >= breakPositions->size()) { 1733 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1734 testFileName, lineNumber, pos); 1735 break; 1736 } 1737 expectedPos = breakPositions->elementAti(expectedI); 1738 if (pos < expectedPos) { 1739 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1740 testFileName, lineNumber, pos); 1741 break; 1742 } 1743 if (pos > expectedPos) { 1744 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1745 testFileName, lineNumber, expectedPos); 1746 break; 1747 } 1748 pos = bi->next(); 1749 expectedI++; 1750 } 1751 1752 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 1753 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1754 testFileName, lineNumber, breakPositions->elementAti(expectedI)); 1755 } 1756} 1757 1758 1759 1760#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1761//--------------------------------------------------------------------------------------- 1762// 1763// classs RBBIMonkeyKind 1764// 1765// Monkey Test for Break Iteration 1766// Abstract interface class. Concrete derived classes independently 1767// implement the break rules for different iterator types. 1768// 1769// The Monkey Test itself uses doesn't know which type of break iterator it is 1770// testing, but works purely in terms of the interface defined here. 1771// 1772//--------------------------------------------------------------------------------------- 1773class RBBIMonkeyKind { 1774public: 1775 // Return a UVector of UnicodeSets, representing the character classes used 1776 // for this type of iterator. 
1777 virtual UVector *charClasses() = 0; 1778 1779 // Set the test text on which subsequent calls to next() will operate 1780 virtual void setText(const UnicodeString &s) = 0; 1781 1782 // Find the next break postion, starting from the prev break position, or from zero. 1783 // Return -1 after reaching end of string. 1784 virtual int32_t next(int32_t i) = 0; 1785 1786 virtual ~RBBIMonkeyKind(); 1787 UErrorCode deferredStatus; 1788 1789 1790protected: 1791 RBBIMonkeyKind(); 1792 1793private: 1794}; 1795 1796RBBIMonkeyKind::RBBIMonkeyKind() { 1797 deferredStatus = U_ZERO_ERROR; 1798} 1799 1800RBBIMonkeyKind::~RBBIMonkeyKind() { 1801} 1802 1803 1804//---------------------------------------------------------------------------------------- 1805// 1806// Random Numbers. Similar to standard lib rand() and srand() 1807// Not using library to 1808// 1. Get same results on all platforms. 1809// 2. Get access to current seed, to more easily reproduce failures. 1810// 1811//--------------------------------------------------------------------------------------- 1812static uint32_t m_seed = 1; 1813 1814static uint32_t m_rand() 1815{ 1816 m_seed = m_seed * 1103515245 + 12345; 1817 return (uint32_t)(m_seed/65536) % 32768; 1818} 1819 1820 1821//------------------------------------------------------------------------------------------ 1822// 1823// class RBBICharMonkey Character (Grapheme Cluster) specific implementation 1824// of RBBIMonkeyKind. 
1825// 1826//------------------------------------------------------------------------------------------ 1827class RBBICharMonkey: public RBBIMonkeyKind { 1828public: 1829 RBBICharMonkey(); 1830 virtual ~RBBICharMonkey(); 1831 virtual UVector *charClasses(); 1832 virtual void setText(const UnicodeString &s); 1833 virtual int32_t next(int32_t i); 1834private: 1835 UVector *fSets; 1836 1837 UnicodeSet *fCRLFSet; 1838 UnicodeSet *fControlSet; 1839 UnicodeSet *fExtendSet; 1840 UnicodeSet *fRegionalIndicatorSet; 1841 UnicodeSet *fPrependSet; 1842 UnicodeSet *fSpacingSet; 1843 UnicodeSet *fLSet; 1844 UnicodeSet *fVSet; 1845 UnicodeSet *fTSet; 1846 UnicodeSet *fLVSet; 1847 UnicodeSet *fLVTSet; 1848 UnicodeSet *fHangulSet; 1849 UnicodeSet *fAnySet; 1850 1851 const UnicodeString *fText; 1852}; 1853 1854 1855RBBICharMonkey::RBBICharMonkey() { 1856 UErrorCode status = U_ZERO_ERROR; 1857 1858 fText = NULL; 1859 1860 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status); 1861 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status); 1862 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status); 1863 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status); 1864 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status); 1865 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status); 1866 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status); 1867 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status); 1868 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status); 1869 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status); 1870 fLVTSet = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status); 1871 fHangulSet = new UnicodeSet(); 1872 fHangulSet->addAll(*fLSet); 1873 fHangulSet->addAll(*fVSet); 1874 fHangulSet->addAll(*fTSet); 1875 fHangulSet->addAll(*fLVSet); 1876 fHangulSet->addAll(*fLVTSet); 1877 fAnySet = new UnicodeSet(0, 0x10ffff); 1878 1879 fSets = new UVector(status); 1880 fSets->addElement(fCRLFSet, status); 1881 fSets->addElement(fControlSet, status); 1882 fSets->addElement(fExtendSet, status); 1883 fSets->addElement(fRegionalIndicatorSet, status); 1884 if (!fPrependSet->isEmpty()) { 1885 fSets->addElement(fPrependSet, status); 1886 } 1887 fSets->addElement(fSpacingSet, status); 1888 fSets->addElement(fHangulSet, status); 1889 fSets->addElement(fAnySet, status); 1890 if (U_FAILURE(status)) { 1891 deferredStatus = status; 1892 } 1893} 1894 1895 1896void RBBICharMonkey::setText(const UnicodeString &s) { 1897 fText = &s; 1898} 1899 1900 1901 1902int32_t RBBICharMonkey::next(int32_t prevPos) { 1903 int p0, p1, p2, p3; // Indices of the significant code points around the 1904 // break position being tested. The candidate break 1905 // location is before p2. 1906 1907 int breakPos = -1; 1908 1909 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 1910 1911 if (U_FAILURE(deferredStatus)) { 1912 return -1; 1913 } 1914 1915 // Previous break at end of string. return DONE. 1916 if (prevPos >= fText->length()) { 1917 return -1; 1918 } 1919 p0 = p1 = p2 = p3 = prevPos; 1920 c3 = fText->char32At(prevPos); 1921 c0 = c1 = c2 = 0; 1922 (void)p0; // suppress set but not used warning. 1923 (void)c0; 1924 1925 // Loop runs once per "significant" character position in the input text. 1926 for (;;) { 1927 // Move all of the positions forward in the input string. 
1928 p0 = p1; c0 = c1; 1929 p1 = p2; c1 = c2; 1930 p2 = p3; c2 = c3; 1931 1932 // Advancd p3 by one codepoint 1933 p3 = fText->moveIndex32(p3, 1); 1934 c3 = fText->char32At(p3); 1935 1936 if (p1 == p2) { 1937 // Still warming up the loop. (won't work with zero length strings, but we don't care) 1938 continue; 1939 } 1940 if (p2 == fText->length()) { 1941 // Reached end of string. Always a break position. 1942 break; 1943 } 1944 1945 // Rule GB3 CR x LF 1946 // No Extend or Format characters may appear between the CR and LF, 1947 // which requires the additional check for p2 immediately following p1. 1948 // 1949 if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 1950 continue; 1951 } 1952 1953 // Rule (GB4). ( Control | CR | LF ) <break> 1954 if (fControlSet->contains(c1) || 1955 c1 == 0x0D || 1956 c1 == 0x0A) { 1957 break; 1958 } 1959 1960 // Rule (GB5) <break> ( Control | CR | LF ) 1961 // 1962 if (fControlSet->contains(c2) || 1963 c2 == 0x0D || 1964 c2 == 0x0A) { 1965 break; 1966 } 1967 1968 1969 // Rule (GB6) L x ( L | V | LV | LVT ) 1970 if (fLSet->contains(c1) && 1971 (fLSet->contains(c2) || 1972 fVSet->contains(c2) || 1973 fLVSet->contains(c2) || 1974 fLVTSet->contains(c2))) { 1975 continue; 1976 } 1977 1978 // Rule (GB7) ( LV | V ) x ( V | T ) 1979 if ((fLVSet->contains(c1) || fVSet->contains(c1)) && 1980 (fVSet->contains(c2) || fTSet->contains(c2))) { 1981 continue; 1982 } 1983 1984 // Rule (GB8) ( LVT | T) x T 1985 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) && 1986 fTSet->contains(c2)) { 1987 continue; 1988 } 1989 1990 // Just adding extra Apple rule does here not work, behavior depends on arbitrary context 1991 1992 // Rule (GB8a) Regional_Indicator x Regional_Indicator 1993 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 1994 continue; 1995 } 1996 1997 // Rule (GB9) Numeric x ALetter 1998 if (fExtendSet->contains(c2)) { 1999 continue; 2000 } 2001 2002 // Rule (GB9a) x SpacingMark 2003 if (fSpacingSet->contains(c2)) { 
2004 continue; 2005 } 2006 2007 // Rule (GB9b) Prepend x 2008 if (fPrependSet->contains(c1)) { 2009 continue; 2010 } 2011 2012 // Rule (GB10) Any <break> Any 2013 break; 2014 } 2015 2016 breakPos = p2; 2017 return breakPos; 2018} 2019 2020 2021 2022UVector *RBBICharMonkey::charClasses() { 2023 return fSets; 2024} 2025 2026 2027RBBICharMonkey::~RBBICharMonkey() { 2028 delete fSets; 2029 delete fCRLFSet; 2030 delete fControlSet; 2031 delete fExtendSet; 2032 delete fRegionalIndicatorSet; 2033 delete fPrependSet; 2034 delete fSpacingSet; 2035 delete fLSet; 2036 delete fVSet; 2037 delete fTSet; 2038 delete fLVSet; 2039 delete fLVTSet; 2040 delete fHangulSet; 2041 delete fAnySet; 2042} 2043 2044//------------------------------------------------------------------------------------------ 2045// 2046// class RBBIWordMonkey Word Break specific implementation 2047// of RBBIMonkeyKind. 2048// 2049//------------------------------------------------------------------------------------------ 2050class RBBIWordMonkey: public RBBIMonkeyKind { 2051public: 2052 RBBIWordMonkey(); 2053 virtual ~RBBIWordMonkey(); 2054 virtual UVector *charClasses(); 2055 virtual void setText(const UnicodeString &s); 2056 virtual int32_t next(int32_t i); 2057private: 2058 UVector *fSets; 2059 2060 UnicodeSet *fCRSet; 2061 UnicodeSet *fLFSet; 2062 UnicodeSet *fNewlineSet; 2063 UnicodeSet *fRegionalIndicatorSet; 2064 UnicodeSet *fKatakanaSet; 2065 UnicodeSet *fHebrew_LetterSet; 2066 UnicodeSet *fALetterSet; 2067 // TODO(jungshik): Do we still need this change? 
2068 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt 2069 UnicodeSet *fSingle_QuoteSet; 2070 UnicodeSet *fDouble_QuoteSet; 2071 UnicodeSet *fMidNumLetSet; 2072 UnicodeSet *fMidLetterSet; 2073 UnicodeSet *fMidNumSet; 2074 UnicodeSet *fNumericSet; 2075 UnicodeSet *fFormatSet; 2076 UnicodeSet *fOtherSet; 2077 UnicodeSet *fExtendSet; 2078 UnicodeSet *fExtendNumLetSet; 2079 UnicodeSet *fDictionaryCjkSet; 2080 2081 const UnicodeString *fText; 2082}; 2083 2084 2085RBBIWordMonkey::RBBIWordMonkey() 2086{ 2087 UErrorCode status = U_ZERO_ERROR; 2088 2089 fSets = new UVector(status); 2090 2091 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); 2092 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); 2093 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); 2094 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status); 2095 // Exclude Hangul syllables from ALetterSet during testing. 2096 // Leave CJK dictionary characters out from the monkey tests! 
2097#if 0 2098 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" 2099 "[\\p{Line_Break = Complex_Context}" 2100 "-\\p{Grapheme_Cluster_Break = Extend}" 2101 "-\\p{Grapheme_Cluster_Break = Control}" 2102 "]]", 2103 status); 2104#endif 2105 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status); 2106 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 2107 fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status); 2108 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 2109 fALetterSet->removeAll(*fDictionaryCjkSet); 2110 fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status); 2111 fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status); 2112 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 2113 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 2114 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 2115 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test 2116 // we should figure out why 2117 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); 2118 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 2119 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 2120 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 2121 2122 fOtherSet = new UnicodeSet(); 2123 if(U_FAILURE(status)) { 2124 deferredStatus = status; 2125 return; 2126 } 2127 2128 fOtherSet->complement(); 2129 fOtherSet->removeAll(*fCRSet); 2130 fOtherSet->removeAll(*fLFSet); 2131 
fOtherSet->removeAll(*fNewlineSet); 2132 fOtherSet->removeAll(*fKatakanaSet); 2133 fOtherSet->removeAll(*fHebrew_LetterSet); 2134 fOtherSet->removeAll(*fALetterSet); 2135 fOtherSet->removeAll(*fSingle_QuoteSet); 2136 fOtherSet->removeAll(*fDouble_QuoteSet); 2137 fOtherSet->removeAll(*fMidLetterSet); 2138 fOtherSet->removeAll(*fMidNumSet); 2139 fOtherSet->removeAll(*fNumericSet); 2140 fOtherSet->removeAll(*fExtendNumLetSet); 2141 fOtherSet->removeAll(*fFormatSet); 2142 fOtherSet->removeAll(*fExtendSet); 2143 fOtherSet->removeAll(*fRegionalIndicatorSet); 2144 // Inhibit dictionary characters from being tested at all. 2145 fOtherSet->removeAll(*fDictionaryCjkSet); 2146 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); 2147 2148 fSets->addElement(fCRSet, status); 2149 fSets->addElement(fLFSet, status); 2150 fSets->addElement(fNewlineSet, status); 2151 fSets->addElement(fRegionalIndicatorSet, status); 2152 fSets->addElement(fHebrew_LetterSet, status); 2153 fSets->addElement(fALetterSet, status); 2154 fSets->addElement(fSingle_QuoteSet, status); 2155 fSets->addElement(fDouble_QuoteSet, status); 2156 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana 2157 fSets->addElement(fMidLetterSet, status); 2158 fSets->addElement(fMidNumLetSet, status); 2159 fSets->addElement(fMidNumSet, status); 2160 fSets->addElement(fNumericSet, status); 2161 fSets->addElement(fFormatSet, status); 2162 fSets->addElement(fExtendSet, status); 2163 fSets->addElement(fOtherSet, status); 2164 fSets->addElement(fExtendNumLetSet, status); 2165 2166 if (U_FAILURE(status)) { 2167 deferredStatus = status; 2168 } 2169} 2170 2171void RBBIWordMonkey::setText(const UnicodeString &s) { 2172 fText = &s; 2173} 2174 2175 2176int32_t RBBIWordMonkey::next(int32_t prevPos) { 2177 int p0, p1, p2, p3; // Indices of the significant code points around the 2178 // break position being tested. 
The candidate break 2179 // location is before p2. 2180 2181 int breakPos = -1; 2182 2183 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2184 2185 if (U_FAILURE(deferredStatus)) { 2186 return -1; 2187 } 2188 2189 // Prev break at end of string. return DONE. 2190 if (prevPos >= fText->length()) { 2191 return -1; 2192 } 2193 p0 = p1 = p2 = p3 = prevPos; 2194 c3 = fText->char32At(prevPos); 2195 c0 = c1 = c2 = 0; 2196 (void)p0; // Suppress set but not used warning. 2197 2198 // Loop runs once per "significant" character position in the input text. 2199 for (;;) { 2200 // Move all of the positions forward in the input string. 2201 p0 = p1; c0 = c1; 2202 p1 = p2; c1 = c2; 2203 p2 = p3; c2 = c3; 2204 2205 // Advancd p3 by X(Extend | Format)* Rule 4 2206 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change) 2207 do { 2208 p3 = fText->moveIndex32(p3, 1); 2209 c3 = fText->char32At(p3); 2210 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2211 break; 2212 }; 2213 } 2214 while (fFormatSet->contains(c3) || fExtendSet->contains(c3)); 2215 2216 2217 if (p1 == p2) { 2218 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2219 continue; 2220 } 2221 if (p2 == fText->length()) { 2222 // Reached end of string. Always a break position. 2223 break; 2224 } 2225 2226 // Rule (3) CR x LF 2227 // No Extend or Format characters may appear between the CR and LF, 2228 // which requires the additional check for p2 immediately following p1. 2229 // 2230 if (c1==0x0D && c2==0x0A) { 2231 continue; 2232 } 2233 2234 // Rule (3a) Break before and after newlines (including CR and LF) 2235 // 2236 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 2237 break; 2238 }; 2239 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2240 break; 2241 }; 2242 2243 // Rule (5). 
(ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter) 2244 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2245 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2246 continue; 2247 } 2248 2249 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter) 2250 // 2251 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2252 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 2253 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) { 2254 continue; 2255 } 2256 2257 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter) 2258 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) && 2259 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2260 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2261 continue; 2262 } 2263 2264 // Rule (7a) Hebrew_Letter x Single_Quote 2265 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) { 2266 continue; 2267 } 2268 2269 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter 2270 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) { 2271 continue; 2272 } 2273 2274 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter 2275 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) { 2276 continue; 2277 } 2278 2279 // Rule (8) Numeric x Numeric 2280 if (fNumericSet->contains(c1) && 2281 fNumericSet->contains(c2)) { 2282 continue; 2283 } 2284 2285 // Rule (9) (ALetter | Hebrew_Letter) x Numeric 2286 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2287 fNumericSet->contains(c2)) { 2288 continue; 2289 } 2290 2291 // Rule (10) Numeric x (ALetter | Hebrew_Letter) 2292 if (fNumericSet->contains(c1) && 2293 
(fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2294 continue; 2295 } 2296 2297 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric 2298 if (fNumericSet->contains(c0) && 2299 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2300 fNumericSet->contains(c2)) { 2301 continue; 2302 } 2303 2304 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric 2305 if (fNumericSet->contains(c1) && 2306 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 2307 fNumericSet->contains(c3)) { 2308 continue; 2309 } 2310 2311 // Rule (13) Katakana x Katakana 2312 if (fKatakanaSet->contains(c1) && 2313 fKatakanaSet->contains(c2)) { 2314 continue; 2315 } 2316 2317 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet 2318 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) || 2319 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 2320 fExtendNumLetSet->contains(c2)) { 2321 continue; 2322 } 2323 2324 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana) 2325 if (fExtendNumLetSet->contains(c1) && 2326 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) || 2327 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) { 2328 continue; 2329 } 2330 2331 // Rule 13c 2332 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 2333 continue; 2334 } 2335 2336 // Rule 14. Break found here. 
2337 break; 2338 } 2339 2340 breakPos = p2; 2341 return breakPos; 2342} 2343 2344 2345UVector *RBBIWordMonkey::charClasses() { 2346 return fSets; 2347} 2348 2349 2350RBBIWordMonkey::~RBBIWordMonkey() { 2351 delete fSets; 2352 delete fCRSet; 2353 delete fLFSet; 2354 delete fNewlineSet; 2355 delete fKatakanaSet; 2356 delete fHebrew_LetterSet; 2357 delete fALetterSet; 2358 delete fSingle_QuoteSet; 2359 delete fDouble_QuoteSet; 2360 delete fMidNumLetSet; 2361 delete fMidLetterSet; 2362 delete fMidNumSet; 2363 delete fNumericSet; 2364 delete fFormatSet; 2365 delete fExtendSet; 2366 delete fExtendNumLetSet; 2367 delete fRegionalIndicatorSet; 2368 delete fDictionaryCjkSet; 2369 delete fOtherSet; 2370} 2371 2372 2373 2374 2375//------------------------------------------------------------------------------------------ 2376// 2377// class RBBISentMonkey Sentence Break specific implementation 2378// of RBBIMonkeyKind. 2379// 2380//------------------------------------------------------------------------------------------ 2381class RBBISentMonkey: public RBBIMonkeyKind { 2382public: 2383 RBBISentMonkey(); 2384 virtual ~RBBISentMonkey(); 2385 virtual UVector *charClasses(); 2386 virtual void setText(const UnicodeString &s); 2387 virtual int32_t next(int32_t i); 2388private: 2389 int moveBack(int posFrom); 2390 int moveForward(int posFrom); 2391 UChar32 cAt(int pos); 2392 2393 UVector *fSets; 2394 2395 UnicodeSet *fSepSet; 2396 UnicodeSet *fFormatSet; 2397 UnicodeSet *fSpSet; 2398 UnicodeSet *fLowerSet; 2399 UnicodeSet *fUpperSet; 2400 UnicodeSet *fOLetterSet; 2401 UnicodeSet *fNumericSet; 2402 UnicodeSet *fATermSet; 2403 UnicodeSet *fSContinueSet; 2404 UnicodeSet *fSTermSet; 2405 UnicodeSet *fCloseSet; 2406 UnicodeSet *fOtherSet; 2407 UnicodeSet *fExtendSet; 2408 2409 const UnicodeString *fText; 2410 2411}; 2412 2413RBBISentMonkey::RBBISentMonkey() 2414{ 2415 UErrorCode status = U_ZERO_ERROR; 2416 2417 fSets = new UVector(status); 2418 2419 // Separator Set Note: Beginning with 
Unicode 5.1, CR and LF were removed from the separator 2420 // set and made into character classes of their own. For the monkey impl, 2421 // they remain in SEP, since Sep always appears with CR and LF in the rules. 2422 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); 2423 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); 2424 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); 2425 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); 2426 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); 2427 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); 2428 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); 2429 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); 2430 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); 2431 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); 2432 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); 2433 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); 2434 fOtherSet = new UnicodeSet(); 2435 2436 if(U_FAILURE(status)) { 2437 deferredStatus = status; 2438 return; 2439 } 2440 2441 fOtherSet->complement(); 2442 fOtherSet->removeAll(*fSepSet); 2443 fOtherSet->removeAll(*fFormatSet); 2444 fOtherSet->removeAll(*fSpSet); 2445 fOtherSet->removeAll(*fLowerSet); 2446 fOtherSet->removeAll(*fUpperSet); 2447 fOtherSet->removeAll(*fOLetterSet); 2448 fOtherSet->removeAll(*fNumericSet); 2449 fOtherSet->removeAll(*fATermSet); 2450 fOtherSet->removeAll(*fSContinueSet); 2451 fOtherSet->removeAll(*fSTermSet); 2452 fOtherSet->removeAll(*fCloseSet); 2453 
fOtherSet->removeAll(*fExtendSet); 2454 2455 fSets->addElement(fSepSet, status); 2456 fSets->addElement(fFormatSet, status); 2457 fSets->addElement(fSpSet, status); 2458 fSets->addElement(fLowerSet, status); 2459 fSets->addElement(fUpperSet, status); 2460 fSets->addElement(fOLetterSet, status); 2461 fSets->addElement(fNumericSet, status); 2462 fSets->addElement(fATermSet, status); 2463 fSets->addElement(fSContinueSet, status); 2464 fSets->addElement(fSTermSet, status); 2465 fSets->addElement(fCloseSet, status); 2466 fSets->addElement(fOtherSet, status); 2467 fSets->addElement(fExtendSet, status); 2468 2469 if (U_FAILURE(status)) { 2470 deferredStatus = status; 2471 } 2472} 2473 2474 2475 2476void RBBISentMonkey::setText(const UnicodeString &s) { 2477 fText = &s; 2478} 2479 2480UVector *RBBISentMonkey::charClasses() { 2481 return fSets; 2482} 2483 2484 2485// moveBack() Find the "significant" code point preceding the index i. 2486// Skips over ($Extend | $Format)* . 2487// 2488int RBBISentMonkey::moveBack(int i) { 2489 if (i <= 0) { 2490 return -1; 2491 } 2492 UChar32 c; 2493 int32_t j = i; 2494 do { 2495 j = fText->moveIndex32(j, -1); 2496 c = fText->char32At(j); 2497 } 2498 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); 2499 return j; 2500 2501 } 2502 2503 2504int RBBISentMonkey::moveForward(int i) { 2505 if (i>=fText->length()) { 2506 return fText->length(); 2507 } 2508 UChar32 c; 2509 int32_t j = i; 2510 do { 2511 j = fText->moveIndex32(j, 1); 2512 c = cAt(j); 2513 } 2514 while (fFormatSet->contains(c) || fExtendSet->contains(c)); 2515 return j; 2516} 2517 2518UChar32 RBBISentMonkey::cAt(int pos) { 2519 if (pos<0 || pos>=fText->length()) { 2520 return -1; 2521 } else { 2522 return fText->char32At(pos); 2523 } 2524} 2525 2526int32_t RBBISentMonkey::next(int32_t prevPos) { 2527 int p0, p1, p2, p3; // Indices of the significant code points around the 2528 // break position being tested. The candidate break 2529 // location is before p2. 
2530 2531 int breakPos = -1; 2532 2533 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2534 UChar32 c; 2535 2536 if (U_FAILURE(deferredStatus)) { 2537 return -1; 2538 } 2539 2540 // Prev break at end of string. return DONE. 2541 if (prevPos >= fText->length()) { 2542 return -1; 2543 } 2544 p0 = p1 = p2 = p3 = prevPos; 2545 c3 = fText->char32At(prevPos); 2546 c0 = c1 = c2 = 0; 2547 (void)p0; // Suppress set but not used warning. 2548 2549 // Loop runs once per "significant" character position in the input text. 2550 for (;;) { 2551 // Move all of the positions forward in the input string. 2552 p0 = p1; c0 = c1; 2553 p1 = p2; c1 = c2; 2554 p2 = p3; c2 = c3; 2555 2556 // Advancd p3 by X(Extend | Format)* Rule 4 2557 p3 = moveForward(p3); 2558 c3 = cAt(p3); 2559 2560 // Rule (3) CR x LF 2561 if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 2562 continue; 2563 } 2564 2565 // Rule (4). Sep <break> 2566 if (fSepSet->contains(c1)) { 2567 p2 = p1+1; // Separators don't combine with Extend or Format. 2568 break; 2569 } 2570 2571 if (p2 >= fText->length()) { 2572 // Reached end of string. Always a break position. 2573 break; 2574 } 2575 2576 if (p2 == prevPos) { 2577 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2578 continue; 2579 } 2580 2581 // Rule (6). ATerm x Numeric 2582 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { 2583 continue; 2584 } 2585 2586 // Rule (7). Upper ATerm x Uppper 2587 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) { 2588 continue; 2589 } 2590 2591 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower 2592 // Note: STerm | ATerm are added to the negated part of the expression by a 2593 // note to the Unicode 5.0 documents. 
2594 int p8 = p1; 2595 while (fSpSet->contains(cAt(p8))) { 2596 p8 = moveBack(p8); 2597 } 2598 while (fCloseSet->contains(cAt(p8))) { 2599 p8 = moveBack(p8); 2600 } 2601 if (fATermSet->contains(cAt(p8))) { 2602 p8=p2; 2603 for (;;) { 2604 c = cAt(p8); 2605 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || 2606 fLowerSet->contains(c) || fSepSet->contains(c) || 2607 fATermSet->contains(c) || fSTermSet->contains(c)) { 2608 break; 2609 } 2610 p8 = moveForward(p8); 2611 } 2612 if (fLowerSet->contains(cAt(p8))) { 2613 continue; 2614 } 2615 } 2616 2617 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); 2618 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { 2619 p8 = p1; 2620 while (fSpSet->contains(cAt(p8))) { 2621 p8 = moveBack(p8); 2622 } 2623 while (fCloseSet->contains(cAt(p8))) { 2624 p8 = moveBack(p8); 2625 } 2626 c = cAt(p8); 2627 if (fSTermSet->contains(c) || fATermSet->contains(c)) { 2628 continue; 2629 } 2630 } 2631 2632 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) 2633 int p9 = p1; 2634 while (fCloseSet->contains(cAt(p9))) { 2635 p9 = moveBack(p9); 2636 } 2637 c = cAt(p9); 2638 if ((fSTermSet->contains(c) || fATermSet->contains(c))) { 2639 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { 2640 continue; 2641 } 2642 } 2643 2644 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) 2645 int p10 = p1; 2646 while (fSpSet->contains(cAt(p10))) { 2647 p10 = moveBack(p10); 2648 } 2649 while (fCloseSet->contains(cAt(p10))) { 2650 p10 = moveBack(p10); 2651 } 2652 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { 2653 if (fSpSet->contains(c2) || fSepSet->contains(c2)) { 2654 continue; 2655 } 2656 } 2657 2658 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? 
<break> 2659 int p11 = p1; 2660 if (fSepSet->contains(cAt(p11))) { 2661 p11 = moveBack(p11); 2662 } 2663 while (fSpSet->contains(cAt(p11))) { 2664 p11 = moveBack(p11); 2665 } 2666 while (fCloseSet->contains(cAt(p11))) { 2667 p11 = moveBack(p11); 2668 } 2669 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { 2670 break; 2671 } 2672 2673 // Rule (12) Any x Any 2674 continue; 2675 } 2676 breakPos = p2; 2677 return breakPos; 2678} 2679 2680RBBISentMonkey::~RBBISentMonkey() { 2681 delete fSets; 2682 delete fSepSet; 2683 delete fFormatSet; 2684 delete fSpSet; 2685 delete fLowerSet; 2686 delete fUpperSet; 2687 delete fOLetterSet; 2688 delete fNumericSet; 2689 delete fATermSet; 2690 delete fSContinueSet; 2691 delete fSTermSet; 2692 delete fCloseSet; 2693 delete fOtherSet; 2694 delete fExtendSet; 2695} 2696 2697 2698 2699//------------------------------------------------------------------------------------------- 2700// 2701// RBBILineMonkey 2702// 2703//------------------------------------------------------------------------------------------- 2704 2705class RBBILineMonkey: public RBBIMonkeyKind { 2706public: 2707 RBBILineMonkey(); 2708 virtual ~RBBILineMonkey(); 2709 virtual UVector *charClasses(); 2710 virtual void setText(const UnicodeString &s); 2711 virtual int32_t next(int32_t i); 2712 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar); 2713private: 2714 UVector *fSets; 2715 2716 UnicodeSet *fBK; 2717 UnicodeSet *fCR; 2718 UnicodeSet *fLF; 2719 UnicodeSet *fCM; 2720 UnicodeSet *fNL; 2721 UnicodeSet *fSG; 2722 UnicodeSet *fWJ; 2723 UnicodeSet *fZW; 2724 UnicodeSet *fGL; 2725 UnicodeSet *fCB; 2726 UnicodeSet *fSP; 2727 UnicodeSet *fB2; 2728 UnicodeSet *fBA; 2729 UnicodeSet *fBB; 2730 UnicodeSet *fHY; 2731 UnicodeSet *fH2; 2732 UnicodeSet *fH3; 2733 UnicodeSet *fCL; 2734 UnicodeSet *fCP; 2735 UnicodeSet *fEX; 2736 UnicodeSet *fIN; 2737 UnicodeSet *fJL; 2738 UnicodeSet *fJV; 2739 UnicodeSet *fJT; 2740 
UnicodeSet *fNS; 2741 UnicodeSet *fOP; 2742 UnicodeSet *fQU; 2743 UnicodeSet *fIS; 2744 UnicodeSet *fNU; 2745 UnicodeSet *fPO; 2746 UnicodeSet *fPR; 2747 UnicodeSet *fSY; 2748 UnicodeSet *fAI; 2749 UnicodeSet *fAL; 2750 UnicodeSet *fCJ; 2751 UnicodeSet *fHL; 2752 UnicodeSet *fID; 2753 UnicodeSet *fRI; 2754 UnicodeSet *fSA; 2755 UnicodeSet *fXX; 2756 2757 BreakIterator *fCharBI; 2758 const UnicodeString *fText; 2759 RegexMatcher *fNumberMatcher; 2760}; 2761 2762 2763RBBILineMonkey::RBBILineMonkey() 2764{ 2765 UErrorCode status = U_ZERO_ERROR; 2766 2767 fSets = new UVector(status); 2768 2769 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 2770 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 2771 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 2772 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 2773 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 2774 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 2775 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 2776 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 2777 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 2778 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 2779 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 2780 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 2781 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 2782 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 2783 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 2784 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 2785 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); 2786 fCP = 
new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); 2787 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 2788 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 2789 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 2790 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 2791 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 2792 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 2793 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 2794 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 2795 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 2796 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 2797 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 2798 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 2799 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 2800 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 2801 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 2802 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status); 2803 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status); 2804 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 2805 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status); 2806 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); 2807 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 2808 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 2809 2810 if (U_FAILURE(status)) { 2811 deferredStatus = status; 2812 fCharBI = NULL; 2813 fNumberMatcher = NULL; 2814 return; 
2815 } 2816 2817 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 2818 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 2819 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL 2820 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 2821 2822 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS. 2823 2824 fSets->addElement(fBK, status); 2825 fSets->addElement(fCR, status); 2826 fSets->addElement(fLF, status); 2827 fSets->addElement(fCM, status); 2828 fSets->addElement(fNL, status); 2829 fSets->addElement(fWJ, status); 2830 fSets->addElement(fZW, status); 2831 fSets->addElement(fGL, status); 2832 fSets->addElement(fCB, status); 2833 fSets->addElement(fSP, status); 2834 fSets->addElement(fB2, status); 2835 fSets->addElement(fBA, status); 2836 fSets->addElement(fBB, status); 2837 fSets->addElement(fHY, status); 2838 fSets->addElement(fH2, status); 2839 fSets->addElement(fH3, status); 2840 fSets->addElement(fCL, status); 2841 fSets->addElement(fCP, status); 2842 fSets->addElement(fEX, status); 2843 fSets->addElement(fIN, status); 2844 fSets->addElement(fJL, status); 2845 fSets->addElement(fJT, status); 2846 fSets->addElement(fJV, status); 2847 fSets->addElement(fNS, status); 2848 fSets->addElement(fOP, status); 2849 fSets->addElement(fQU, status); 2850 fSets->addElement(fIS, status); 2851 fSets->addElement(fNU, status); 2852 fSets->addElement(fPO, status); 2853 fSets->addElement(fPR, status); 2854 fSets->addElement(fSY, status); 2855 fSets->addElement(fAI, status); 2856 fSets->addElement(fAL, status); 2857 fSets->addElement(fHL, status); 2858 fSets->addElement(fID, status); 2859 fSets->addElement(fWJ, status); 2860 fSets->addElement(fRI, status); 2861 fSets->addElement(fSA, status); 2862 fSets->addElement(fSG, status); 2863 2864 const char *rules = 2865 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" 2866 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 
        "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
        "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
        "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
        "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";

    fNumberMatcher = new RegexMatcher(
        UnicodeString(rules, -1, US_INV), 0, status);

    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);

    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}


//
//  setText
//         Select the text that subsequent calls to next() will scan.
//         Only the address of s is stored in fText; no copy is made here, so the
//         string must remain valid for as long as next() is being called on it.
//         The character break iterator and the number matcher are re-targeted
//         at the same text.
//
void RBBILineMonkey::setText(const UnicodeString &s) {
    fText       = &s;
    fCharBI->setText(s);
    fNumberMatcher->reset(s);
}

//
//  rule9Adjust
//     Line Break TR rules 9 and 10 implementation.
//     This deals with combining marks and other sequences that
//     must be treated as if they were something other than what they actually are.
//
//     This is factored out into a separate function because it must be applied twice for
//     each potential break, once to the chars before the position being checked, then
//     again to the text following the possible break.
//
//     Parameters:
//        pos      - index of the character being adjusted.  -1 means "no character yet";
//                   the function then returns without touching any of the outputs.
//        posChar  - in/out.  The character at pos.  Replaced by 'A' (0x41) when it is
//                   itself a Line Break CM character (LB 10).
//        nextPos  - in/out.  Index of the character following pos.  Advanced over any
//                   CM characters, unless *posChar is SP, BK, CR, LF, NL or ZW.
//        nextChar - out.  Always set to the character at the updated *nextPos.
//
void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
    if (pos == -1) {
        // Invalid initial position.  Happens during the warmup iteration of the
        //   main loop in next().
        return;
    }

    int32_t  nPos = *nextPos;

    // LB 9  Keep combining sequences together.
    //  advance over any CM class chars.  Note that Line Break CM is different
    //  from the normal Grapheme Extend property.
    if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
          *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
        for (;;) {
            *nextChar = fText->char32At(nPos);
            if (!fCM->contains(*nextChar)) {
                break;
            }
            nPos = fText->moveIndex32(nPos, 1);
        }
    }


    // LB 9 Treat X CM* as if it were x.
    //       No explicit action required.

    // LB 10  Treat any remaining combining mark as AL
    if (fCM->contains(*posChar)) {
        *posChar = 0x41;   // thisChar = 'A';
    }

    // Push the updated nextPos and nextChar back to our caller.
    // This only makes a difference if posChar got bigger by consuming a
    // combining sequence.
    *nextPos  = nPos;
    *nextChar = fText->char32At(nPos);
}



//
//  next
//     Reference ("monkey") implementation of the line break rules.  Starting at
//     startPos, walks the text set by setText() one position at a time and applies
//     the rules in order; each rule either rejects a break at the current position
//     ("continue") or accepts it ("break").
//
//     Returns the index of the character following the break that was found.
//     Reaching the end of the text counts as a break (LB 2).
//     Returns -1 if construction of this object failed (deferredStatus) or if
//     startPos is at or beyond the end of the text.
//
int32_t RBBILineMonkey::next(int32_t startPos) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t    pos;       //  Index of the char following a potential break position
    UChar32    thisChar;  //  Character at above position "pos"

    int32_t    prevPos;   //  Index of the char preceding a potential break position
    UChar32    prevChar;  //  Character at above position.  Note that prevChar
                          //   and thisChar may not be adjacent because combining
                          //   characters between them will be ignored.

    int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
    UChar32    prevCharX2;

    int32_t    nextPos;   //  Index of the next character following pos.
                          //     Usually skips over combining marks.
    int32_t    nextCPPos; //  Index of the code point following "pos."
                          //     May point to a combining mark.
                          //  NOTE(review): assigned below but never read in this function.
    int32_t    tPos;      //  temp value.
    UChar32    c;

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    if (startPos >= fText->length()) {
        return -1;
    }


    // Initial values for loop.  Loop will run the first time without finding breaks,
    //                           while the invalid values shift out and the "this" and
    //                           "prev" positions are filled in with good values.
    pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
    thisChar = prevChar  = prevCharX2 = 0;
    nextPos  = nextCPPos = startPos;


    // Loop runs once per position in the test text, until a break position
    //  is found.
    for (;;) {
        prevPosX2 = prevPos;
        prevCharX2 = prevChar;

        prevPos   = pos;
        prevChar  = thisChar;

        pos       = nextPos;
        thisChar  = fText->char32At(pos);

        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;

        // Rule LB2 - Break at end of text.
        if (pos >= fText->length()) {
            break;
        }

        // Rule LB 9 - adjust for combining sequences.
        //             We do this one out-of-order because the adjustment does not change anything
        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
        //             be applied.
        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
        c = fText->char32At(nextPos);
        rule9Adjust(pos,     &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
        //   -1 positions out of prevPos yet - loop back to advance the
        //    position in the input without any further looking for breaks.
        if (prevPos == -1) {
            continue;
        }

        // LB 4  Always break after hard line breaks,
        if (fBK->contains(prevChar)) {
            break;
        }

        // LB 5  Break after CR, LF, NL, but not inside CR LF
        if (prevChar == 0x0d && thisChar == 0x0a) {
            continue;
        }
        if (prevChar == 0x0d ||
            prevChar == 0x0a ||
            prevChar == 0x85)  {
            break;
        }

        // LB 6  Don't break before hard line breaks
        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
            fBK->contains(thisChar)) {
                continue;
        }


        // LB 7  Don't break before spaces or zero-width space.
        if (fSP->contains(thisChar)) {
            continue;
        }

        if (fZW->contains(thisChar)) {
            continue;
        }

        // LB 8  Break after zero width space
        if (fZW->contains(prevChar)) {
            break;
        }

        // LB 9, 10  Already done, at top of loop.
        //


        // LB 11  Do not break before or after WORD JOINER and related characters.
        //    x  WJ
        //    WJ  x
        //
        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
            continue;
        }

        // LB 12
        //    GL  x
        if (fGL->contains(prevChar)) {
            continue;
        }

        // LB 12a
        //    [^SP BA HY] x GL
        if (!(fSP->contains(prevChar) ||
              fBA->contains(prevChar) ||
              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
            continue;
        }



        // LB 13  Don't break before closings.
        //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
        //        fall through to the more general number regular expression (LB 25 below).
        //
        if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
            (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
                                         fEX->contains(thisChar)  ||
            (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
            (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
            continue;
        }

        // LB 14 Don't break after OP SP*
        //       Scan backwards, checking for this sequence.
        //       The OP char could include combining marks, so we actually check for
        //           OP CM* SP*
        //       Another Twist: The Rule 67 fixes may have changed a SP CM
        //       sequence into a ID char, so before scanning back through spaces,
        //       verify that prevChar is indeed a space.  The prevChar variable
        //       may differ from fText[prevPos]
        //       NOTE(review): "Rule 67" looks like an older rule numbering; the
        //       substitution made by rule9Adjust() above is 'A', not an ID char.  Confirm.
        tPos = prevPos;
        if (fSP->contains(prevChar)) {
            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
        }
        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
            tPos=fText->moveIndex32(tPos, -1);
        }
        if (fOP->contains(fText->char32At(tPos))) {
            continue;
        }


        // LB 15    QU SP* x OP
        if (fOP->contains(thisChar)) {
            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
            // NOTE(review): this local tPos shadows the function-level tPos declared above.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fQU->contains(fText->char32At(tPos))) {
                continue;
            }
        }



        // LB 16   (CL | CP) SP* x NS
        //    Scan backwards for SP* CM* (CL | CP)
        if (fNS->contains(thisChar)) {
            // NOTE(review): this local tPos shadows the function-level tPos declared above.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 17        B2 SP* x B2
        if (fB2->contains(thisChar)) {
            //  Scan backwards, checking for the B2 CM* SP* sequence.
            tPos = prevPos;
            if (fSP->contains(prevChar)) {
                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                    tPos=fText->moveIndex32(tPos, -1);
                }
            }
            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
            if (fB2->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 18    break after space
        if (fSP->contains(prevChar)) {
            break;
        }

        // LB 19
        //    x   QU
        //    QU  x
        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
            continue;
        }

        // LB 20  Break around a CB
        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
            break;
        }

        // LB 21
        if (fBA->contains(thisChar) ||
            fHY->contains(thisChar) ||
            fNS->contains(thisChar) ||
            fBB->contains(prevChar) )   {
            continue;
        }

        // LB 21a
        //   HL (HY | BA) x
        if (fHL->contains(prevCharX2) &&
                (fHY->contains(prevChar) || fBA->contains(prevChar))) {
            continue;
        }

        // LB 21b
        //   SY x HL
        if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
            continue;
        }

        // LB 22
        if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
            (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
            (fID->contains(prevChar) && fIN->contains(thisChar)) ||
            (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
            (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
            continue;
        }


        // LB 23    ID x PO
        //          AL x NU
        //          HL x NU
        //          NU x AL
        if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
            (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
            (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
            (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
            (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
            continue;
        }

        // LB 24  Do not break between prefix and letters or ideographs.
        //         PR x ID
        //         PR x (AL | HL)
        //         PO x (AL | HL)
        if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
            (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
            (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
            continue;
        }



        // LB 25    Numbers
        if (fNumberMatcher->lookingAt(prevPos, status)) {
            if (U_FAILURE(status)) {
                break;
            }
            // Matched a number.  But could have been just a single digit, which would
            //    not represent a "no break here" between prevChar and thisChar
            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
            if (numEndIdx > pos) {
                // Number match includes at least our two chars being checked
                if (numEndIdx > nextPos) {
                    // Number match includes additional chars.  Update pos and nextPos
                    //   so that next loop iteration will continue at the end of the number,
                    //   checking for breaks between last char in number & whatever follows.
                    pos = nextPos = numEndIdx;
                    do {
                        pos = fText->moveIndex32(pos, -1);
                        thisChar = fText->char32At(pos);
                    } while (fCM->contains(thisChar));
                }
                continue;
            }
        }


        // LB 26 Do not break a Korean syllable.
        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
                                        fJV->contains(thisChar) ||
                                        fH2->contains(thisChar) ||
                                        fH3->contains(thisChar))) {
                                            continue;
                                        }

        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
                continue;
        }

        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
            fJT->contains(thisChar)) {
                continue;
        }

        // LB 27 Treat a Korean Syllable Block the same as ID.
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fIN->contains(thisChar)) {
                continue;
            }
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fPO->contains(thisChar)) {
                continue;
            }
        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
                continue;
            }



        // LB 28  Do not break between alphabetics ("at").
        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            continue;
        }

        // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
        if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            continue;
        }

        // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
        //          (AL | NU) x OP
        //          CP x (AL | NU)
        if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
            continue;
        }
        if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
            continue;
        }

        // LB30a  Do not break between regional indicators.
        //           RI x RI
        if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
            continue;
        }

        // LB 31    Break everywhere else
        break;

    }

    return pos;
}


//
//  charClasses
//         Returns the fSets vector.  The caller does not take ownership;
//         the destructor below deletes it.
//
UVector  *RBBILineMonkey::charClasses() {
    return fSets;
}


//
//  Destructor
//         Deletes the fSets vector, each of the individual character-class sets,
//         the character break iterator and the number matcher.
//
RBBILineMonkey::~RBBILineMonkey() {
    delete fSets;

    delete fBK;
    delete fCR;
    delete fLF;
    delete fCM;
    delete fNL;
    delete fWJ;
    delete fZW;
    delete fGL;
    delete fCB;
    delete fSP;
    delete fB2;
    delete fBA;
    delete fBB;
    delete fHY;
    delete fH2;
    delete fH3;
    delete fCL;
    delete fCP;
    delete fEX;
    delete fIN;
    delete fJL;
    delete fJV;
    delete fJT;
    delete fNS;
    delete fOP;
    delete fQU;
    delete fIS;
    delete fNU;
    delete fPO;
    delete fPR;
    delete fSY;
    delete fAI;
    delete fAL;
    delete fCJ;
    delete fHL;
    delete fID;
    delete fRI;
    delete fSA;
    delete fSG;
    delete fXX;

    delete fCharBI;
    delete fNumberMatcher;
}


//-------------------------------------------------------------------------------------------
//
//   TestMonkey
//
//     params
//       seed=nnnnn        Random number starting seed.
//                         Setting the seed allows errors to be reproduced.
//       loop=nnn          Looping count.  Controls running time.
//                         -1:  run forever.
//                          0 or greater:  run length.
3394// 3395// type = char | word | line | sent | title 3396// 3397//------------------------------------------------------------------------------------------- 3398 3399static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 3400 int32_t val = defaultVal; 3401 name.append(" *= *(-?\\d+)"); 3402 UErrorCode status = U_ZERO_ERROR; 3403 RegexMatcher m(name, params, 0, status); 3404 if (m.find()) { 3405 // The param exists. Convert the string to an int. 3406 char valString[100]; 3407 int32_t paramLength = m.end(1, status) - m.start(1, status); 3408 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 3409 paramLength = (int32_t)(sizeof(valString)-2); 3410 } 3411 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 3412 val = strtol(valString, NULL, 10); 3413 3414 // Delete this parameter from the params string. 3415 m.reset(); 3416 params = m.replaceFirst("", status); 3417 } 3418 U_ASSERT(U_SUCCESS(status)); 3419 return val; 3420} 3421#endif 3422 3423#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3424static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 3425 BreakIterator *bi, 3426 int expected[], 3427 int expectedcount) 3428{ 3429 int count = 0; 3430 int i = 0; 3431 int forward[50]; 3432 bi->setText(ustr); 3433 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3434 forward[count] = i; 3435 if (count < expectedcount && expected[count] != i) { 3436 test->errln("break forward test failed: expected %d but got %d", 3437 expected[count], i); 3438 break; 3439 } 3440 count ++; 3441 } 3442 if (count != expectedcount) { 3443 printStringBreaks(ustr, expected, expectedcount); 3444 test->errln("break forward test failed: missed %d match", 3445 expectedcount - count); 3446 return; 3447 } 3448 // testing boundaries 3449 for (i = 1; i < expectedcount; i ++) { 3450 int j = expected[i - 1]; 3451 if (!bi->isBoundary(j)) { 3452 printStringBreaks(ustr, expected, expectedcount); 3453 test->errln("isBoundary() 
failed. Expected boundary at position %d", j); 3454 return; 3455 } 3456 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 3457 if (bi->isBoundary(j)) { 3458 printStringBreaks(ustr, expected, expectedcount); 3459 test->errln("isBoundary() failed. Not expecting boundary at position %d", j); 3460 return; 3461 } 3462 } 3463 } 3464 3465 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3466 count --; 3467 if (forward[count] != i) { 3468 printStringBreaks(ustr, expected, expectedcount); 3469 test->errln("happy break test previous() failed: expected %d but got %d", 3470 forward[count], i); 3471 break; 3472 } 3473 } 3474 if (count != 0) { 3475 printStringBreaks(ustr, expected, expectedcount); 3476 test->errln("break test previous() failed: missed a match"); 3477 return; 3478 } 3479 3480 // testing preceding 3481 for (i = 0; i < expectedcount - 1; i ++) { 3482 // int j = expected[i] + 1; 3483 int j = ustr.moveIndex32(expected[i], 1); 3484 for (; j <= expected[i + 1]; j ++) { 3485 if (bi->preceding(j) != expected[i]) { 3486 printStringBreaks(ustr, expected, expectedcount); 3487 test->errln("preceding(): Not expecting boundary at position %d", j); 3488 return; 3489 } 3490 } 3491 } 3492} 3493#endif 3494 3495void RBBITest::TestWordBreaks(void) 3496{ 3497#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3498 3499 Locale locale("en"); 3500 UErrorCode status = U_ZERO_ERROR; 3501 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3502 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3503 // Replaced any C+J characters in a row with a random sequence of characters 3504 // of the same length to make our C+J segmentation not get in the way. 
3505 static const char *strlist[] = 3506 { 3507 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 3508 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", 3509 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 3510 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 3511 "\\uac00\\u3588\\u009c\\u0953\\u194b", 3512 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3513 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 3514 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 3515 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3516 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3517 "\\u2027\\U000e0067\\u0a47\\u00b7", 3518 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3519 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3520 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3521 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 3522 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3523 "\\u0027\\u11af\\U000e0057\\u0602", 3524 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3525 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3526 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3527 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3528 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3529 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 3530 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3531 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3532 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3533 "\\u18f4\\U000e0049\\u20e7\\u2027", 3534 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3535 "\\ua183\\u102d\\u0bec\\u003a", 3536 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3537 "\\u003a\\u0e57\\u0fad\\u002e", 3538 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3539 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3540 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 3541 
"\\u003a\\u0664\\u00b7\\u1fba", 3542 "\\u003b\\u0027\\u00b7\\u47a3", 3543 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 3544 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 3545 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 3546 }; 3547 int loop; 3548 if (U_FAILURE(status)) { 3549 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3550 return; 3551 } 3552 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3553 // printf("looping %d\n", loop); 3554 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 3555 // RBBICharMonkey monkey; 3556 RBBIWordMonkey monkey; 3557 3558 int expected[50]; 3559 int expectedcount = 0; 3560 3561 monkey.setText(ustr); 3562 int i; 3563 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3564 expected[expectedcount ++] = i; 3565 } 3566 3567 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3568 } 3569 delete bi; 3570#endif 3571} 3572 3573void RBBITest::TestWordBoundary(void) 3574{ 3575 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 3576 Locale locale("en"); 3577 UErrorCode status = U_ZERO_ERROR; 3578 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3579 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3580 UChar str[50]; 3581 static const char *strlist[] = 3582 { 3583 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3584 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3585 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3586 "\\u2027\\U000e0067\\u0a47\\u00b7", 3587 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3588 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3589 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3590 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3591 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3592 "\\u0027\\u11af\\U000e0057\\u0602", 3593 
"\\U0001d7f2\\U000e007\\u0004\\u0589", 3594 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3595 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3596 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3597 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3598 "\\U000e0065\\u302c\\u09ee\\U000e0068", 3599 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3600 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3601 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3602 "\\u58f4\\U000e0049\\u20e7\\u2027", 3603 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3604 "\\ua183\\u102d\\u0bec\\u003a", 3605 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3606 "\\u003a\\u0e57\\u0fad\\u002e", 3607 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3608 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3609 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 3610 "\\u003a\\u0664\\u00b7\\u1fba", 3611 "\\u003b\\u0027\\u00b7\\u47a3", 3612 }; 3613 int loop; 3614 if (U_FAILURE(status)) { 3615 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3616 return; 3617 } 3618 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3619 // printf("looping %d\n", loop); 3620 u_unescape(strlist[loop], str, 20); 3621 UnicodeString ustr(str); 3622 int forward[50]; 3623 int count = 0; 3624 3625 bi->setText(ustr); 3626 int prev = 0; 3627 int i; 3628 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3629 forward[count ++] = i; 3630 if (i > prev) { 3631 int j; 3632 for (j = prev + 1; j < i; j ++) { 3633 if (bi->isBoundary(j)) { 3634 printStringBreaks(ustr, forward, count); 3635 errln("happy boundary test failed: expected %d not a boundary", 3636 j); 3637 return; 3638 } 3639 } 3640 } 3641 if (!bi->isBoundary(i)) { 3642 printStringBreaks(ustr, forward, count); 3643 errln("happy boundary test failed: expected %d a boundary", 3644 i); 3645 return; 3646 } 3647 prev = i; 3648 } 3649 } 3650 delete 
bi; 3651} 3652 3653void RBBITest::TestLineBreaks(void) 3654{ 3655#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3656 Locale locale("en"); 3657 UErrorCode status = U_ZERO_ERROR; 3658 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3659 const int32_t STRSIZE = 50; 3660 UChar str[STRSIZE]; 3661 static const char *strlist[] = 3662 { 3663 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 3664 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 3665 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 3666 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 3667 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 3668 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 3669 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3670 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 3671 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3672 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 3673 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", 3674 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 3675 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 3676 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 3677 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 3678 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 3679 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 3680 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 3681 
"\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 3682 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 3683 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 3684 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 3685 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 3686 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 3687 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 3688 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 3689 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", 3690 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 3691 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 3692 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 3693 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 3694 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 3695 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", 3696 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 3697 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 3698 "\\u2014\\u0020\\u000a\\u17c5\\u24fc", 3699 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 3700 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 3701 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 3702 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 3703 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 3704 
"\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 3705 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" 3706 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" 3707 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", 3708 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 3709 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 3710 }; 3711 int loop; 3712 TEST_ASSERT_SUCCESS(status); 3713 if (U_FAILURE(status)) { 3714 return; 3715 } 3716 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3717 // printf("looping %d\n", loop); 3718 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 3719 if (t >= STRSIZE) { 3720 TEST_ASSERT(FALSE); 3721 continue; 3722 } 3723 3724 3725 UnicodeString ustr(str); 3726 RBBILineMonkey monkey; 3727 if (U_FAILURE(monkey.deferredStatus)) { 3728 continue; 3729 } 3730 3731 const int EXPECTEDSIZE = 50; 3732 int expected[EXPECTEDSIZE]; 3733 int expectedcount = 0; 3734 3735 monkey.setText(ustr); 3736 int i; 3737 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3738 if (expectedcount >= EXPECTEDSIZE) { 3739 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3740 return; 3741 } 3742 expected[expectedcount ++] = i; 3743 } 3744 3745 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3746 } 3747 delete bi; 3748#endif 3749} 3750 3751void RBBITest::TestSentBreaks(void) 3752{ 3753#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3754 Locale locale("en"); 3755 UErrorCode status = U_ZERO_ERROR; 3756 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 3757 UChar str[200]; 3758 static const char *strlist[] = 3759 { 3760 "Now\ris\nthe\r\ntime\n\rfor\r\r", 3761 "This\n", 3762 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 3763 "\"Sentence ending with a quote.\" Bye.", 3764 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 3765 "Hi! This is a simple sample sentence. 
(This is it.) This is a simple sample sentence. \"This isn't it.\"", 3766 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 3767 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 3768 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 3769 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!", 3770 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 3771 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 3772 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 3773 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 3774 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 3775 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 3776 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 3777 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 3778 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 3779 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 3780 }; 3781 int loop; 3782 if (U_FAILURE(status)) { 3783 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3784 return; 3785 } 3786 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3787 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0]))); 3788 UnicodeString ustr(str); 3789 3790 RBBISentMonkey monkey; 3791 if (U_FAILURE(monkey.deferredStatus)) { 3792 continue; 3793 } 3794 3795 const int EXPECTEDSIZE = 50; 3796 int expected[EXPECTEDSIZE]; 3797 int expectedcount = 0; 3798 3799 monkey.setText(ustr); 3800 int i; 3801 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3802 if (expectedcount >= EXPECTEDSIZE) { 3803 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3804 return; 3805 } 3806 expected[expectedcount ++] 
= i; 3807 } 3808 3809 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3810 } 3811 delete bi; 3812#endif 3813} 3814 3815void RBBITest::TestMonkey(char *params) { 3816#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3817 3818 UErrorCode status = U_ZERO_ERROR; 3819 int32_t loopCount = 500; 3820 int32_t seed = 1; 3821 UnicodeString breakType = "all"; 3822 Locale locale("en"); 3823 UBool useUText = FALSE; 3824 3825 if (quick == FALSE) { 3826 loopCount = 10000; 3827 } 3828 3829 if (params) { 3830 UnicodeString p(params); 3831 loopCount = getIntParam("loop", p, loopCount); 3832 seed = getIntParam("seed", p, seed); 3833 3834 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 3835 if (m.find()) { 3836 breakType = m.group(1, status); 3837 m.reset(); 3838 p = m.replaceFirst("", status); 3839 } 3840 3841 RegexMatcher u(" *utext", p, 0, status); 3842 if (u.find()) { 3843 useUText = TRUE; 3844 u.reset(); 3845 p = u.replaceFirst("", status); 3846 } 3847 3848 3849 // m.reset(p); 3850 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 3851 // Each option is stripped out of the option string as it is processed. 3852 // All options have been checked. The option string should have been completely emptied.. 
3853 char buf[100]; 3854 p.extract(buf, sizeof(buf), NULL, status); 3855 buf[sizeof(buf)-1] = 0; 3856 errln("Unrecognized or extra parameter: %s\n", buf); 3857 return; 3858 } 3859 3860 } 3861 3862 if (breakType == "char" || breakType == "all") { 3863 RBBICharMonkey m; 3864 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3865 if (U_SUCCESS(status)) { 3866 RunMonkey(bi, m, "char", seed, loopCount, useUText); 3867 if (breakType == "all" && useUText==FALSE) { 3868 // Also run a quick test with UText when "all" is specified 3869 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 3870 } 3871 } 3872 else { 3873 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 3874 } 3875 delete bi; 3876 } 3877 3878 if (breakType == "word" || breakType == "all") { 3879 logln("Word Break Monkey Test"); 3880 RBBIWordMonkey m; 3881 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3882 if (U_SUCCESS(status)) { 3883 RunMonkey(bi, m, "word", seed, loopCount, useUText); 3884 } 3885 else { 3886 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status)); 3887 } 3888 delete bi; 3889 } 3890 3891 if (breakType == "line" || breakType == "all") { 3892 logln("Line Break Monkey Test"); 3893 RBBILineMonkey m; 3894 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3895 if (loopCount >= 10) { 3896 loopCount = loopCount / 5; // Line break runs slower than the others. 
3897 } 3898 if (U_SUCCESS(status)) { 3899 RunMonkey(bi, m, "line", seed, loopCount, useUText); 3900 } 3901 else { 3902 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 3903 } 3904 delete bi; 3905 } 3906 3907 if (breakType == "sent" || breakType == "all" ) { 3908 logln("Sentence Break Monkey Test"); 3909 RBBISentMonkey m; 3910 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 3911 if (loopCount >= 10) { 3912 loopCount = loopCount / 10; // Sentence runs slower than the other break types 3913 } 3914 if (U_SUCCESS(status)) { 3915 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 3916 } 3917 else { 3918 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 3919 } 3920 delete bi; 3921 } 3922 3923#endif 3924} 3925 3926// 3927// Run a RBBI monkey test. Common routine, for all break iterator types. 3928// Parameters: 3929// bi - the break iterator to use 3930// mk - MonkeyKind, abstraction for obtaining expected results 3931// name - Name of test (char, word, etc.) 
for use in error messages 3932// seed - Seed for starting random number generator (parameter from user) 3933// numIterations 3934// 3935void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 3936 int32_t numIterations, UBool useUText) { 3937 3938#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3939 3940 const int32_t TESTSTRINGLEN = 500; 3941 UnicodeString testText; 3942 int32_t numCharClasses; 3943 UVector *chClasses; 3944 int expected[TESTSTRINGLEN*2 + 1]; 3945 int expectedCount = 0; 3946 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 3947 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 3948 char reverseBreaks[TESTSTRINGLEN*2+1]; 3949 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 3950 char followingBreaks[TESTSTRINGLEN*2+1]; 3951 char precedingBreaks[TESTSTRINGLEN*2+1]; 3952 int i; 3953 int loopCount = 0; 3954 3955 m_seed = seed; 3956 3957 numCharClasses = mk.charClasses()->size(); 3958 chClasses = mk.charClasses(); 3959 3960 // Check for errors that occured during the construction of the MonkeyKind object. 3961 // Can't report them where they occured because errln() is a method coming from intlTest, 3962 // and is not visible outside of RBBITest :-( 3963 if (U_FAILURE(mk.deferredStatus)) { 3964 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 3965 return; 3966 } 3967 3968 // Verify that the character classes all have at least one member. 3969 for (i=0; i<numCharClasses; i++) { 3970 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 3971 if (s == NULL || s->size() == 0) { 3972 errln("Character Class #%d is null or of zero size.", i); 3973 return; 3974 } 3975 } 3976 3977 while (loopCount < numIterations || numIterations == -1) { 3978 if (numIterations == -1 && loopCount % 10 == 0) { 3979 // If test is running in an infinite loop, display a periodic tic so 3980 // we can tell that it is making progress. 
3981 fprintf(stderr, "."); 3982 } 3983 // Save current random number seed, so that we can recreate the random numbers 3984 // for this loop iteration in event of an error. 3985 seed = m_seed; 3986 3987 // Populate a test string with data. 3988 testText.truncate(0); 3989 for (i=0; i<TESTSTRINGLEN; i++) { 3990 int32_t aClassNum = m_rand() % numCharClasses; 3991 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 3992 int32_t charIdx = m_rand() % classSet->size(); 3993 UChar32 c = classSet->charAt(charIdx); 3994 if (c < 0) { // TODO: deal with sets containing strings. 3995 errln("c < 0"); 3996 break; 3997 } 3998 testText.append(c); 3999 } 4000 4001 // Calculate the expected results for this test string. 4002 mk.setText(testText); 4003 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 4004 expectedBreaks[0] = 1; 4005 int32_t breakPos = 0; 4006 expectedCount = 0; 4007 for (;;) { 4008 breakPos = mk.next(breakPos); 4009 if (breakPos == -1) { 4010 break; 4011 } 4012 if (breakPos > testText.length()) { 4013 errln("breakPos > testText.length()"); 4014 } 4015 expectedBreaks[breakPos] = 1; 4016 U_ASSERT(expectedCount<testText.length()); 4017 expected[expectedCount ++] = breakPos; 4018 (void)expected; // Set but not used warning. 4019 // TODO (andy): check it out. 4020 } 4021 4022 // Find the break positions using forward iteration 4023 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 4024 if (useUText) { 4025 UErrorCode status = U_ZERO_ERROR; 4026 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 4027 // testUText = utext_openUnicodeString(testUText, &testText, &status); 4028 bi->setText(testUText, status); 4029 TEST_ASSERT_SUCCESS(status); 4030 utext_close(testUText); // The break iterator does a shallow clone of the UText 4031 // This UText can be closed immediately, so long as the 4032 // testText string continues to exist. 
4033 } else { 4034 bi->setText(testText); 4035 } 4036 4037 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 4038 if (i < 0 || i > testText.length()) { 4039 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4040 break; 4041 } 4042 forwardBreaks[i] = 1; 4043 } 4044 4045 // Find the break positions using reverse iteration 4046 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 4047 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 4048 if (i < 0 || i > testText.length()) { 4049 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4050 break; 4051 } 4052 reverseBreaks[i] = 1; 4053 } 4054 4055 // Find the break positions using isBoundary() tests. 4056 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 4057 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 4058 for (i=0; i<=testText.length(); i++) { 4059 isBoundaryBreaks[i] = bi->isBoundary(i); 4060 } 4061 4062 4063 // Find the break positions using the following() function. 
4064 // printf("."); 4065 memset(followingBreaks, 0, sizeof(followingBreaks)); 4066 int32_t lastBreakPos = 0; 4067 followingBreaks[0] = 1; 4068 for (i=0; i<testText.length(); i++) { 4069 breakPos = bi->following(i); 4070 if (breakPos <= i || 4071 breakPos < lastBreakPos || 4072 breakPos > testText.length() || 4073 (breakPos > lastBreakPos && lastBreakPos > i)) { 4074 UChar32 brkChar = testText.char32At(lastBreakPos); 4075 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests 4076 errln("%s break monkey test: " 4077 "Out of range value returned by BreakIterator::following().\n" 4078 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 4079 name, seed, i, breakPos, lastBreakPos); 4080 } 4081 break; 4082 } 4083 followingBreaks[breakPos] = 1; 4084 lastBreakPos = breakPos; 4085 } 4086 4087 // Find the break positions using the preceding() function. 4088 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4089 lastBreakPos = testText.length(); 4090 precedingBreaks[testText.length()] = 1; 4091 for (i=testText.length(); i>0; i--) { 4092 breakPos = bi->preceding(i); 4093 if (breakPos >= i || 4094 breakPos > lastBreakPos || 4095 (breakPos < 0 && testText.getChar32Start(i)>0) || 4096 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) { 4097 UChar32 brkChar = testText.char32At(breakPos); 4098 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests 4099 errln("%s break monkey test: " 4100 "Out of range value returned by BreakIterator::preceding().\n" 4101 "index=%d; prev returned %d; lastBreak=%d" , 4102 name, i, breakPos, lastBreakPos); 4103 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 4104 precedingBreaks[i] = 2; // Forces an error. 
4105 } 4106 } 4107 } else { 4108 if (breakPos >= 0) { 4109 precedingBreaks[breakPos] = 1; 4110 } 4111 lastBreakPos = breakPos; 4112 } 4113 } 4114 4115 // Compare the expected and actual results. 4116 for (i=0; i<=testText.length(); i++) { 4117 const char *errorType = NULL; 4118 if (forwardBreaks[i] != expectedBreaks[i]) { 4119 errorType = "next()"; 4120 } else if (reverseBreaks[i] != forwardBreaks[i]) { 4121 errorType = "previous()"; 4122 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4123 errorType = "isBoundary()"; 4124 } else if (followingBreaks[i] != expectedBreaks[i]) { 4125 errorType = "following()"; 4126 } else if (precedingBreaks[i] != expectedBreaks[i]) { 4127 errorType = "preceding()"; 4128 } 4129 4130 4131 if (errorType != NULL) { 4132 // Format a range of the test text that includes the failure as 4133 // a data item that can be included in the rbbi test data file. 4134 4135 // Start of the range is the last point where expected and actual results 4136 // both agreed that there was a break position. 4137 int startContext = i; 4138 int32_t count = 0; 4139 for (;;) { 4140 if (startContext==0) { break; } 4141 startContext --; 4142 if (expectedBreaks[startContext] != 0) { 4143 if (count == 2) break; 4144 count ++; 4145 } 4146 } 4147 4148 // End of range is two expected breaks past the start position. 4149 int endContext = i + 1; 4150 int ci; 4151 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 
4152 for (;;) { 4153 if (endContext >= testText.length()) {break;} 4154 if (expectedBreaks[endContext-1] != 0) { 4155 if (count == 0) break; 4156 count --; 4157 } 4158 endContext ++; 4159 } 4160 } 4161 4162 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4163 UnicodeString errorText = "<data>"; 4164 /***if (strcmp(errorType, "next()") == 0) { 4165 startContext = 0; 4166 endContext = testText.length(); 4167 4168 printStringBreaks(testText, expected, expectedCount); 4169 }***/ 4170 4171 for (ci=startContext; ci<endContext;) { 4172 UnicodeString hexChars("0123456789abcdef"); 4173 UChar32 c; 4174 int bn; 4175 c = testText.char32At(ci); 4176 if (ci == i) { 4177 // This is the location of the error. 4178 errorText.append("<?>"); 4179 } else if (expectedBreaks[ci] != 0) { 4180 // This a non-error expected break position. 4181 errorText.append("\\"); 4182 } 4183 if (c < 0x10000) { 4184 errorText.append("\\u"); 4185 for (bn=12; bn>=0; bn-=4) { 4186 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4187 } 4188 } else { 4189 errorText.append("\\U"); 4190 for (bn=28; bn>=0; bn-=4) { 4191 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4192 } 4193 } 4194 ci = testText.moveIndex32(ci, 1); 4195 } 4196 errorText.append("\\"); 4197 errorText.append("</data>\n"); 4198 4199 // Output the error 4200 char charErrorTxt[500]; 4201 UErrorCode status = U_ZERO_ERROR; 4202 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4203 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4204 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status); 4205 4206 UChar32 brkChar = testText.char32At(i); 4207 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests 4208 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4209 name, badLocale, (expectedBreaks[i]? 
"break expected but not found" : "break found but not expected"), 4210 errorType, seed, i, charErrorTxt); 4211 } 4212 break; 4213 } 4214 } 4215 4216 loopCount++; 4217 } 4218#endif 4219} 4220 4221 4222// Bug 5532. UTF-8 based UText fails in dictionary code. 4223// This test checks the initial patch, 4224// which is to just keep it from crashing. Correct word boundaries 4225// await a proper fix to the dictionary code. 4226// 4227void RBBITest::TestBug5532(void) { 4228 // Text includes a mixture of Thai and Latin. 4229 const unsigned char utf8Data[] = { 4230 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, 4231 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, 4232 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u, 4233 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 4234 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u, 4235 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, 4236 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, 4237 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, 4238 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 4239 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, 4240 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00}; 4241 4242 UErrorCode status = U_ZERO_ERROR; 4243 UText utext=UTEXT_INITIALIZER; 4244 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status); 4245 TEST_ASSERT_SUCCESS(status); 4246 4247 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status); 4248 TEST_ASSERT_SUCCESS(status); 4249 if (U_SUCCESS(status)) { 4250 bi->setText(&utext, status); 4251 TEST_ASSERT_SUCCESS(status); 4252 4253 int32_t breakCount = 0; 4254 int32_t previousBreak = -1; 4255 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) { 4256 // For now, just make sure that the break iterator doesn't hang. 
4257 TEST_ASSERT(previousBreak < bi->current()); 4258 previousBreak = bi->current(); 4259 } 4260 TEST_ASSERT(breakCount > 0); 4261 } 4262 delete bi; 4263 utext_close(&utext); 4264} 4265 4266 4267void RBBITest::TestBug9983(void) { 4268 UnicodeString text = UnicodeString("\\u002A" // * Other 4269 "\\uFF65" // Other 4270 "\\u309C" // Katakana 4271 "\\uFF9F" // Extend 4272 "\\uFF65" // Other 4273 "\\u0020" // Other 4274 "\\u0000").unescape(); 4275 4276 UErrorCode status = U_ZERO_ERROR; 4277 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>( 4278 BreakIterator::createWordInstance(Locale::getRoot(), status))); 4279 TEST_ASSERT_SUCCESS(status); 4280 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>( 4281 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status))); 4282 TEST_ASSERT_SUCCESS(status); 4283 if (U_FAILURE(status)) { 4284 return; 4285 } 4286 int32_t offset, rstatus, iterationCount; 4287 4288 brkiter->setText(text); 4289 brkiter->last(); 4290 iterationCount = 0; 4291 while ( (offset = brkiter->previous()) != UBRK_DONE ) { 4292 iterationCount++; 4293 rstatus = brkiter->getRuleStatus(); 4294 (void)rstatus; // Suppress set but not used warning. 4295 if (iterationCount >= 10) { 4296 break; 4297 } 4298 } 4299 TEST_ASSERT(iterationCount == 6); 4300 4301 brkiterPOSIX->setText(text); 4302 brkiterPOSIX->last(); 4303 iterationCount = 0; 4304 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) { 4305 iterationCount++; 4306 rstatus = brkiterPOSIX->getRuleStatus(); 4307 (void)rstatus; // Suppress set but not used warning. 4308 if (iterationCount >= 10) { 4309 break; 4310 } 4311 } 4312 TEST_ASSERT(iterationCount == 6); 4313} 4314 4315 4316// 4317// TestDebug - A place-holder test for debugging purposes. 4318// For putting in fragments of other tests that can be invoked 4319// for tracing without a lot of unwanted extra stuff happening. 
4320// 4321void RBBITest::TestDebug(void) { 4322#if 0 4323 UErrorCode status = U_ZERO_ERROR; 4324 int pos = 0; 4325 int ruleStatus = 0; 4326 4327 RuleBasedBreakIterator* bi = 4328 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 4329 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status); 4330 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 4331 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e"); 4332 // UnicodeString s("Aaa. Bcd"); 4333 s = s.unescape(); 4334 bi->setText(s); 4335 UBool r = bi->isBoundary(8); 4336 printf("%s", r?"true":"false"); 4337 return; 4338 pos = bi->last(); 4339 do { 4340 // ruleStatus = bi->getRuleStatus(); 4341 printf("%d\t%d\n", pos, ruleStatus); 4342 pos = bi->previous(); 4343 } while (pos != BreakIterator::DONE); 4344#endif 4345} 4346 4347void RBBITest::TestProperties() { 4348 UErrorCode errorCode = U_ZERO_ERROR; 4349 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode); 4350 if (!prependSet.isEmpty()) { 4351 errln( 4352 "[:GCB=Prepend:] is not empty any more. " 4353 "Uncomment relevant lines in source/data/brkitr/char.txt and " 4354 "change this test to the opposite condition."); 4355 } 4356} 4357 4358#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 4359