1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2012, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7//
8//   regextst.cpp
9//
10//      ICU Regular Expressions test, part of intltest.
11//
12
13/*
14     NOTE!!
15
16     PLEASE be careful about ASCII assumptions in this test.
17     This test is one of the worst repeat offenders.
18     If you have questions, contact someone on the ICU PMC
19     who has access to an EBCDIC system.
20
21 */
22
23#include "intltest.h"
24#if !UCONFIG_NO_REGULAR_EXPRESSIONS
25
26#include "unicode/regex.h"
27#include "unicode/uchar.h"
28#include "unicode/ucnv.h"
29#include "unicode/uniset.h"
30#include "unicode/ustring.h"
31#include "regextst.h"
32#include "uvector.h"
33#include "util.h"
34#include <stdlib.h>
35#include <string.h>
36#include <stdio.h>
37#include "cstring.h"
38#include "uinvchar.h"
39
40#define SUPPORT_MUTATING_INPUT_STRING   0
41
42//---------------------------------------------------------------------------
43//
44//  Test class boilerplate
45//
46//---------------------------------------------------------------------------
47RegexTest::RegexTest()
48{
49}
50
51
52RegexTest::~RegexTest()
53{
54}
55
56
57
58void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
59{
60    if (exec) logln("TestSuite RegexTest: ");
61    switch (index) {
62
63        case 0: name = "Basic";
64            if (exec) Basic();
65            break;
66        case 1: name = "API_Match";
67            if (exec) API_Match();
68            break;
69        case 2: name = "API_Replace";
70            if (exec) API_Replace();
71            break;
72        case 3: name = "API_Pattern";
73            if (exec) API_Pattern();
74            break;
75        case 4:
76#if !UCONFIG_NO_FILE_IO
77            name = "Extended";
78            if (exec) Extended();
79#else
80            name = "skip";
81#endif
82            break;
83        case 5: name = "Errors";
84            if (exec) Errors();
85            break;
86        case 6: name = "PerlTests";
87            if (exec) PerlTests();
88            break;
89        case 7: name = "Callbacks";
90            if (exec) Callbacks();
91            break;
92        case 8: name = "FindProgressCallbacks";
93            if (exec) FindProgressCallbacks();
94            break;
95        case 9: name = "Bug 6149";
96             if (exec) Bug6149();
97             break;
98        case 10: name = "UTextBasic";
99          if (exec) UTextBasic();
100          break;
101        case 11: name = "API_Match_UTF8";
102          if (exec) API_Match_UTF8();
103          break;
104        case 12: name = "API_Replace_UTF8";
105          if (exec) API_Replace_UTF8();
106          break;
107        case 13: name = "API_Pattern_UTF8";
108          if (exec) API_Pattern_UTF8();
109          break;
110        case 14: name = "PerlTestsUTF8";
111          if (exec) PerlTestsUTF8();
112          break;
113        case 15: name = "PreAllocatedUTextCAPI";
114          if (exec) PreAllocatedUTextCAPI();
115          break;
116        case 16: name = "Bug 7651";
117             if (exec) Bug7651();
118             break;
119        case 17: name = "Bug 7740";
120            if (exec) Bug7740();
121            break;
122        case 18: name = "Bug 8479";
123            if (exec) Bug8479();
124            break;
125        case 19: name = "Bug 7029";
126            if (exec) Bug7029();
127            break;
128        case 20: name = "CheckInvBufSize";
129            if (exec) CheckInvBufSize();
130            break;
131        case 21: name = "Bug 9283";
132            if (exec) Bug9283();
133            break;
134
135        default: name = "";
136            break; //needed to end loop
137    }
138}
139
140
141
142/**
143 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
144 * into ASCII.
145 * @see utext_openUTF8
146 */
147static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
148
149//---------------------------------------------------------------------------
150//
151//   Error Checking / Reporting macros used in all of the tests.
152//
153//---------------------------------------------------------------------------
154
155static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
156  int64_t oldIndex = utext_getNativeIndex(text);
157  utext_setNativeIndex(text, 0);
158  char *bufPtr = buf;
159  UChar32 c = utext_next32From(text, 0);
160  while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
161    if (0x000020<=c && c<0x00007e) {
162      *bufPtr = c;
163    } else {
164#if 0
165      sprintf(bufPtr,"U+%04X", c);
166      bufPtr+= strlen(bufPtr)-1;
167#else
168      *bufPtr = '%';
169#endif
170    }
171    bufPtr++;
172    c = UTEXT_NEXT32(text);
173  }
174  *bufPtr = 0;
175#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
176  char *ebuf = (char*)malloc(bufLen);
177  uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
178  uprv_strncpy(buf, ebuf, bufLen);
179  free((void*)ebuf);
180#endif
181  utext_setNativeIndex(text, oldIndex);
182}
183
184
185static char ASSERT_BUF[1024];
186
187const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
188  if(message.length()==0) {
189    strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
190  } else {
191    UnicodeString buf;
192    IntlTest::prettify(message,buf);
193    if(buf.length()==0) {
194      strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
195    } else {
196      buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
197      if(ASSERT_BUF[0]==0) {
198        ASSERT_BUF[0]=0;
199        for(int32_t i=0;i<buf.length();i++) {
200          UChar ch = buf[i];
201          sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
202        }
203      }
204    }
205  }
206  ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
207  return ASSERT_BUF;
208}
209
210
// Logs the printable form of a UText together with the invoking file and line.
#define REGEX_VERBOSE_TEXT(text) { \
    char buf[200]; \
    utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
}

// Reports a data error and returns from the enclosing function if the local
// variable 'status' holds a failure code.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure.  status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
}

// Reports an error if expr evaluates to FALSE; execution continues afterwards.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", \
              __FILE__, __LINE__, #expr); \
    } \
}

// Evaluates expr against a fresh local 'status' (which expr is expected to
// reference) and reports a data error unless status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
}

// Like REGEX_CHECK_STATUS, but also reports the caller-supplied line and does
// not return from the enclosing function.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n", \
              __LINE__, (line), status); \
    } \
}

// Reports an error (including the caller-supplied line) and returns from the
// enclosing function if expr evaluates to FALSE.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}

// Reports an error if ustr does not compare equal to inv.
#define REGEX_ASSERT_UNISTR(ustr,inv) { \
    if (!(ustr==inv)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", \
              __FILE__, __LINE__, extractToAssertBuf(ustr),inv); \
    } \
}
229
230
231static UBool testUTextEqual(UText *uta, UText *utb) {
232    UChar32 ca = 0;
233    UChar32 cb = 0;
234    utext_setNativeIndex(uta, 0);
235    utext_setNativeIndex(utb, 0);
236    do {
237        ca = utext_next32(uta);
238        cb = utext_next32(utb);
239        if (ca != cb) {
240            break;
241        }
242    } while (ca != U_SENTINEL);
243    return ca == cb;
244}
245
246
247/**
248 * @param expected expected text in UTF-8 (not platform) codepage
249 */
250void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
251    UErrorCode status = U_ZERO_ERROR;
252    UText expectedText = UTEXT_INITIALIZER;
253    utext_openUTF8(&expectedText, expected, -1, &status);
254    if(U_FAILURE(status)) {
255      errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
256      return;
257    }
258    if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
259      errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
260      return;
261    }
262    utext_setNativeIndex(actual, 0);
263    if (!testUTextEqual(&expectedText, actual)) {
264        char buf[201 /*21*/];
265        char expectedBuf[201];
266        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
267        utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
268        errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
269    }
270    utext_close(&expectedText);
271}
272/**
273 * @param expected invariant (platform local text) input
274 */
275
276void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
277    UErrorCode status = U_ZERO_ERROR;
278    UText expectedText = UTEXT_INITIALIZER;
279    regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
280    if(U_FAILURE(status)) {
281      errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
282      return;
283    }
284    utext_setNativeIndex(actual, 0);
285    if (!testUTextEqual(&expectedText, actual)) {
286        char buf[201 /*21*/];
287        char expectedBuf[201];
288        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
289        utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
290        errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
291    }
292    utext_close(&expectedText);
293}
294
295/**
296 * Assumes utf-8 input
297 */
298#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
299/**
300 * Assumes Invariant input
301 */
302#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
303
304/**
305 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
306 * passed into utext_openUTF8. An error will be given if
307 * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
308 */
309
310#define INV_BUFSIZ 2048 /* increase this if too small */
311
312static int64_t inv_next=0;
313
314#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
315static char inv_buf[INV_BUFSIZ];
316#endif
317
318static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
319  if(length==-1) length=strlen(inv);
320#if U_CHARSET_FAMILY==U_ASCII_FAMILY
321  inv_next+=length;
322  return utext_openUTF8(ut, inv, length, status);
323#else
324  if(inv_next+length+1>INV_BUFSIZ) {
325    fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
326            __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
327    *status = U_MEMORY_ALLOCATION_ERROR;
328    return NULL;
329  }
330
331  unsigned char *buf = (unsigned char*)inv_buf+inv_next;
332  uprv_aestrncpy(buf, (const uint8_t*)inv, length);
333  inv_next+=length;
334
335#if 0
336  fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
337#endif
338
339  return utext_openUTF8(ut, (const char*)buf, length, status);
340#endif
341}
342
343
344//---------------------------------------------------------------------------
345//
346//    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
347//                       for the LookingAt() and  Match() functions.
348//
349//       usage:
350//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
351//
352//          The expected results are UBool - TRUE or FALSE.
353//          The input text is unescaped.  The pattern is not.
354//
355//
356//---------------------------------------------------------------------------
357
358#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
359
360UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
361    const UnicodeString pattern(pat, -1, US_INV);
362    const UnicodeString inputText(text, -1, US_INV);
363    UErrorCode          status  = U_ZERO_ERROR;
364    UParseError         pe;
365    RegexPattern        *REPattern = NULL;
366    RegexMatcher        *REMatcher = NULL;
367    UBool               retVal     = TRUE;
368
369    UnicodeString patString(pat, -1, US_INV);
370    REPattern = RegexPattern::compile(patString, 0, pe, status);
371    if (U_FAILURE(status)) {
372        dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
373            line, u_errorName(status));
374        return FALSE;
375    }
376    if (line==376) { RegexPatternDump(REPattern);}
377
378    UnicodeString inputString(inputText);
379    UnicodeString unEscapedInput = inputString.unescape();
380    REMatcher = REPattern->matcher(unEscapedInput, status);
381    if (U_FAILURE(status)) {
382        errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
383            line, u_errorName(status));
384        return FALSE;
385    }
386
387    UBool actualmatch;
388    actualmatch = REMatcher->lookingAt(status);
389    if (U_FAILURE(status)) {
390        errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
391            line, u_errorName(status));
392        retVal =  FALSE;
393    }
394    if (actualmatch != looking) {
395        errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
396        retVal = FALSE;
397    }
398
399    status = U_ZERO_ERROR;
400    actualmatch = REMatcher->matches(status);
401    if (U_FAILURE(status)) {
402        errln("RegexTest failure in matches() at line %d.  Status = %s\n",
403            line, u_errorName(status));
404        retVal = FALSE;
405    }
406    if (actualmatch != match) {
407        errln("RegexTest: wrong return from matches() at line %d.\n", line);
408        retVal = FALSE;
409    }
410
411    if (retVal == FALSE) {
412        RegexPatternDump(REPattern);
413    }
414
415    delete REPattern;
416    delete REMatcher;
417    return retVal;
418}
419
420
421UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
422    UText               pattern    = UTEXT_INITIALIZER;
423    int32_t             inputUTF8Length;
424    char                *textChars = NULL;
425    UText               inputText  = UTEXT_INITIALIZER;
426    UErrorCode          status     = U_ZERO_ERROR;
427    UParseError         pe;
428    RegexPattern        *REPattern = NULL;
429    RegexMatcher        *REMatcher = NULL;
430    UBool               retVal     = TRUE;
431
432    regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
433    REPattern = RegexPattern::compile(&pattern, 0, pe, status);
434    if (U_FAILURE(status)) {
435        dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
436            line, u_errorName(status));
437        return FALSE;
438    }
439
440    UnicodeString inputString(text, -1, US_INV);
441    UnicodeString unEscapedInput = inputString.unescape();
442    LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
443    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
444
445    inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
446    if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
447        // UTF-8 does not allow unpaired surrogates, so this could actually happen
448        logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
449        return TRUE; // not a failure of the Regex engine
450    }
451    status = U_ZERO_ERROR; // buffer overflow
452    textChars = new char[inputUTF8Length+1];
453    unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
454    utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
455
456    REMatcher = &REPattern->matcher(status)->reset(&inputText);
457    if (U_FAILURE(status)) {
458        errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
459            line, u_errorName(status));
460        return FALSE;
461    }
462
463    UBool actualmatch;
464    actualmatch = REMatcher->lookingAt(status);
465    if (U_FAILURE(status)) {
466        errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
467            line, u_errorName(status));
468        retVal =  FALSE;
469    }
470    if (actualmatch != looking) {
471        errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
472        retVal = FALSE;
473    }
474
475    status = U_ZERO_ERROR;
476    actualmatch = REMatcher->matches(status);
477    if (U_FAILURE(status)) {
478        errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
479            line, u_errorName(status));
480        retVal = FALSE;
481    }
482    if (actualmatch != match) {
483        errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
484        retVal = FALSE;
485    }
486
487    if (retVal == FALSE) {
488        RegexPatternDump(REPattern);
489    }
490
491    delete REPattern;
492    delete REMatcher;
493    utext_close(&inputText);
494    utext_close(&pattern);
495    delete[] textChars;
496    return retVal;
497}
498
499
500
501//---------------------------------------------------------------------------
502//
503//    REGEX_ERR       Macro + invocation function to simplify writing tests
504//                       regex tests for incorrect patterns
505//
506//       usage:
507//          REGEX_ERR("pattern",   expected error line, column, expected status);
508//
509//---------------------------------------------------------------------------
510#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
511
512void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
513                          UErrorCode expectedStatus, int32_t line) {
514    UnicodeString       pattern(pat);
515
516    UErrorCode          status         = U_ZERO_ERROR;
517    UParseError         pe;
518    RegexPattern        *callerPattern = NULL;
519
520    //
521    //  Compile the caller's pattern
522    //
523    UnicodeString patString(pat);
524    callerPattern = RegexPattern::compile(patString, 0, pe, status);
525    if (status != expectedStatus) {
526        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
527    } else {
528        if (status != U_ZERO_ERROR) {
529            if (pe.line != errLine || pe.offset != errCol) {
530                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
531                    line, errLine, errCol, pe.line, pe.offset);
532            }
533        }
534    }
535
536    delete callerPattern;
537
538    //
539    //  Compile again, using a UTF-8-based UText
540    //
541    UText patternText = UTEXT_INITIALIZER;
542    regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
543    callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
544    if (status != expectedStatus) {
545        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
546    } else {
547        if (status != U_ZERO_ERROR) {
548            if (pe.line != errLine || pe.offset != errCol) {
549                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
550                    line, errLine, errCol, pe.line, pe.offset);
551            }
552        }
553    }
554
555    delete callerPattern;
556    utext_close(&patternText);
557}
558
559
560
561//---------------------------------------------------------------------------
562//
563//      Basic      Check for basic functionality of regex pattern matching.
564//                 Avoid the use of REGEX_FIND test macro, which has
565//                 substantial dependencies on basic Regex functionality.
566//
567//---------------------------------------------------------------------------
568void RegexTest::Basic() {
569
570
571//
572// Debug - slide failing test cases early
573//
574#if 0
575    {
576        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
577        UParseError pe;
578        UErrorCode  status = U_ZERO_ERROR;
579        RegexPattern *pattern;
580        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
581        RegexPatternDump(pattern);
582        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
583        UBool result = m->find();
584        printf("result = %d\n", result);
585        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
586        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
587    }
588    exit(1);
589#endif
590
591
592    //
593    // Pattern with parentheses
594    //
595    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
596    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
597    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
598
599    //
600    // Patterns with *
601    //
602    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
603    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
604    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
605    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
606    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
607
608    REGEX_TESTLM("a*", "",  TRUE, TRUE);
609    REGEX_TESTLM("a*", "b", TRUE, FALSE);
610
611
612    //
613    //  Patterns with "."
614    //
615    REGEX_TESTLM(".", "abc", TRUE, FALSE);
616    REGEX_TESTLM("...", "abc", TRUE, TRUE);
617    REGEX_TESTLM("....", "abc", FALSE, FALSE);
618    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
619    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
620    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
621    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
622    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
623
624    //
625    //  Patterns with * applied to chars at end of literal string
626    //
627    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
628    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
629
630    //
631    //  Supplemental chars match as single chars, not a pair of surrogates.
632    //
633    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
634    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
635    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
636
637
638    //
639    //  UnicodeSets in the pattern
640    //
641    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
642    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
643    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
644    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
645    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
646    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
647
648    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
649    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
650    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
651    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
652    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
653
654    //
655    //   OR operator in patterns
656    //
657    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
658    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
659    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
660    REGEX_TESTLM("a|b", "b", TRUE, TRUE);
661
662    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
663    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
664    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
665    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
666    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
667    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
668
669    //
670    //  +
671    //
672    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
673    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
674    REGEX_TESTLM("b+", "", FALSE, FALSE);
675    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
676    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
677    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
678
679    //
680    //   ?
681    //
682    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
683    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
684    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
685    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
686    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
687    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
688    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
689    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
690    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
691
692    //
693    //  Escape sequences that become single literal chars, handled internally
694    //   by ICU's Unescape.
695    //
696
697    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
698    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
699    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
700    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
701    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
702    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
703    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
704    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
705    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
706    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
707
708    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
709    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
710
711    // Escape of special chars in patterns
712    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
713}
714
715
716//---------------------------------------------------------------------------
717//
718//    UTextBasic   Check for quirks that are specific to the UText
719//                 implementation.
720//
721//---------------------------------------------------------------------------
722void RegexTest::UTextBasic() {
723    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
724    UErrorCode status = U_ZERO_ERROR;
725    UText pattern = UTEXT_INITIALIZER;
726    utext_openUTF8(&pattern, str_abc, -1, &status);
727    RegexMatcher matcher(&pattern, 0, status);
728    REGEX_CHECK_STATUS;
729
730    UText input = UTEXT_INITIALIZER;
731    utext_openUTF8(&input, str_abc, -1, &status);
732    REGEX_CHECK_STATUS;
733    matcher.reset(&input);
734    REGEX_CHECK_STATUS;
735    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
736
737    matcher.reset(matcher.inputText());
738    REGEX_CHECK_STATUS;
739    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
740
741    utext_close(&pattern);
742    utext_close(&input);
743}
744
745
746//---------------------------------------------------------------------------
747//
748//      API_Match   Test that the API for class RegexMatcher
749//                  is present and nominally working, but excluding functions
750//                  implementing replace operations.
751//
752//---------------------------------------------------------------------------
753void RegexTest::API_Match() {
754    UParseError         pe;
755    UErrorCode          status=U_ZERO_ERROR;
756    int32_t             flags = 0;
757
758    //
759    // Debug - slide failing test cases early
760    //
761#if 0
762    {
763    }
764    return;
765#endif
766
767    //
768    // Simple pattern compilation
769    //
770    {
771        UnicodeString       re("abc");
772        RegexPattern        *pat2;
773        pat2 = RegexPattern::compile(re, flags, pe, status);
774        REGEX_CHECK_STATUS;
775
776        UnicodeString inStr1 = "abcdef this is a test";
777        UnicodeString instr2 = "not abc";
778        UnicodeString empty  = "";
779
780
781        //
782        // Matcher creation and reset.
783        //
784        RegexMatcher *m1 = pat2->matcher(inStr1, status);
785        REGEX_CHECK_STATUS;
786        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
787        REGEX_ASSERT(m1->input() == inStr1);
788        m1->reset(instr2);
789        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
790        REGEX_ASSERT(m1->input() == instr2);
791        m1->reset(inStr1);
792        REGEX_ASSERT(m1->input() == inStr1);
793        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
794        m1->reset(empty);
795        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
796        REGEX_ASSERT(m1->input() == empty);
797        REGEX_ASSERT(&m1->pattern() == pat2);
798
799        //
800        //  reset(pos, status)
801        //
802        m1->reset(inStr1);
803        m1->reset(4, status);
804        REGEX_CHECK_STATUS;
805        REGEX_ASSERT(m1->input() == inStr1);
806        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
807
808        m1->reset(-1, status);
809        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
810        status = U_ZERO_ERROR;
811
812        m1->reset(0, status);
813        REGEX_CHECK_STATUS;
814        status = U_ZERO_ERROR;
815
816        int32_t len = m1->input().length();
817        m1->reset(len-1, status);
818        REGEX_CHECK_STATUS;
819        status = U_ZERO_ERROR;
820
821        m1->reset(len, status);
822        REGEX_CHECK_STATUS;
823        status = U_ZERO_ERROR;
824
825        m1->reset(len+1, status);
826        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
827        status = U_ZERO_ERROR;
828
829        //
830        // match(pos, status)
831        //
832        m1->reset(instr2);
833        REGEX_ASSERT(m1->matches(4, status) == TRUE);
834        m1->reset();
835        REGEX_ASSERT(m1->matches(3, status) == FALSE);
836        m1->reset();
837        REGEX_ASSERT(m1->matches(5, status) == FALSE);
838        REGEX_ASSERT(m1->matches(4, status) == TRUE);
839        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
840        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
841
842        // Match() at end of string should fail, but should not
843        //  be an error.
844        status = U_ZERO_ERROR;
845        len = m1->input().length();
846        REGEX_ASSERT(m1->matches(len, status) == FALSE);
847        REGEX_CHECK_STATUS;
848
849        // Match beyond end of string should fail with an error.
850        status = U_ZERO_ERROR;
851        REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
852        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
853
854        // Successful match at end of string.
855        {
856            status = U_ZERO_ERROR;
857            RegexMatcher m("A?", 0, status);  // will match zero length string.
858            REGEX_CHECK_STATUS;
859            m.reset(inStr1);
860            len = inStr1.length();
861            REGEX_ASSERT(m.matches(len, status) == TRUE);
862            REGEX_CHECK_STATUS;
863            m.reset(empty);
864            REGEX_ASSERT(m.matches(0, status) == TRUE);
865            REGEX_CHECK_STATUS;
866        }
867
868
869        //
870        // lookingAt(pos, status)
871        //
872        status = U_ZERO_ERROR;
873        m1->reset(instr2);  // "not abc"
874        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
875        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
876        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
877        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
878        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
879        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
880        status = U_ZERO_ERROR;
881        len = m1->input().length();
882        REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
883        REGEX_CHECK_STATUS;
884        REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
885        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
886
887        delete m1;
888        delete pat2;
889    }
890
891
892    //
893    // Capture Group.
894    //     RegexMatcher::start();
895    //     RegexMatcher::end();
896    //     RegexMatcher::groupCount();
897    //
898    {
899        int32_t             flags=0;
900        UParseError         pe;
901        UErrorCode          status=U_ZERO_ERROR;
902
903        UnicodeString       re("01(23(45)67)(.*)");
904        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
905        REGEX_CHECK_STATUS;
906        UnicodeString data = "0123456789";
907
908        RegexMatcher *matcher = pat->matcher(data, status);
909        REGEX_CHECK_STATUS;
910        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
911        static const int32_t matchStarts[] = {0,  2, 4, 8};
912        static const int32_t matchEnds[]   = {10, 8, 6, 10};
913        int32_t i;
914        for (i=0; i<4; i++) {
915            int32_t actualStart = matcher->start(i, status);
916            REGEX_CHECK_STATUS;
917            if (actualStart != matchStarts[i]) {
918                errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
919                    __LINE__, i, matchStarts[i], actualStart);
920            }
921            int32_t actualEnd = matcher->end(i, status);
922            REGEX_CHECK_STATUS;
923            if (actualEnd != matchEnds[i]) {
924                errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
925                    __LINE__, i, matchEnds[i], actualEnd);
926            }
927        }
928
929        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
930        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
931
932        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
933        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
934        matcher->reset();
935        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
936
937        matcher->lookingAt(status);
938        REGEX_ASSERT(matcher->group(status)    == "0123456789");
939        REGEX_ASSERT(matcher->group(0, status) == "0123456789");
940        REGEX_ASSERT(matcher->group(1, status) == "234567"    );
941        REGEX_ASSERT(matcher->group(2, status) == "45"        );
942        REGEX_ASSERT(matcher->group(3, status) == "89"        );
943        REGEX_CHECK_STATUS;
944        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
945        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
946        matcher->reset();
947        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
948
949        delete matcher;
950        delete pat;
951
952    }
953
954    //
955    //  find
956    //
957    {
958        int32_t             flags=0;
959        UParseError         pe;
960        UErrorCode          status=U_ZERO_ERROR;
961
962        UnicodeString       re("abc");
963        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
964        REGEX_CHECK_STATUS;
965        UnicodeString data = ".abc..abc...abc..";
966        //                    012345678901234567
967
968        RegexMatcher *matcher = pat->matcher(data, status);
969        REGEX_CHECK_STATUS;
970        REGEX_ASSERT(matcher->find());
971        REGEX_ASSERT(matcher->start(status) == 1);
972        REGEX_ASSERT(matcher->find());
973        REGEX_ASSERT(matcher->start(status) == 6);
974        REGEX_ASSERT(matcher->find());
975        REGEX_ASSERT(matcher->start(status) == 12);
976        REGEX_ASSERT(matcher->find() == FALSE);
977        REGEX_ASSERT(matcher->find() == FALSE);
978
979        matcher->reset();
980        REGEX_ASSERT(matcher->find());
981        REGEX_ASSERT(matcher->start(status) == 1);
982
983        REGEX_ASSERT(matcher->find(0, status));
984        REGEX_ASSERT(matcher->start(status) == 1);
985        REGEX_ASSERT(matcher->find(1, status));
986        REGEX_ASSERT(matcher->start(status) == 1);
987        REGEX_ASSERT(matcher->find(2, status));
988        REGEX_ASSERT(matcher->start(status) == 6);
989        REGEX_ASSERT(matcher->find(12, status));
990        REGEX_ASSERT(matcher->start(status) == 12);
991        REGEX_ASSERT(matcher->find(13, status) == FALSE);
992        REGEX_ASSERT(matcher->find(16, status) == FALSE);
993        REGEX_ASSERT(matcher->find(17, status) == FALSE);
994        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
995
996        status = U_ZERO_ERROR;
997        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
998        status = U_ZERO_ERROR;
999        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1000
1001        REGEX_ASSERT(matcher->groupCount() == 0);
1002
1003        delete matcher;
1004        delete pat;
1005    }
1006
1007
1008    //
1009    //  find, with \G in pattern (true if at the end of a previous match).
1010    //
1011    {
1012        int32_t             flags=0;
1013        UParseError         pe;
1014        UErrorCode          status=U_ZERO_ERROR;
1015
1016        UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1017        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1018        REGEX_CHECK_STATUS;
1019        UnicodeString data = ".abcabc.abc..";
1020        //                    012345678901234567
1021
1022        RegexMatcher *matcher = pat->matcher(data, status);
1023        REGEX_CHECK_STATUS;
1024        REGEX_ASSERT(matcher->find());
1025        REGEX_ASSERT(matcher->start(status) == 0);
1026        REGEX_ASSERT(matcher->start(1, status) == -1);
1027        REGEX_ASSERT(matcher->start(2, status) == 1);
1028
1029        REGEX_ASSERT(matcher->find());
1030        REGEX_ASSERT(matcher->start(status) == 4);
1031        REGEX_ASSERT(matcher->start(1, status) == 4);
1032        REGEX_ASSERT(matcher->start(2, status) == -1);
1033        REGEX_CHECK_STATUS;
1034
1035        delete matcher;
1036        delete pat;
1037    }
1038
1039    //
1040    //   find with zero length matches, match position should bump ahead
1041    //     to prevent loops.
1042    //
1043    {
1044        int32_t                 i;
1045        UErrorCode          status=U_ZERO_ERROR;
1046        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1047                                                      //   using an always-true look-ahead.
1048        REGEX_CHECK_STATUS;
1049        UnicodeString s("    ");
1050        m.reset(s);
1051        for (i=0; ; i++) {
1052            if (m.find() == FALSE) {
1053                break;
1054            }
1055            REGEX_ASSERT(m.start(status) == i);
1056            REGEX_ASSERT(m.end(status) == i);
1057        }
1058        REGEX_ASSERT(i==5);
1059
1060        // Check that the bump goes over surrogate pairs OK
1061        s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1062        s = s.unescape();
1063        m.reset(s);
1064        for (i=0; ; i+=2) {
1065            if (m.find() == FALSE) {
1066                break;
1067            }
1068            REGEX_ASSERT(m.start(status) == i);
1069            REGEX_ASSERT(m.end(status) == i);
1070        }
1071        REGEX_ASSERT(i==10);
1072    }
1073    {
1074        // find() loop breaking test.
1075        //        with pattern of /.?/, should see a series of one char matches, then a single
1076        //        match of zero length at the end of the input string.
1077        int32_t                 i;
1078        UErrorCode          status=U_ZERO_ERROR;
1079        RegexMatcher        m(".?", 0, status);
1080        REGEX_CHECK_STATUS;
1081        UnicodeString s("    ");
1082        m.reset(s);
1083        for (i=0; ; i++) {
1084            if (m.find() == FALSE) {
1085                break;
1086            }
1087            REGEX_ASSERT(m.start(status) == i);
1088            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1089        }
1090        REGEX_ASSERT(i==5);
1091    }
1092
1093
1094    //
1095    // Matchers with no input string behave as if they had an empty input string.
1096    //
1097
1098    {
1099        UErrorCode status = U_ZERO_ERROR;
1100        RegexMatcher  m(".?", 0, status);
1101        REGEX_CHECK_STATUS;
1102        REGEX_ASSERT(m.find());
1103        REGEX_ASSERT(m.start(status) == 0);
1104        REGEX_ASSERT(m.input() == "");
1105    }
1106    {
1107        UErrorCode status = U_ZERO_ERROR;
1108        RegexPattern  *p = RegexPattern::compile(".", 0, status);
1109        RegexMatcher  *m = p->matcher(status);
1110        REGEX_CHECK_STATUS;
1111
1112        REGEX_ASSERT(m->find() == FALSE);
1113        REGEX_ASSERT(m->input() == "");
1114        delete m;
1115        delete p;
1116    }
1117
1118    //
1119    // Regions
1120    //
1121    {
1122        UErrorCode status = U_ZERO_ERROR;
1123        UnicodeString testString("This is test data");
1124        RegexMatcher m(".*", testString,  0, status);
1125        REGEX_CHECK_STATUS;
1126        REGEX_ASSERT(m.regionStart() == 0);
1127        REGEX_ASSERT(m.regionEnd() == testString.length());
1128        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1129        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1130
1131        m.region(2,4, status);
1132        REGEX_CHECK_STATUS;
1133        REGEX_ASSERT(m.matches(status));
1134        REGEX_ASSERT(m.start(status)==2);
1135        REGEX_ASSERT(m.end(status)==4);
1136        REGEX_CHECK_STATUS;
1137
1138        m.reset();
1139        REGEX_ASSERT(m.regionStart() == 0);
1140        REGEX_ASSERT(m.regionEnd() == testString.length());
1141
1142        UnicodeString shorterString("short");
1143        m.reset(shorterString);
1144        REGEX_ASSERT(m.regionStart() == 0);
1145        REGEX_ASSERT(m.regionEnd() == shorterString.length());
1146
1147        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1148        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1149        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1150        REGEX_ASSERT(&m == &m.reset());
1151        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1152
1153        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1154        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1155        REGEX_ASSERT(&m == &m.reset());
1156        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1157
1158        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1159        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1160        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1161        REGEX_ASSERT(&m == &m.reset());
1162        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1163
1164        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1165        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1166        REGEX_ASSERT(&m == &m.reset());
1167        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1168
1169    }
1170
1171    //
1172    // hitEnd() and requireEnd()
1173    //
1174    {
1175        UErrorCode status = U_ZERO_ERROR;
1176        UnicodeString testString("aabb");
1177        RegexMatcher m1(".*", testString,  0, status);
1178        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1179        REGEX_ASSERT(m1.hitEnd() == TRUE);
1180        REGEX_ASSERT(m1.requireEnd() == FALSE);
1181        REGEX_CHECK_STATUS;
1182
1183        status = U_ZERO_ERROR;
1184        RegexMatcher m2("a*", testString, 0, status);
1185        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1186        REGEX_ASSERT(m2.hitEnd() == FALSE);
1187        REGEX_ASSERT(m2.requireEnd() == FALSE);
1188        REGEX_CHECK_STATUS;
1189
1190        status = U_ZERO_ERROR;
1191        RegexMatcher m3(".*$", testString, 0, status);
1192        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1193        REGEX_ASSERT(m3.hitEnd() == TRUE);
1194        REGEX_ASSERT(m3.requireEnd() == TRUE);
1195        REGEX_CHECK_STATUS;
1196    }
1197
1198
1199    //
1200    // Compilation error on reset with UChar *
1201    //   These were a hazard that people were stumbling over with runtime errors.
1202    //   Changed them to compiler errors by adding private methods that more closely
1203    //   matched the incorrect use of the functions.
1204    //
1205#if 0
1206    {
1207        UErrorCode status = U_ZERO_ERROR;
1208        UChar ucharString[20];
1209        RegexMatcher m(".", 0, status);
1210        m.reset(ucharString);  // should not compile.
1211
1212        RegexPattern *p = RegexPattern::compile(".", 0, status);
1213        RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1214
1215        RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1216    }
1217#endif
1218
1219    //
1220    //  Time Outs.
1221    //       Note:  These tests will need to be changed when the regexp engine is
1222    //              able to detect and cut short the exponential time behavior on
1223    //              this type of match.
1224    //
1225    {
1226        UErrorCode status = U_ZERO_ERROR;
1227        //    Enough 'a's in the string to cause the match to time out.
        //       (Each additional 'a' doubles the time)
1229        UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1230        RegexMatcher matcher("(a+)+b", testString, 0, status);
1231        REGEX_CHECK_STATUS;
1232        REGEX_ASSERT(matcher.getTimeLimit() == 0);
1233        matcher.setTimeLimit(100, status);
1234        REGEX_ASSERT(matcher.getTimeLimit() == 100);
1235        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1236        REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1237    }
1238    {
1239        UErrorCode status = U_ZERO_ERROR;
1240        //   Few enough 'a's to slip in under the time limit.
1241        UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1242        RegexMatcher matcher("(a+)+b", testString, 0, status);
1243        REGEX_CHECK_STATUS;
1244        matcher.setTimeLimit(100, status);
1245        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1246        REGEX_CHECK_STATUS;
1247    }
1248
1249    //
1250    //  Stack Limits
1251    //
1252    {
1253        UErrorCode status = U_ZERO_ERROR;
1254        UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1255
1256        // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1257        //   of the '+', and makes the stack frames larger.
1258        RegexMatcher matcher("(A)+A$", testString, 0, status);
1259
1260        // With the default stack, this match should fail to run
1261        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1262        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1263
1264        // With unlimited stack, it should run
1265        status = U_ZERO_ERROR;
1266        matcher.setStackLimit(0, status);
1267        REGEX_CHECK_STATUS;
1268        REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1269        REGEX_CHECK_STATUS;
1270        REGEX_ASSERT(matcher.getStackLimit() == 0);
1271
        // With a limited stack, the match should fail
1273        status = U_ZERO_ERROR;
1274        matcher.setStackLimit(10000, status);
1275        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1276        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1277        REGEX_ASSERT(matcher.getStackLimit() == 10000);
1278    }
1279
1280        // A pattern that doesn't save state should work with
1281        //   a minimal sized stack
1282    {
1283        UErrorCode status = U_ZERO_ERROR;
1284        UnicodeString testString = "abc";
1285        RegexMatcher matcher("abc", testString, 0, status);
1286        REGEX_CHECK_STATUS;
1287        matcher.setStackLimit(30, status);
1288        REGEX_CHECK_STATUS;
1289        REGEX_ASSERT(matcher.matches(status) == TRUE);
1290        REGEX_CHECK_STATUS;
1291        REGEX_ASSERT(matcher.getStackLimit() == 30);
1292
1293        // Negative stack sizes should fail
1294        status = U_ZERO_ERROR;
1295        matcher.setStackLimit(1000, status);
1296        REGEX_CHECK_STATUS;
1297        matcher.setStackLimit(-1, status);
1298        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1299        REGEX_ASSERT(matcher.getStackLimit() == 1000);
1300    }
1301
1302
1303}
1304
1305
1306
1307
1308
1309
1310//---------------------------------------------------------------------------
1311//
1312//      API_Replace        API test for class RegexMatcher, testing the
1313//                         Replace family of functions.
1314//
1315//---------------------------------------------------------------------------
1316void RegexTest::API_Replace() {
1317    //
1318    //  Replace
1319    //
1320    int32_t             flags=0;
1321    UParseError         pe;
1322    UErrorCode          status=U_ZERO_ERROR;
1323
1324    UnicodeString       re("abc");
1325    RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1326    REGEX_CHECK_STATUS;
1327    UnicodeString data = ".abc..abc...abc..";
1328    //                    012345678901234567
1329    RegexMatcher *matcher = pat->matcher(data, status);
1330
1331    //
1332    //  Plain vanilla matches.
1333    //
1334    UnicodeString  dest;
1335    dest = matcher->replaceFirst("yz", status);
1336    REGEX_CHECK_STATUS;
1337    REGEX_ASSERT(dest == ".yz..abc...abc..");
1338
1339    dest = matcher->replaceAll("yz", status);
1340    REGEX_CHECK_STATUS;
1341    REGEX_ASSERT(dest == ".yz..yz...yz..");
1342
1343    //
1344    //  Plain vanilla non-matches.
1345    //
1346    UnicodeString d2 = ".abx..abx...abx..";
1347    matcher->reset(d2);
1348    dest = matcher->replaceFirst("yz", status);
1349    REGEX_CHECK_STATUS;
1350    REGEX_ASSERT(dest == ".abx..abx...abx..");
1351
1352    dest = matcher->replaceAll("yz", status);
1353    REGEX_CHECK_STATUS;
1354    REGEX_ASSERT(dest == ".abx..abx...abx..");
1355
1356    //
1357    // Empty source string
1358    //
1359    UnicodeString d3 = "";
1360    matcher->reset(d3);
1361    dest = matcher->replaceFirst("yz", status);
1362    REGEX_CHECK_STATUS;
1363    REGEX_ASSERT(dest == "");
1364
1365    dest = matcher->replaceAll("yz", status);
1366    REGEX_CHECK_STATUS;
1367    REGEX_ASSERT(dest == "");
1368
1369    //
1370    // Empty substitution string
1371    //
1372    matcher->reset(data);              // ".abc..abc...abc.."
1373    dest = matcher->replaceFirst("", status);
1374    REGEX_CHECK_STATUS;
1375    REGEX_ASSERT(dest == "...abc...abc..");
1376
1377    dest = matcher->replaceAll("", status);
1378    REGEX_CHECK_STATUS;
1379    REGEX_ASSERT(dest == "........");
1380
1381    //
1382    // match whole string
1383    //
1384    UnicodeString d4 = "abc";
1385    matcher->reset(d4);
1386    dest = matcher->replaceFirst("xyz", status);
1387    REGEX_CHECK_STATUS;
1388    REGEX_ASSERT(dest == "xyz");
1389
1390    dest = matcher->replaceAll("xyz", status);
1391    REGEX_CHECK_STATUS;
1392    REGEX_ASSERT(dest == "xyz");
1393
1394    //
1395    // Capture Group, simple case
1396    //
1397    UnicodeString       re2("a(..)");
1398    RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1399    REGEX_CHECK_STATUS;
1400    UnicodeString d5 = "abcdefg";
1401    RegexMatcher *matcher2 = pat2->matcher(d5, status);
1402    REGEX_CHECK_STATUS;
1403    dest = matcher2->replaceFirst("$1$1", status);
1404    REGEX_CHECK_STATUS;
1405    REGEX_ASSERT(dest == "bcbcdefg");
1406
1407    dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1408    REGEX_CHECK_STATUS;
1409    REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1410
1411    dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1412    REGEX_CHECK_STATUS;
1413    REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1414
1415    UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1416    replacement = replacement.unescape();
1417    dest = matcher2->replaceFirst(replacement, status);
1418    REGEX_CHECK_STATUS;
1419    REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1420
1421    REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1422
1423
1424    //
1425    // Replacement String with \u hex escapes
1426    //
1427    {
1428        UnicodeString  src = "abc 1 abc 2 abc 3";
1429        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1430        matcher->reset(src);
1431        UnicodeString  result = matcher->replaceAll(substitute, status);
1432        REGEX_CHECK_STATUS;
1433        REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1434    }
1435    {
1436        UnicodeString  src = "abc !";
1437        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1438        matcher->reset(src);
1439        UnicodeString  result = matcher->replaceAll(substitute, status);
1440        REGEX_CHECK_STATUS;
1441        UnicodeString expected = UnicodeString("--");
1442        expected.append((UChar32)0x10000);
1443        expected.append("-- !");
1444        REGEX_ASSERT(result == expected);
1445    }
1446    // TODO:  need more through testing of capture substitutions.
1447
1448    // Bug 4057
1449    //
1450    {
1451        status = U_ZERO_ERROR;
1452        UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1453        RegexMatcher m("ss(.*?)ee", 0, status);
1454        REGEX_CHECK_STATUS;
1455        UnicodeString result;
1456
1457        // Multiple finds do NOT bump up the previous appendReplacement postion.
1458        m.reset(s);
1459        m.find();
1460        m.find();
1461        m.appendReplacement(result, "ooh", status);
1462        REGEX_CHECK_STATUS;
1463        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1464
1465        // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1466        status = U_ZERO_ERROR;
1467        result.truncate(0);
1468        m.reset(10, status);
1469        m.find();
1470        m.find();
1471        m.appendReplacement(result, "ooh", status);
1472        REGEX_CHECK_STATUS;
1473        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1474
1475        // find() at interior of string, appendReplacemnt still starts at beginning.
1476        status = U_ZERO_ERROR;
1477        result.truncate(0);
1478        m.reset();
1479        m.find(10, status);
1480        m.find();
1481        m.appendReplacement(result, "ooh", status);
1482        REGEX_CHECK_STATUS;
1483        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1484
1485        m.appendTail(result);
1486        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1487
1488    }
1489
1490    delete matcher2;
1491    delete pat2;
1492    delete matcher;
1493    delete pat;
1494}
1495
1496
1497//---------------------------------------------------------------------------
1498//
1499//      API_Pattern       Test that the API for class RegexPattern is
1500//                        present and nominally working.
1501//
1502//---------------------------------------------------------------------------
1503void RegexTest::API_Pattern() {
1504    RegexPattern        pata;    // Test default constructor to not crash.
1505    RegexPattern        patb;
1506
1507    REGEX_ASSERT(pata == patb);
1508    REGEX_ASSERT(pata == pata);
1509
1510    UnicodeString re1("abc[a-l][m-z]");
1511    UnicodeString re2("def");
1512    UErrorCode    status = U_ZERO_ERROR;
1513    UParseError   pe;
1514
1515    RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1516    RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1517    REGEX_CHECK_STATUS;
1518    REGEX_ASSERT(*pat1 == *pat1);
1519    REGEX_ASSERT(*pat1 != pata);
1520
1521    // Assign
1522    patb = *pat1;
1523    REGEX_ASSERT(patb == *pat1);
1524
1525    // Copy Construct
1526    RegexPattern patc(*pat1);
1527    REGEX_ASSERT(patc == *pat1);
1528    REGEX_ASSERT(patb == patc);
1529    REGEX_ASSERT(pat1 != pat2);
1530    patb = *pat2;
1531    REGEX_ASSERT(patb != patc);
1532    REGEX_ASSERT(patb == *pat2);
1533
1534    // Compile with no flags.
1535    RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1536    REGEX_ASSERT(*pat1a == *pat1);
1537
1538    REGEX_ASSERT(pat1a->flags() == 0);
1539
1540    // Compile with different flags should be not equal
1541    RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1542    REGEX_CHECK_STATUS;
1543
1544    REGEX_ASSERT(*pat1b != *pat1a);
1545    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1546    REGEX_ASSERT(pat1a->flags() == 0);
1547    delete pat1b;
1548
1549    // clone
1550    RegexPattern *pat1c = pat1->clone();
1551    REGEX_ASSERT(*pat1c == *pat1);
1552    REGEX_ASSERT(*pat1c != *pat2);
1553
1554    delete pat1c;
1555    delete pat1a;
1556    delete pat1;
1557    delete pat2;
1558
1559
1560    //
1561    //   Verify that a matcher created from a cloned pattern works.
1562    //     (Jitterbug 3423)
1563    //
1564    {
1565        UErrorCode     status     = U_ZERO_ERROR;
1566        RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1567        RegexPattern  *pClone     = pSource->clone();
1568        delete         pSource;
1569        RegexMatcher  *mFromClone = pClone->matcher(status);
1570        REGEX_CHECK_STATUS;
1571        UnicodeString s = "Hello World";
1572        mFromClone->reset(s);
1573        REGEX_ASSERT(mFromClone->find() == TRUE);
1574        REGEX_ASSERT(mFromClone->group(status) == "Hello");
1575        REGEX_ASSERT(mFromClone->find() == TRUE);
1576        REGEX_ASSERT(mFromClone->group(status) == "World");
1577        REGEX_ASSERT(mFromClone->find() == FALSE);
1578        delete mFromClone;
1579        delete pClone;
1580    }
1581
1582    //
1583    //   matches convenience API
1584    //
1585    REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1586    REGEX_CHECK_STATUS;
1587    REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1588    REGEX_CHECK_STATUS;
1589    REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1590    REGEX_CHECK_STATUS;
1591    REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1592    REGEX_CHECK_STATUS;
1593    REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1594    REGEX_CHECK_STATUS;
1595    status = U_INDEX_OUTOFBOUNDS_ERROR;
1596    REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1597    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1598
1599
1600    //
1601    // Split()
1602    //
1603    status = U_ZERO_ERROR;
1604    pat1 = RegexPattern::compile(" +",  pe, status);
1605    REGEX_CHECK_STATUS;
1606    UnicodeString  fields[10];
1607
1608    int32_t n;
1609    n = pat1->split("Now is the time", fields, 10, status);
1610    REGEX_CHECK_STATUS;
1611    REGEX_ASSERT(n==4);
1612    REGEX_ASSERT(fields[0]=="Now");
1613    REGEX_ASSERT(fields[1]=="is");
1614    REGEX_ASSERT(fields[2]=="the");
1615    REGEX_ASSERT(fields[3]=="time");
1616    REGEX_ASSERT(fields[4]=="");
1617
1618    n = pat1->split("Now is the time", fields, 2, status);
1619    REGEX_CHECK_STATUS;
1620    REGEX_ASSERT(n==2);
1621    REGEX_ASSERT(fields[0]=="Now");
1622    REGEX_ASSERT(fields[1]=="is the time");
1623    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1624
1625    fields[1] = "*";
1626    status = U_ZERO_ERROR;
1627    n = pat1->split("Now is the time", fields, 1, status);
1628    REGEX_CHECK_STATUS;
1629    REGEX_ASSERT(n==1);
1630    REGEX_ASSERT(fields[0]=="Now is the time");
1631    REGEX_ASSERT(fields[1]=="*");
1632    status = U_ZERO_ERROR;
1633
1634    n = pat1->split("    Now       is the time   ", fields, 10, status);
1635    REGEX_CHECK_STATUS;
1636    REGEX_ASSERT(n==6);
1637    REGEX_ASSERT(fields[0]=="");
1638    REGEX_ASSERT(fields[1]=="Now");
1639    REGEX_ASSERT(fields[2]=="is");
1640    REGEX_ASSERT(fields[3]=="the");
1641    REGEX_ASSERT(fields[4]=="time");
1642    REGEX_ASSERT(fields[5]=="");
1643
1644    n = pat1->split("     ", fields, 10, status);
1645    REGEX_CHECK_STATUS;
1646    REGEX_ASSERT(n==2);
1647    REGEX_ASSERT(fields[0]=="");
1648    REGEX_ASSERT(fields[1]=="");
1649
1650    fields[0] = "foo";
1651    n = pat1->split("", fields, 10, status);
1652    REGEX_CHECK_STATUS;
1653    REGEX_ASSERT(n==0);
1654    REGEX_ASSERT(fields[0]=="foo");
1655
1656    delete pat1;
1657
1658    //  split, with a pattern with (capture)
1659    pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1660    REGEX_CHECK_STATUS;
1661
1662    status = U_ZERO_ERROR;
1663    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1664    REGEX_CHECK_STATUS;
1665    REGEX_ASSERT(n==7);
1666    REGEX_ASSERT(fields[0]=="");
1667    REGEX_ASSERT(fields[1]=="a");
1668    REGEX_ASSERT(fields[2]=="Now is ");
1669    REGEX_ASSERT(fields[3]=="b");
1670    REGEX_ASSERT(fields[4]=="the time");
1671    REGEX_ASSERT(fields[5]=="c");
1672    REGEX_ASSERT(fields[6]=="");
1673    REGEX_ASSERT(status==U_ZERO_ERROR);
1674
1675    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1676    REGEX_CHECK_STATUS;
1677    REGEX_ASSERT(n==7);
1678    REGEX_ASSERT(fields[0]=="  ");
1679    REGEX_ASSERT(fields[1]=="a");
1680    REGEX_ASSERT(fields[2]=="Now is ");
1681    REGEX_ASSERT(fields[3]=="b");
1682    REGEX_ASSERT(fields[4]=="the time");
1683    REGEX_ASSERT(fields[5]=="c");
1684    REGEX_ASSERT(fields[6]=="");
1685
1686    status = U_ZERO_ERROR;
1687    fields[6] = "foo";
1688    n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1689    REGEX_CHECK_STATUS;
1690    REGEX_ASSERT(n==6);
1691    REGEX_ASSERT(fields[0]=="  ");
1692    REGEX_ASSERT(fields[1]=="a");
1693    REGEX_ASSERT(fields[2]=="Now is ");
1694    REGEX_ASSERT(fields[3]=="b");
1695    REGEX_ASSERT(fields[4]=="the time");
1696    REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1697    REGEX_ASSERT(fields[6]=="foo");
1698
1699    status = U_ZERO_ERROR;
1700    fields[5] = "foo";
1701    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1702    REGEX_CHECK_STATUS;
1703    REGEX_ASSERT(n==5);
1704    REGEX_ASSERT(fields[0]=="  ");
1705    REGEX_ASSERT(fields[1]=="a");
1706    REGEX_ASSERT(fields[2]=="Now is ");
1707    REGEX_ASSERT(fields[3]=="b");
1708    REGEX_ASSERT(fields[4]=="the time<c>");
1709    REGEX_ASSERT(fields[5]=="foo");
1710
1711    status = U_ZERO_ERROR;
1712    fields[5] = "foo";
1713    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1714    REGEX_CHECK_STATUS;
1715    REGEX_ASSERT(n==5);
1716    REGEX_ASSERT(fields[0]=="  ");
1717    REGEX_ASSERT(fields[1]=="a");
1718    REGEX_ASSERT(fields[2]=="Now is ");
1719    REGEX_ASSERT(fields[3]=="b");
1720    REGEX_ASSERT(fields[4]=="the time");
1721    REGEX_ASSERT(fields[5]=="foo");
1722
1723    status = U_ZERO_ERROR;
1724    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1725    REGEX_CHECK_STATUS;
1726    REGEX_ASSERT(n==4);
1727    REGEX_ASSERT(fields[0]=="  ");
1728    REGEX_ASSERT(fields[1]=="a");
1729    REGEX_ASSERT(fields[2]=="Now is ");
1730    REGEX_ASSERT(fields[3]=="the time<c>");
1731    status = U_ZERO_ERROR;
1732    delete pat1;
1733
1734    pat1 = RegexPattern::compile("([-,])",  pe, status);
1735    REGEX_CHECK_STATUS;
1736    n = pat1->split("1-10,20", fields, 10, status);
1737    REGEX_CHECK_STATUS;
1738    REGEX_ASSERT(n==5);
1739    REGEX_ASSERT(fields[0]=="1");
1740    REGEX_ASSERT(fields[1]=="-");
1741    REGEX_ASSERT(fields[2]=="10");
1742    REGEX_ASSERT(fields[3]==",");
1743    REGEX_ASSERT(fields[4]=="20");
1744    delete pat1;
1745
1746    // Test split of string with empty trailing fields
1747    pat1 = RegexPattern::compile(",", pe, status);
1748    REGEX_CHECK_STATUS;
1749    n = pat1->split("a,b,c,", fields, 10, status);
1750    REGEX_CHECK_STATUS;
1751    REGEX_ASSERT(n==4);
1752    REGEX_ASSERT(fields[0]=="a");
1753    REGEX_ASSERT(fields[1]=="b");
1754    REGEX_ASSERT(fields[2]=="c");
1755    REGEX_ASSERT(fields[3]=="");
1756
1757    n = pat1->split("a,,,", fields, 10, status);
1758    REGEX_CHECK_STATUS;
1759    REGEX_ASSERT(n==4);
1760    REGEX_ASSERT(fields[0]=="a");
1761    REGEX_ASSERT(fields[1]=="");
1762    REGEX_ASSERT(fields[2]=="");
1763    REGEX_ASSERT(fields[3]=="");
1764    delete pat1;
1765
1766    // Split Separator with zero length match.
1767    pat1 = RegexPattern::compile(":?", pe, status);
1768    REGEX_CHECK_STATUS;
1769    n = pat1->split("abc", fields, 10, status);
1770    REGEX_CHECK_STATUS;
1771    REGEX_ASSERT(n==5);
1772    REGEX_ASSERT(fields[0]=="");
1773    REGEX_ASSERT(fields[1]=="a");
1774    REGEX_ASSERT(fields[2]=="b");
1775    REGEX_ASSERT(fields[3]=="c");
1776    REGEX_ASSERT(fields[4]=="");
1777
1778    delete pat1;
1779
1780    //
1781    // RegexPattern::pattern()
1782    //
1783    pat1 = new RegexPattern();
1784    REGEX_ASSERT(pat1->pattern() == "");
1785    delete pat1;
1786
1787    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1788    REGEX_CHECK_STATUS;
1789    REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1790    delete pat1;
1791
1792
1793    //
1794    // classID functions
1795    //
1796    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1797    REGEX_CHECK_STATUS;
1798    REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1799    REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1800    UnicodeString Hello("Hello, world.");
1801    RegexMatcher *m = pat1->matcher(Hello, status);
1802    REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1803    REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1804    REGEX_ASSERT(m->getDynamicClassID() != NULL);
1805    delete m;
1806    delete pat1;
1807
1808}
1809
1810//---------------------------------------------------------------------------
1811//
1812//      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1813//                       is present and working, but excluding functions
1814//                       implementing replace operations.
1815//
1816//---------------------------------------------------------------------------
1817void RegexTest::API_Match_UTF8() {
1818    UParseError         pe;
1819    UErrorCode          status=U_ZERO_ERROR;
1820    int32_t             flags = 0;
1821    // Each scenario below lives in its own brace scope; several redeclare some or all of flags/pe/status, shadowing the three above.
1822    //
1823    // Debug - slide failing test cases early
1824    //   (change the "#if 0" below to enable the block; the function then returns right after it)
1825#if 0
1826    {
1827    }
1828    return;
1829#endif
1830
1831    //
1832    // Simple pattern compilation
1833    //   (this scope uses the function-level pe, status and flags)
1834    {
1835        UText               re = UTEXT_INITIALIZER;
1836        regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1837        REGEX_VERBOSE_TEXT(&re);
1838        RegexPattern        *pat2;
1839        pat2 = RegexPattern::compile(&re, flags, pe, status);
1840        REGEX_CHECK_STATUS;
1841
1842        UText input1 = UTEXT_INITIALIZER;
1843        UText input2 = UTEXT_INITIALIZER;
1844        UText empty  = UTEXT_INITIALIZER;
1845        regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1846        REGEX_VERBOSE_TEXT(&input1);
1847        regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1848        REGEX_VERBOSE_TEXT(&input2);
1849        utext_openUChars(&empty, NULL, 0, &status);  // zero-length text; its native length of 0 is asserted further down
1850
1851        int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1852        int32_t input2Len = strlen("not abc");  // input1Len / input2Len serve as the "end of input" positions below
1853
1854
1855        //
1856        // Matcher creation and reset.
1857        //   m1 is switched between input1, input2 and empty; inputText() must follow each switch.
1858        RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);  // NOTE(review): matcher(status) result is dereferenced before status is checked
1859        REGEX_CHECK_STATUS;
1860        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1861        const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1862        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1863        m1->reset(&input2);
1864        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1865        const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1866        REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1867        m1->reset(&input1);
1868        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1869        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1870        m1->reset(&empty);
1871        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1872        REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1873
1874        //
1875        //  reset(pos, status)
1876        //    accepted positions are 0 .. input1Len inclusive; -1 and input1Len+1 must report U_INDEX_OUTOFBOUNDS_ERROR
1877        m1->reset(&input1);
1878        m1->reset(4, status);
1879        REGEX_CHECK_STATUS;
1880        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1881        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1882
1883        m1->reset(-1, status);
1884        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1885        status = U_ZERO_ERROR;
1886
1887        m1->reset(0, status);
1888        REGEX_CHECK_STATUS;
1889        status = U_ZERO_ERROR;
1890
1891        m1->reset(input1Len-1, status);
1892        REGEX_CHECK_STATUS;
1893        status = U_ZERO_ERROR;
1894
1895        m1->reset(input1Len, status);
1896        REGEX_CHECK_STATUS;
1897        status = U_ZERO_ERROR;
1898
1899        m1->reset(input1Len+1, status);
1900        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1901        status = U_ZERO_ERROR;
1902
1903        //
1904        // matches(pos, status)
1905        //
1906        m1->reset(&input2);
1907        REGEX_ASSERT(m1->matches(4, status) == TRUE);  // "abc" starts at index 4 of "not abc" and runs to the end
1908        m1->reset();
1909        REGEX_ASSERT(m1->matches(3, status) == FALSE);
1910        m1->reset();
1911        REGEX_ASSERT(m1->matches(5, status) == FALSE);
1912        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1913        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1914        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1915
1916        // Match() at end of string should fail, but should not
1917        //  be an error.
1918        status = U_ZERO_ERROR;
1919        REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1920        REGEX_CHECK_STATUS;
1921
1922        // Match beyond end of string should fail with an error.
1923        status = U_ZERO_ERROR;
1924        REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1925        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1926
1927        // Successful match at end of string.
1928        {
1929            status = U_ZERO_ERROR;
1930            RegexMatcher m("A?", 0, status);  // will match zero length string.
1931            REGEX_CHECK_STATUS;
1932            m.reset(&input1);
1933            REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1934            REGEX_CHECK_STATUS;
1935            m.reset(&empty);
1936            REGEX_ASSERT(m.matches(0, status) == TRUE);
1937            REGEX_CHECK_STATUS;
1938        }
1939
1940
1941        //
1942        // lookingAt(pos, status)
1943        //   same position checks as for matches(pos, status) above
1944        status = U_ZERO_ERROR;
1945        m1->reset(&input2);  // "not abc"
1946        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1947        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1948        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1949        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1950        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1951        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1952        status = U_ZERO_ERROR;
1953        REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1954        REGEX_CHECK_STATUS;
1955        REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1956        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);  // the function-level status is left holding this error; later scopes declare their own
1957
1958        delete m1;
1959        delete pat2;
1960
1961        utext_close(&re);
1962        utext_close(&input1);
1963        utext_close(&input2);
1964        utext_close(&empty);
1965    }
1966
1967
1968    //
1969    // Capture Group.
1970    //     RegexMatcher::start();
1971    //     RegexMatcher::end();
1972    //     RegexMatcher::groupCount();
1973    //
1974    {
1975        int32_t             flags=0;  // flags, pe and status here shadow the function-level variables
1976        UParseError         pe;
1977        UErrorCode          status=U_ZERO_ERROR;
1978        UText               re=UTEXT_INITIALIZER;
1979        const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1980        utext_openUTF8(&re, str_01234567_pat, -1, &status);
1981
1982        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1983        REGEX_CHECK_STATUS;
1984
1985        UText input = UTEXT_INITIALIZER;
1986        const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1987        utext_openUTF8(&input, str_0123456789, -1, &status);
1988
1989        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1990        REGEX_CHECK_STATUS;
1991        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1992        static const int32_t matchStarts[] = {0,  2, 4, 8};   // expected start index of groups 0..3
1993        static const int32_t matchEnds[]   = {10, 8, 6, 10};  // expected end index (one past the last char) of groups 0..3
1994        int32_t i;
1995        for (i=0; i<4; i++) {
1996            int32_t actualStart = matcher->start(i, status);
1997            REGEX_CHECK_STATUS;
1998            if (actualStart != matchStarts[i]) {
1999                errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
2000                      __FILE__, __LINE__, i, matchStarts[i], actualStart);
2001            }
2002            int32_t actualEnd = matcher->end(i, status);
2003            REGEX_CHECK_STATUS;
2004            if (actualEnd != matchEnds[i]) {
2005                errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2006                      __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2007            }
2008        }
2009
2010        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2011        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2012
2013        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2014        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);  // the pattern only has groups 0..3
2015        matcher->reset();
2016        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);  // no match has been made since the reset()
2017
2018        matcher->lookingAt(status);  // make a match again for the group() checks below; the return value is not checked
2019
2020        UnicodeString dest;
2021        UText destText = UTEXT_INITIALIZER;
2022        utext_openUnicodeString(&destText, &dest, &status);
2023        UText *result;
2024        //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2025        //	Test shallow-clone API
2026        int64_t   group_len;
2027        result = matcher->group((UText *)NULL, group_len, status);
2028        REGEX_CHECK_STATUS;
2029        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2030        utext_close(result);
2031        result = matcher->group(0, &destText, group_len, status);
2032        REGEX_CHECK_STATUS;
2033        REGEX_ASSERT(result == &destText);
2034        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2035        //  destText is now immutable, reopen it
2036        utext_close(&destText);
2037        utext_openUnicodeString(&destText, &dest, &status);
2038
2039        result = matcher->group(0, NULL, status);  // from here on each group is fetched twice: with a NULL dest (returned UText is closed here) and into destText
2040        REGEX_CHECK_STATUS;
2041        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2042        utext_close(result);
2043        result = matcher->group(0, &destText, status);
2044        REGEX_CHECK_STATUS;
2045        REGEX_ASSERT(result == &destText);
2046        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2047
2048        result = matcher->group(1, NULL, status);
2049        REGEX_CHECK_STATUS;
2050        const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
2051        REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2052        utext_close(result);
2053        result = matcher->group(1, &destText, status);
2054        REGEX_CHECK_STATUS;
2055        REGEX_ASSERT(result == &destText);
2056        REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2057
2058        result = matcher->group(2, NULL, status);
2059        REGEX_CHECK_STATUS;
2060        const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
2061        REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2062        utext_close(result);
2063        result = matcher->group(2, &destText, status);
2064        REGEX_CHECK_STATUS;
2065        REGEX_ASSERT(result == &destText);
2066        REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2067
2068        result = matcher->group(3, NULL, status);
2069        REGEX_CHECK_STATUS;
2070        const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
2071        REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2072        utext_close(result);
2073        result = matcher->group(3, &destText, status);
2074        REGEX_CHECK_STATUS;
2075        REGEX_ASSERT(result == &destText);
2076        REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2077
2078        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2079        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2080        matcher->reset();
2081        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2082
2083        delete matcher;
2084        delete pat;
2085
2086        utext_close(&destText);
2087        utext_close(&input);
2088        utext_close(&re);
2089    }
2090
2091    //
2092    //  find
2093    //    find() walks forward from the previous match; find(pos, status) starts the search at pos.
2094    {
2095        int32_t             flags=0;
2096        UParseError         pe;
2097        UErrorCode          status=U_ZERO_ERROR;
2098        UText               re=UTEXT_INITIALIZER;
2099        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2100        utext_openUTF8(&re, str_abc, -1, &status);
2101
2102        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2103        REGEX_CHECK_STATUS;
2104        UText input = UTEXT_INITIALIZER;
2105        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2106        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2107        //                      012345678901234567   <- index ruler for the input text (".abc..abc...abc..", 17 bytes)
2108
2109        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2110        REGEX_CHECK_STATUS;
2111        REGEX_ASSERT(matcher->find());
2112        REGEX_ASSERT(matcher->start(status) == 1);
2113        REGEX_ASSERT(matcher->find());
2114        REGEX_ASSERT(matcher->start(status) == 6);
2115        REGEX_ASSERT(matcher->find());
2116        REGEX_ASSERT(matcher->start(status) == 12);
2117        REGEX_ASSERT(matcher->find() == FALSE);
2118        REGEX_ASSERT(matcher->find() == FALSE);  // a second call after exhaustion must still report no match
2119
2120        matcher->reset();
2121        REGEX_ASSERT(matcher->find());
2122        REGEX_ASSERT(matcher->start(status) == 1);
2123
2124        REGEX_ASSERT(matcher->find(0, status));
2125        REGEX_ASSERT(matcher->start(status) == 1);
2126        REGEX_ASSERT(matcher->find(1, status));
2127        REGEX_ASSERT(matcher->start(status) == 1);
2128        REGEX_ASSERT(matcher->find(2, status));
2129        REGEX_ASSERT(matcher->start(status) == 6);
2130        REGEX_ASSERT(matcher->find(12, status));
2131        REGEX_ASSERT(matcher->start(status) == 12);
2132        REGEX_ASSERT(matcher->find(13, status) == FALSE);
2133        REGEX_ASSERT(matcher->find(16, status) == FALSE);
2134        REGEX_ASSERT(matcher->find(17, status) == FALSE);
2135        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);  // the last find() failed, so there is no match to query
2136
2137        status = U_ZERO_ERROR;
2138        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2139        status = U_ZERO_ERROR;
2140        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);  // the input is 17 bytes long, so 18 is past the end
2141
2142        REGEX_ASSERT(matcher->groupCount() == 0);  // the pattern "abc" has no capture groups
2143
2144        delete matcher;
2145        delete pat;
2146
2147        utext_close(&input);
2148        utext_close(&re);
2149    }
2150
2151
2152    //
2153    //  find, with \G in pattern (true if at the end of a previous match).
2154    //
2155    {
2156        int32_t             flags=0;
2157        UParseError         pe;
2158        UErrorCode          status=U_ZERO_ERROR;
2159        UText               re=UTEXT_INITIALIZER;
2160        const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\Gabc)|(abc)) -- the bytes hold a single backslash before G */
2161        utext_openUTF8(&re, str_Gabcabc, -1, &status);
2162
2163        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2164
2165        REGEX_CHECK_STATUS;
2166        UText input = UTEXT_INITIALIZER;
2167        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2168        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2169        //                      012345678901234567   <- index ruler; the input here is only 13 bytes long
2170
2171        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2172        REGEX_CHECK_STATUS;
2173        REGEX_ASSERT(matcher->find());
2174        REGEX_ASSERT(matcher->start(status) == 0);
2175        REGEX_ASSERT(matcher->start(1, status) == -1);  // first match is expected via group 2 (plain abc); group 1 reports -1
2176        REGEX_ASSERT(matcher->start(2, status) == 1);
2177
2178        REGEX_ASSERT(matcher->find());
2179        REGEX_ASSERT(matcher->start(status) == 4);
2180        REGEX_ASSERT(matcher->start(1, status) == 4);   // second match is expected via group 1: abc directly at index 4, where the first match ended
2181        REGEX_ASSERT(matcher->start(2, status) == -1);
2182        REGEX_CHECK_STATUS;
2183
2184        delete matcher;
2185        delete pat;
2186
2187        utext_close(&input);
2188        utext_close(&re);
2189    }
2190
2191    //
2192    //   find with zero length matches, match position should bump ahead
2193    //     to prevent loops.
2194    //
2195    {
2196        int32_t                 i;
2197        UErrorCode          status=U_ZERO_ERROR;
2198        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
2199                                                      //   using an always-true look-ahead.
2200        REGEX_CHECK_STATUS;
2201        UText s = UTEXT_INITIALIZER;
2202        utext_openUTF8(&s, "    ", -1, &status);
2203        m.reset(&s);
2204        for (i=0; ; i++) {
2205            if (m.find() == FALSE) {
2206                break;
2207            }
2208            REGEX_ASSERT(m.start(status) == i);
2209            REGEX_ASSERT(m.end(status) == i);
2210        }
2211        REGEX_ASSERT(i==5);  // four-character input: one empty match at each of the positions 0..4
2212
2213        // Check that the bump goes over characters outside the BMP OK
2214        // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2215        unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2216        utext_openUTF8(&s, (char *)aboveBMP, -1, &status);  // s is re-opened on the new buffer without an intervening utext_close()
2217        m.reset(&s);
2218        for (i=0; ; i+=4) {  // each character above is a 4-byte sequence, so expected positions advance by 4
2219            if (m.find() == FALSE) {
2220                break;
2221            }
2222            REGEX_ASSERT(m.start(status) == i);
2223            REGEX_ASSERT(m.end(status) == i);
2224        }
2225        REGEX_ASSERT(i==20);  // empty matches expected at 0, 4, 8, 12 and 16; i has been stepped once more when find() fails
2226
2227        utext_close(&s);
2228    }
2229    {
2230        // find() loop breaking test.
2231        //        with pattern of /.?/, should see a series of one char matches, then a single
2232        //        match of zero length at the end of the input string.
2233        int32_t                 i;
2234        UErrorCode          status=U_ZERO_ERROR;
2235        RegexMatcher        m(".?", 0, status);
2236        REGEX_CHECK_STATUS;
2237        UText s = UTEXT_INITIALIZER;
2238        utext_openUTF8(&s, "    ", -1, &status);
2239        m.reset(&s);
2240        for (i=0; ; i++) {
2241            if (m.find() == FALSE) {
2242                break;
2243            }
2244            REGEX_ASSERT(m.start(status) == i);
2245            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));  // one-char matches at 0..3, then the empty match at 4
2246        }
2247        REGEX_ASSERT(i==5);
2248
2249        utext_close(&s);
2250    }
2251
2252
2253    //
2254    // Matchers with no input string behave as if they had an empty input string.
2255    //
2256
2257    {
2258        UErrorCode status = U_ZERO_ERROR;
2259        RegexMatcher  m(".?", 0, status);
2260        REGEX_CHECK_STATUS;
2261        REGEX_ASSERT(m.find());
2262        REGEX_ASSERT(m.start(status) == 0);
2263        REGEX_ASSERT(m.input() == "");
2264    }
2265    {
2266        UErrorCode status = U_ZERO_ERROR;
2267        RegexPattern  *p = RegexPattern::compile(".", 0, status);
2268        RegexMatcher  *m = p->matcher(status);  // NOTE(review): p is used before the status check on the next line
2269        REGEX_CHECK_STATUS;
2270
2271        REGEX_ASSERT(m->find() == FALSE);
2272        REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2273        delete m;
2274        delete p;
2275    }
2276
2277    //
2278    // Regions
2279    //   also covers the anchoring-bounds and transparent-bounds flags, which must survive reset()
2280    {
2281        UErrorCode status = U_ZERO_ERROR;
2282        UText testPattern = UTEXT_INITIALIZER;
2283        UText testText    = UTEXT_INITIALIZER;
2284        regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2285        REGEX_VERBOSE_TEXT(&testPattern);
2286        regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2287        REGEX_VERBOSE_TEXT(&testText);
2288
2289        RegexMatcher m(&testPattern, &testText, 0, status);
2290        REGEX_CHECK_STATUS;
2291        REGEX_ASSERT(m.regionStart() == 0);
2292        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2293        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2294        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2295
2296        m.region(2,4, status);  // with the region set to 2..4, ".*" is expected to match exactly that span
2297        REGEX_CHECK_STATUS;
2298        REGEX_ASSERT(m.matches(status));
2299        REGEX_ASSERT(m.start(status)==2);
2300        REGEX_ASSERT(m.end(status)==4);
2301        REGEX_CHECK_STATUS;
2302
2303        m.reset();  // the asserts below expect reset() to restore the region to the whole input
2304        REGEX_ASSERT(m.regionStart() == 0);
2305        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2306
2307        regextst_openUTF8FromInvariant(&testText, "short", -1, &status);  // testText is re-opened on a different string
2308        REGEX_VERBOSE_TEXT(&testText);
2309        m.reset(&testText);
2310        REGEX_ASSERT(m.regionStart() == 0);
2311        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2312
2313        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2314        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));  // the setters and reset() are expected to return the matcher itself
2315        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2316        REGEX_ASSERT(&m == &m.reset());
2317        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2318
2319        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2320        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2321        REGEX_ASSERT(&m == &m.reset());
2322        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2323
2324        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2325        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2326        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2327        REGEX_ASSERT(&m == &m.reset());
2328        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2329
2330        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2331        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2332        REGEX_ASSERT(&m == &m.reset());
2333        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2334
2335        utext_close(&testText);
2336        utext_close(&testPattern);
2337    }
2338
2339    //
2340    // hitEnd() and requireEnd()
2341    //   three patterns (.*  a*  .*$) are run against the same input "aabb"
2342    {
2343        UErrorCode status = U_ZERO_ERROR;
2344        UText testPattern = UTEXT_INITIALIZER;
2345        UText testText    = UTEXT_INITIALIZER;
2346        const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2347        const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2348        utext_openUTF8(&testPattern, str_, -1, &status);
2349        utext_openUTF8(&testText, str_aabb, -1, &status);
2350
2351        RegexMatcher m1(&testPattern, &testText,  0, status);
2352        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2353        REGEX_ASSERT(m1.hitEnd() == TRUE);
2354        REGEX_ASSERT(m1.requireEnd() == FALSE);
2355        REGEX_CHECK_STATUS;
2356
2357        status = U_ZERO_ERROR;
2358        const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2359        utext_openUTF8(&testPattern, str_a, -1, &status);  // testPattern is re-opened on the next pattern; m1 was already constructed from the previous one
2360        RegexMatcher m2(&testPattern, &testText, 0, status);
2361        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2362        REGEX_ASSERT(m2.hitEnd() == FALSE);  // "a*" can only cover the leading "aa", short of the end of "aabb"
2363        REGEX_ASSERT(m2.requireEnd() == FALSE);
2364        REGEX_CHECK_STATUS;
2365
2366        status = U_ZERO_ERROR;
2367        const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2368        utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2369        RegexMatcher m3(&testPattern, &testText, 0, status);
2370        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2371        REGEX_ASSERT(m3.hitEnd() == TRUE);
2372        REGEX_ASSERT(m3.requireEnd() == TRUE);  // of the three patterns, only the one ending in $ is expected to report requireEnd()
2373        REGEX_CHECK_STATUS;
2374
2375        utext_close(&testText);
2376        utext_close(&testPattern);
2377    }
2378}
2379
2380
2381//---------------------------------------------------------------------------
2382//
2383//      API_Replace_UTF8   API test for class RegexMatcher, testing the
2384//                         Replace family of functions.
2385//
2386//---------------------------------------------------------------------------
2387void RegexTest::API_Replace_UTF8() {
2388    //
2389    //  Replace
2390    //
2391    int32_t             flags=0;
2392    UParseError         pe;
2393    UErrorCode          status=U_ZERO_ERROR;
2394
2395    UText               re=UTEXT_INITIALIZER;
2396    regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2397    REGEX_VERBOSE_TEXT(&re);
2398    RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2399    REGEX_CHECK_STATUS;
2400
2401    char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2402    //             012345678901234567
2403    UText dataText = UTEXT_INITIALIZER;
2404    utext_openUTF8(&dataText, data, -1, &status);
2405    REGEX_CHECK_STATUS;
2406    REGEX_VERBOSE_TEXT(&dataText);
2407    RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2408
2409    //
2410    //  Plain vanilla matches.
2411    //
2412    UnicodeString  dest;
2413    UText destText = UTEXT_INITIALIZER;
2414    utext_openUnicodeString(&destText, &dest, &status);
2415    UText *result;
2416
2417    UText replText = UTEXT_INITIALIZER;
2418
2419    const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2420    utext_openUTF8(&replText, str_yz, -1, &status);
2421    REGEX_VERBOSE_TEXT(&replText);
2422    result = matcher->replaceFirst(&replText, NULL, status);
2423    REGEX_CHECK_STATUS;
2424    const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2425    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2426    utext_close(result);
2427    result = matcher->replaceFirst(&replText, &destText, status);
2428    REGEX_CHECK_STATUS;
2429    REGEX_ASSERT(result == &destText);
2430    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2431
2432    result = matcher->replaceAll(&replText, NULL, status);
2433    REGEX_CHECK_STATUS;
2434    const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2435    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2436    utext_close(result);
2437
2438    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2439    result = matcher->replaceAll(&replText, &destText, status);
2440    REGEX_CHECK_STATUS;
2441    REGEX_ASSERT(result == &destText);
2442    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2443
2444    //
2445    //  Plain vanilla non-matches.
2446    //
2447    const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2448    utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2449    matcher->reset(&dataText);
2450
2451    result = matcher->replaceFirst(&replText, NULL, status);
2452    REGEX_CHECK_STATUS;
2453    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2454    utext_close(result);
2455    result = matcher->replaceFirst(&replText, &destText, status);
2456    REGEX_CHECK_STATUS;
2457    REGEX_ASSERT(result == &destText);
2458    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2459
2460    result = matcher->replaceAll(&replText, NULL, status);
2461    REGEX_CHECK_STATUS;
2462    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2463    utext_close(result);
2464    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2465    result = matcher->replaceAll(&replText, &destText, status);
2466    REGEX_CHECK_STATUS;
2467    REGEX_ASSERT(result == &destText);
2468    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2469
2470    //
2471    // Empty source string
2472    //
2473    utext_openUTF8(&dataText, NULL, 0, &status);
2474    matcher->reset(&dataText);
2475
2476    result = matcher->replaceFirst(&replText, NULL, status);
2477    REGEX_CHECK_STATUS;
2478    REGEX_ASSERT_UTEXT_UTF8("", result);
2479    utext_close(result);
2480    result = matcher->replaceFirst(&replText, &destText, status);
2481    REGEX_CHECK_STATUS;
2482    REGEX_ASSERT(result == &destText);
2483    REGEX_ASSERT_UTEXT_UTF8("", result);
2484
2485    result = matcher->replaceAll(&replText, NULL, status);
2486    REGEX_CHECK_STATUS;
2487    REGEX_ASSERT_UTEXT_UTF8("", result);
2488    utext_close(result);
2489    result = matcher->replaceAll(&replText, &destText, status);
2490    REGEX_CHECK_STATUS;
2491    REGEX_ASSERT(result == &destText);
2492    REGEX_ASSERT_UTEXT_UTF8("", result);
2493
2494    //
2495    // Empty substitution string
2496    //
2497    utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2498    matcher->reset(&dataText);
2499
2500    utext_openUTF8(&replText, NULL, 0, &status);
2501    result = matcher->replaceFirst(&replText, NULL, status);
2502    REGEX_CHECK_STATUS;
2503    const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2504    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2505    utext_close(result);
2506    result = matcher->replaceFirst(&replText, &destText, status);
2507    REGEX_CHECK_STATUS;
2508    REGEX_ASSERT(result == &destText);
2509    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2510
2511    result = matcher->replaceAll(&replText, NULL, status);
2512    REGEX_CHECK_STATUS;
2513    const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2514    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2515    utext_close(result);
2516    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2517    result = matcher->replaceAll(&replText, &destText, status);
2518    REGEX_CHECK_STATUS;
2519    REGEX_ASSERT(result == &destText);
2520    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2521
2522    //
2523    // match whole string
2524    //
2525    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2526    utext_openUTF8(&dataText, str_abc, -1, &status);
2527    matcher->reset(&dataText);
2528
2529    const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2530    utext_openUTF8(&replText, str_xyz, -1, &status);
2531    result = matcher->replaceFirst(&replText, NULL, status);
2532    REGEX_CHECK_STATUS;
2533    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2534    utext_close(result);
2535    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2536    result = matcher->replaceFirst(&replText, &destText, status);
2537    REGEX_CHECK_STATUS;
2538    REGEX_ASSERT(result == &destText);
2539    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2540
2541    result = matcher->replaceAll(&replText, NULL, status);
2542    REGEX_CHECK_STATUS;
2543    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2544    utext_close(result);
2545    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2546    result = matcher->replaceAll(&replText, &destText, status);
2547    REGEX_CHECK_STATUS;
2548    REGEX_ASSERT(result == &destText);
2549    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2550
2551    //
2552    // Capture Group, simple case
2553    //
2554    const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2555    utext_openUTF8(&re, str_add, -1, &status);
2556    RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2557    REGEX_CHECK_STATUS;
2558
2559    const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2560    utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2561    RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2562    REGEX_CHECK_STATUS;
2563
2564    const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2565    utext_openUTF8(&replText, str_11, -1, &status);
2566    result = matcher2->replaceFirst(&replText, NULL, status);
2567    REGEX_CHECK_STATUS;
2568    const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2569    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2570    utext_close(result);
2571    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2572    result = matcher2->replaceFirst(&replText, &destText, status);
2573    REGEX_CHECK_STATUS;
2574    REGEX_ASSERT(result == &destText);
2575    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2576
2577    const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2578    utext_openUTF8(&replText, str_v, -1, &status);
2579    REGEX_VERBOSE_TEXT(&replText);
2580    result = matcher2->replaceFirst(&replText, NULL, status);
2581    REGEX_CHECK_STATUS;
2582    const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2583    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2584    utext_close(result);
2585    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2586    result = matcher2->replaceFirst(&replText, &destText, status);
2587    REGEX_CHECK_STATUS;
2588    REGEX_ASSERT(result == &destText);
2589    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2590
2591    const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2592    utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2593    result = matcher2->replaceFirst(&replText, NULL, status);
2594    REGEX_CHECK_STATUS;
2595    const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2596    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2597    utext_close(result);
2598    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2599    result = matcher2->replaceFirst(&replText, &destText, status);
2600    REGEX_CHECK_STATUS;
2601    REGEX_ASSERT(result == &destText);
2602    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2603
2604    unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2605    //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2606    //                                 012345678901234567890123456
2607    supplDigitChars[22] = 0xF0;
2608    supplDigitChars[23] = 0x9D;
2609    supplDigitChars[24] = 0x9F;
2610    supplDigitChars[25] = 0x8F;
2611    utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2612
2613    result = matcher2->replaceFirst(&replText, NULL, status);
2614    REGEX_CHECK_STATUS;
2615    const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2616    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2617    utext_close(result);
2618    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2619    result = matcher2->replaceFirst(&replText, &destText, status);
2620    REGEX_CHECK_STATUS;
2621    REGEX_ASSERT(result == &destText);
2622    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2623    const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2624    utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2625    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2626//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2627    utext_close(result);
2628    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2629    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2630    REGEX_ASSERT(result == &destText);
2631//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2632
2633    //
2634    // Replacement String with \u hex escapes
2635    //
2636    {
2637      const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2638      const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2639        utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2640        utext_openUTF8(&replText, str_u0043, -1, &status);
2641        matcher->reset(&dataText);
2642
2643        result = matcher->replaceAll(&replText, NULL, status);
2644        REGEX_CHECK_STATUS;
2645        const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2646        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2647        utext_close(result);
2648        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2649        result = matcher->replaceAll(&replText, &destText, status);
2650        REGEX_CHECK_STATUS;
2651        REGEX_ASSERT(result == &destText);
2652        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2653    }
2654    {
2655      const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2656        utext_openUTF8(&dataText, str_abc, -1, &status);
2657        const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2658        utext_openUTF8(&replText, str_U00010000, -1, &status);
2659        matcher->reset(&dataText);
2660
2661        unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2662        //                          0123456789
2663        expected[2] = 0xF0;
2664        expected[3] = 0x90;
2665        expected[4] = 0x80;
2666        expected[5] = 0x80;
2667
2668        result = matcher->replaceAll(&replText, NULL, status);
2669        REGEX_CHECK_STATUS;
2670        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2671        utext_close(result);
2672        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2673        result = matcher->replaceAll(&replText, &destText, status);
2674        REGEX_CHECK_STATUS;
2675        REGEX_ASSERT(result == &destText);
2676        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2677    }
    // TODO:  need more thorough testing of capture substitutions.
2679
2680    // Bug 4057
2681    //
2682    {
2683        status = U_ZERO_ERROR;
2684const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2685const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2686const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2687        utext_openUTF8(&re, str_ssee, -1, &status);
2688        utext_openUTF8(&dataText, str_blah, -1, &status);
2689        utext_openUTF8(&replText, str_ooh, -1, &status);
2690
2691        RegexMatcher m(&re, 0, status);
2692        REGEX_CHECK_STATUS;
2693
2694        UnicodeString result;
2695        UText resultText = UTEXT_INITIALIZER;
2696        utext_openUnicodeString(&resultText, &result, &status);
2697
        // Multiple finds do NOT bump up the previous appendReplacement position.
2699        m.reset(&dataText);
2700        m.find();
2701        m.find();
2702        m.appendReplacement(&resultText, &replText, status);
2703        REGEX_CHECK_STATUS;
2704        const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2705        REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2706
2707        // After a reset into the interior of a string, appendReplacement still starts at beginning.
2708        status = U_ZERO_ERROR;
2709        result.truncate(0);
2710        utext_openUnicodeString(&resultText, &result, &status);
2711        m.reset(10, status);
2712        m.find();
2713        m.find();
2714        m.appendReplacement(&resultText, &replText, status);
2715        REGEX_CHECK_STATUS;
2716        const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2717        REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2718
2719        // find() at interior of string, appendReplacement still starts at beginning.
2720        status = U_ZERO_ERROR;
2721        result.truncate(0);
2722        utext_openUnicodeString(&resultText, &result, &status);
2723        m.reset();
2724        m.find(10, status);
2725        m.find();
2726        m.appendReplacement(&resultText, &replText, status);
2727        REGEX_CHECK_STATUS;
2728        const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2729        REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2730
2731        m.appendTail(&resultText, status);
2732        const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2733        REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2734
2735        utext_close(&resultText);
2736    }
2737
2738    delete matcher2;
2739    delete pat2;
2740    delete matcher;
2741    delete pat;
2742
2743    utext_close(&dataText);
2744    utext_close(&replText);
2745    utext_close(&destText);
2746    utext_close(&re);
2747}
2748
2749
2750//---------------------------------------------------------------------------
2751//
2752//      API_Pattern_UTF8  Test that the API for class RegexPattern is
2753//                        present and nominally working.
2754//
2755//---------------------------------------------------------------------------
2756void RegexTest::API_Pattern_UTF8() {
2757    RegexPattern        pata;    // Test default constructor to not crash.
2758    RegexPattern        patb;
2759
2760    REGEX_ASSERT(pata == patb);
2761    REGEX_ASSERT(pata == pata);
2762
2763    UText         re1 = UTEXT_INITIALIZER;
2764    UText         re2 = UTEXT_INITIALIZER;
2765    UErrorCode    status = U_ZERO_ERROR;
2766    UParseError   pe;
2767
2768    const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2769    const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2770    utext_openUTF8(&re1, str_abcalmz, -1, &status);
2771    utext_openUTF8(&re2, str_def, -1, &status);
2772
2773    RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2774    RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2775    REGEX_CHECK_STATUS;
2776    REGEX_ASSERT(*pat1 == *pat1);
2777    REGEX_ASSERT(*pat1 != pata);
2778
2779    // Assign
2780    patb = *pat1;
2781    REGEX_ASSERT(patb == *pat1);
2782
2783    // Copy Construct
2784    RegexPattern patc(*pat1);
2785    REGEX_ASSERT(patc == *pat1);
2786    REGEX_ASSERT(patb == patc);
2787    REGEX_ASSERT(pat1 != pat2);
2788    patb = *pat2;
2789    REGEX_ASSERT(patb != patc);
2790    REGEX_ASSERT(patb == *pat2);
2791
2792    // Compile with no flags.
2793    RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2794    REGEX_ASSERT(*pat1a == *pat1);
2795
2796    REGEX_ASSERT(pat1a->flags() == 0);
2797
2798    // Compile with different flags should be not equal
2799    RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2800    REGEX_CHECK_STATUS;
2801
2802    REGEX_ASSERT(*pat1b != *pat1a);
2803    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2804    REGEX_ASSERT(pat1a->flags() == 0);
2805    delete pat1b;
2806
2807    // clone
2808    RegexPattern *pat1c = pat1->clone();
2809    REGEX_ASSERT(*pat1c == *pat1);
2810    REGEX_ASSERT(*pat1c != *pat2);
2811
2812    delete pat1c;
2813    delete pat1a;
2814    delete pat1;
2815    delete pat2;
2816
2817    utext_close(&re1);
2818    utext_close(&re2);
2819
2820
2821    //
2822    //   Verify that a matcher created from a cloned pattern works.
2823    //     (Jitterbug 3423)
2824    //
2825    {
2826        UErrorCode     status     = U_ZERO_ERROR;
2827        UText          pattern    = UTEXT_INITIALIZER;
2828        const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2829        utext_openUTF8(&pattern, str_pL, -1, &status);
2830
2831        RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2832        RegexPattern  *pClone     = pSource->clone();
2833        delete         pSource;
2834        RegexMatcher  *mFromClone = pClone->matcher(status);
2835        REGEX_CHECK_STATUS;
2836
2837        UText          input      = UTEXT_INITIALIZER;
2838        const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2839        utext_openUTF8(&input, str_HelloWorld, -1, &status);
2840        mFromClone->reset(&input);
2841        REGEX_ASSERT(mFromClone->find() == TRUE);
2842        REGEX_ASSERT(mFromClone->group(status) == "Hello");
2843        REGEX_ASSERT(mFromClone->find() == TRUE);
2844        REGEX_ASSERT(mFromClone->group(status) == "World");
2845        REGEX_ASSERT(mFromClone->find() == FALSE);
2846        delete mFromClone;
2847        delete pClone;
2848
2849        utext_close(&input);
2850        utext_close(&pattern);
2851    }
2852
2853    //
2854    //   matches convenience API
2855    //
2856    {
2857        UErrorCode status  = U_ZERO_ERROR;
2858        UText      pattern = UTEXT_INITIALIZER;
2859        UText      input   = UTEXT_INITIALIZER;
2860
2861        const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2862        utext_openUTF8(&input, str_randominput, -1, &status);
2863
2864        const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2865        utext_openUTF8(&pattern, str_dotstar, -1, &status);
2866        REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2867        REGEX_CHECK_STATUS;
2868
2869        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2870        utext_openUTF8(&pattern, str_abc, -1, &status);
2871        REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2872        REGEX_CHECK_STATUS;
2873
2874        const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2875        utext_openUTF8(&pattern, str_nput, -1, &status);
2876        REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2877        REGEX_CHECK_STATUS;
2878
2879        utext_openUTF8(&pattern, str_randominput, -1, &status);
2880        REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2881        REGEX_CHECK_STATUS;
2882
2883        const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2884        utext_openUTF8(&pattern, str_u, -1, &status);
2885        REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2886        REGEX_CHECK_STATUS;
2887
2888        utext_openUTF8(&input, str_abc, -1, &status);
2889        utext_openUTF8(&pattern, str_abc, -1, &status);
2890        status = U_INDEX_OUTOFBOUNDS_ERROR;
2891        REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2892        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2893
2894        utext_close(&input);
2895        utext_close(&pattern);
2896    }
2897
2898
2899    //
2900    // Split()
2901    //
2902    status = U_ZERO_ERROR;
2903    const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2904    utext_openUTF8(&re1, str_spaceplus, -1, &status);
2905    pat1 = RegexPattern::compile(&re1, pe, status);
2906    REGEX_CHECK_STATUS;
2907    UnicodeString  fields[10];
2908
2909    int32_t n;
2910    n = pat1->split("Now is the time", fields, 10, status);
2911    REGEX_CHECK_STATUS;
2912    REGEX_ASSERT(n==4);
2913    REGEX_ASSERT(fields[0]=="Now");
2914    REGEX_ASSERT(fields[1]=="is");
2915    REGEX_ASSERT(fields[2]=="the");
2916    REGEX_ASSERT(fields[3]=="time");
2917    REGEX_ASSERT(fields[4]=="");
2918
2919    n = pat1->split("Now is the time", fields, 2, status);
2920    REGEX_CHECK_STATUS;
2921    REGEX_ASSERT(n==2);
2922    REGEX_ASSERT(fields[0]=="Now");
2923    REGEX_ASSERT(fields[1]=="is the time");
2924    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2925
2926    fields[1] = "*";
2927    status = U_ZERO_ERROR;
2928    n = pat1->split("Now is the time", fields, 1, status);
2929    REGEX_CHECK_STATUS;
2930    REGEX_ASSERT(n==1);
2931    REGEX_ASSERT(fields[0]=="Now is the time");
2932    REGEX_ASSERT(fields[1]=="*");
2933    status = U_ZERO_ERROR;
2934
2935    n = pat1->split("    Now       is the time   ", fields, 10, status);
2936    REGEX_CHECK_STATUS;
2937    REGEX_ASSERT(n==6);
2938    REGEX_ASSERT(fields[0]=="");
2939    REGEX_ASSERT(fields[1]=="Now");
2940    REGEX_ASSERT(fields[2]=="is");
2941    REGEX_ASSERT(fields[3]=="the");
2942    REGEX_ASSERT(fields[4]=="time");
2943    REGEX_ASSERT(fields[5]=="");
2944    REGEX_ASSERT(fields[6]=="");
2945
2946    fields[2] = "*";
2947    n = pat1->split("     ", fields, 10, status);
2948    REGEX_CHECK_STATUS;
2949    REGEX_ASSERT(n==2);
2950    REGEX_ASSERT(fields[0]=="");
2951    REGEX_ASSERT(fields[1]=="");
2952    REGEX_ASSERT(fields[2]=="*");
2953
2954    fields[0] = "foo";
2955    n = pat1->split("", fields, 10, status);
2956    REGEX_CHECK_STATUS;
2957    REGEX_ASSERT(n==0);
2958    REGEX_ASSERT(fields[0]=="foo");
2959
2960    delete pat1;
2961
2962    //  split, with a pattern with (capture)
2963    regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2964    pat1 = RegexPattern::compile(&re1,  pe, status);
2965    REGEX_CHECK_STATUS;
2966
2967    status = U_ZERO_ERROR;
2968    fields[6] = fields[7] = "*";
2969    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2970    REGEX_CHECK_STATUS;
2971    REGEX_ASSERT(n==7);
2972    REGEX_ASSERT(fields[0]=="");
2973    REGEX_ASSERT(fields[1]=="a");
2974    REGEX_ASSERT(fields[2]=="Now is ");
2975    REGEX_ASSERT(fields[3]=="b");
2976    REGEX_ASSERT(fields[4]=="the time");
2977    REGEX_ASSERT(fields[5]=="c");
2978    REGEX_ASSERT(fields[6]=="");
2979    REGEX_ASSERT(fields[7]=="*");
2980    REGEX_ASSERT(status==U_ZERO_ERROR);
2981
2982    fields[6] = fields[7] = "*";
2983    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2984    REGEX_CHECK_STATUS;
2985    REGEX_ASSERT(n==7);
2986    REGEX_ASSERT(fields[0]=="  ");
2987    REGEX_ASSERT(fields[1]=="a");
2988    REGEX_ASSERT(fields[2]=="Now is ");
2989    REGEX_ASSERT(fields[3]=="b");
2990    REGEX_ASSERT(fields[4]=="the time");
2991    REGEX_ASSERT(fields[5]=="c");
2992    REGEX_ASSERT(fields[6]=="");
2993    REGEX_ASSERT(fields[7]=="*");
2994
2995    status = U_ZERO_ERROR;
2996    fields[6] = "foo";
2997    n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
2998    REGEX_CHECK_STATUS;
2999    REGEX_ASSERT(n==6);
3000    REGEX_ASSERT(fields[0]=="  ");
3001    REGEX_ASSERT(fields[1]=="a");
3002    REGEX_ASSERT(fields[2]=="Now is ");
3003    REGEX_ASSERT(fields[3]=="b");
3004    REGEX_ASSERT(fields[4]=="the time");
3005    REGEX_ASSERT(fields[5]==" ");
3006    REGEX_ASSERT(fields[6]=="foo");
3007
3008    status = U_ZERO_ERROR;
3009    fields[5] = "foo";
3010    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3011    REGEX_CHECK_STATUS;
3012    REGEX_ASSERT(n==5);
3013    REGEX_ASSERT(fields[0]=="  ");
3014    REGEX_ASSERT(fields[1]=="a");
3015    REGEX_ASSERT(fields[2]=="Now is ");
3016    REGEX_ASSERT(fields[3]=="b");
3017    REGEX_ASSERT(fields[4]=="the time<c>");
3018    REGEX_ASSERT(fields[5]=="foo");
3019
3020    status = U_ZERO_ERROR;
3021    fields[5] = "foo";
3022    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3023    REGEX_CHECK_STATUS;
3024    REGEX_ASSERT(n==5);
3025    REGEX_ASSERT(fields[0]=="  ");
3026    REGEX_ASSERT(fields[1]=="a");
3027    REGEX_ASSERT(fields[2]=="Now is ");
3028    REGEX_ASSERT(fields[3]=="b");
3029    REGEX_ASSERT(fields[4]=="the time");
3030    REGEX_ASSERT(fields[5]=="foo");
3031
3032    status = U_ZERO_ERROR;
3033    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3034    REGEX_CHECK_STATUS;
3035    REGEX_ASSERT(n==4);
3036    REGEX_ASSERT(fields[0]=="  ");
3037    REGEX_ASSERT(fields[1]=="a");
3038    REGEX_ASSERT(fields[2]=="Now is ");
3039    REGEX_ASSERT(fields[3]=="the time<c>");
3040    status = U_ZERO_ERROR;
3041    delete pat1;
3042
3043    regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3044    pat1 = RegexPattern::compile(&re1, pe, status);
3045    REGEX_CHECK_STATUS;
3046    n = pat1->split("1-10,20", fields, 10, status);
3047    REGEX_CHECK_STATUS;
3048    REGEX_ASSERT(n==5);
3049    REGEX_ASSERT(fields[0]=="1");
3050    REGEX_ASSERT(fields[1]=="-");
3051    REGEX_ASSERT(fields[2]=="10");
3052    REGEX_ASSERT(fields[3]==",");
3053    REGEX_ASSERT(fields[4]=="20");
3054    delete pat1;
3055
3056
3057    //
3058    // RegexPattern::pattern() and patternText()
3059    //
3060    pat1 = new RegexPattern();
3061    REGEX_ASSERT(pat1->pattern() == "");
3062    REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3063    delete pat1;
3064    const char *helloWorldInvariant = "(Hello, world)*";
3065    regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3066    pat1 = RegexPattern::compile(&re1, pe, status);
3067    REGEX_CHECK_STATUS;
3068    REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
3069    REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3070    delete pat1;
3071
3072    utext_close(&re1);
3073}
3074
3075
3076//---------------------------------------------------------------------------
3077//
3078//      Extended       A more thorough check for features of regex patterns
3079//                     The test cases are in a separate data file,
3080//                       source/tests/testdata/regextst.txt
3081//                     A description of the test data format is included in that file.
3082//
3083//---------------------------------------------------------------------------
3084
3085const char *
3086RegexTest::getPath(char buffer[2048], const char *filename) {
3087    UErrorCode status=U_ZERO_ERROR;
3088    const char *testDataDirectory = IntlTest::getSourceTestData(status);
3089    if (U_FAILURE(status)) {
3090        errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3091        return NULL;
3092    }
3093
3094    strcpy(buffer, testDataDirectory);
3095    strcat(buffer, filename);
3096    return buffer;
3097}
3098
3099void RegexTest::Extended() {
3100    char tdd[2048];
3101    const char *srcPath;
3102    UErrorCode  status  = U_ZERO_ERROR;
3103    int32_t     lineNum = 0;
3104
3105    //
3106    //  Open and read the test data file.
3107    //
3108    srcPath=getPath(tdd, "regextst.txt");
3109    if(srcPath==NULL) {
3110        return; /* something went wrong, error already output */
3111    }
3112
3113    int32_t    len;
3114    UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3115    if (U_FAILURE(status)) {
3116        return; /* something went wrong, error already output */
3117    }
3118
3119    //
3120    //  Put the test data into a UnicodeString
3121    //
3122    UnicodeString testString(FALSE, testData, len);
3123
3124    RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3125    RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3126    RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3127
3128    RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3129    UnicodeString   testPattern;   // The pattern for test from the test file.
3130    UnicodeString   testFlags;     // the flags   for a test.
3131    UnicodeString   matchString;   // The marked up string to be used as input
3132
3133    if (U_FAILURE(status)){
3134        dataerrln("Construct RegexMatcher() error.");
3135        delete [] testData;
3136        return;
3137    }
3138
3139    //
3140    //  Loop over the test data file, once per line.
3141    //
3142    while (lineMat.find()) {
3143        lineNum++;
3144        if (U_FAILURE(status)) {
3145          errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3146        }
3147
3148        status = U_ZERO_ERROR;
3149        UnicodeString testLine = lineMat.group(1, status);
3150        if (testLine.length() == 0) {
3151            continue;
3152        }
3153
3154        //
3155        // Parse the test line.  Skip blank and comment only lines.
3156        // Separate out the three main fields - pattern, flags, target.
3157        //
3158
3159        commentMat.reset(testLine);
3160        if (commentMat.lookingAt(status)) {
3161            // This line is a comment, or blank.
3162            continue;
3163        }
3164
3165        //
3166        //  Pull out the pattern field, remove it from the test file line.
3167        //
3168        quotedStuffMat.reset(testLine);
3169        if (quotedStuffMat.lookingAt(status)) {
3170            testPattern = quotedStuffMat.group(2, status);
3171            testLine.remove(0, quotedStuffMat.end(0, status));
3172        } else {
3173            errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3174            continue;
3175        }
3176
3177
3178        //
3179        //  Pull out the flags from the test file line.
3180        //
3181        flagsMat.reset(testLine);
3182        flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3183        testFlags = flagsMat.group(1, status);
3184        if (flagsMat.group(2, status).length() > 0) {
3185            errln("Bad Match flag at line %d. Scanning %c\n",
3186                lineNum, flagsMat.group(2, status).charAt(0));
3187            continue;
3188        }
3189        testLine.remove(0, flagsMat.end(0, status));
3190
3191        //
3192        //  Pull out the match string, as a whole.
3193        //    We'll process the <tags> later.
3194        //
3195        quotedStuffMat.reset(testLine);
3196        if (quotedStuffMat.lookingAt(status)) {
3197            matchString = quotedStuffMat.group(2, status);
3198            testLine.remove(0, quotedStuffMat.end(0, status));
3199        } else {
3200            errln("Bad match string at test file line %d", lineNum);
3201            continue;
3202        }
3203
3204        //
3205        //  The only thing left from the input line should be an optional trailing comment.
3206        //
3207        commentMat.reset(testLine);
3208        if (commentMat.lookingAt(status) == FALSE) {
3209            errln("Line %d: unexpected characters at end of test line.", lineNum);
3210            continue;
3211        }
3212
3213        //
3214        //  Run the test
3215        //
3216        regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3217    }
3218
3219    delete [] testData;
3220
3221}
3222
3223
3224
3225//---------------------------------------------------------------------------
3226//
//    regex_find(pattern, flags, inputString, srcPath, lineNumber)
//
//         Function to run a single test from the Extended (data driven) tests.
//         See file test/testdata/regextst.txt for a description of the
//         pattern and inputString fields, and the allowed flags.
//         srcPath is the path of the test data file, used in diagnostic messages.
//         lineNumber is the source line in regextst.txt of the test.
3233//
3234//---------------------------------------------------------------------------
3235
3236
3237//  Set a value into a UVector at position specified by a decimal number in
3238//   a UnicodeString.   This is a utility function needed by the actual test function,
3239//   which follows.
3240static void set(UVector &vec, int32_t val, UnicodeString index) {
3241    UErrorCode  status=U_ZERO_ERROR;
3242    int32_t  idx = 0;
3243    for (int32_t i=0; i<index.length(); i++) {
3244        int32_t d=u_charDigitValue(index.charAt(i));
3245        if (d<0) {return;}
3246        idx = idx*10 + d;
3247    }
3248    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3249    vec.setElementAt(val, idx);
3250}
3251
3252static void setInt(UVector &vec, int32_t val, int32_t idx) {
3253    UErrorCode  status=U_ZERO_ERROR;
3254    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3255    vec.setElementAt(val, idx);
3256}
3257
3258static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3259{
3260    UBool couldFind = TRUE;
3261    UTEXT_SETNATIVEINDEX(utext, 0);
3262    int32_t i = 0;
3263    while (i < unistrOffset) {
3264        UChar32 c = UTEXT_NEXT32(utext);
3265        if (c != U_SENTINEL) {
3266            i += U16_LENGTH(c);
3267        } else {
3268            couldFind = FALSE;
3269            break;
3270        }
3271    }
3272    nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3273    return couldFind;
3274}
3275
3276
3277void RegexTest::regex_find(const UnicodeString &pattern,
3278                           const UnicodeString &flags,
3279                           const UnicodeString &inputString,
3280                           const char *srcPath,
3281                           int32_t line) {
3282    UnicodeString       unEscapedInput;
3283    UnicodeString       deTaggedInput;
3284
3285    int32_t             patternUTF8Length,      inputUTF8Length;
3286    char                *patternChars  = NULL, *inputChars = NULL;
3287    UText               patternText    = UTEXT_INITIALIZER;
3288    UText               inputText      = UTEXT_INITIALIZER;
3289    UConverter          *UTF8Converter = NULL;
3290
3291    UErrorCode          status         = U_ZERO_ERROR;
3292    UParseError         pe;
3293    RegexPattern        *parsePat      = NULL;
3294    RegexMatcher        *parseMatcher  = NULL;
3295    RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3296    RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3297    UVector             groupStarts(status);
3298    UVector             groupEnds(status);
3299    UVector             groupStartsUTF8(status);
3300    UVector             groupEndsUTF8(status);
3301    UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3302    UBool               failed         = FALSE;
3303    int32_t             numFinds;
3304    int32_t             i;
3305    UBool               useMatchesFunc   = FALSE;
3306    UBool               useLookingAtFunc = FALSE;
3307    int32_t             regionStart      = -1;
3308    int32_t             regionEnd        = -1;
3309    int32_t             regionStartUTF8  = -1;
3310    int32_t             regionEndUTF8    = -1;
3311
3312
3313    //
3314    //  Compile the caller's pattern
3315    //
3316    uint32_t bflags = 0;
3317    if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3318        bflags |= UREGEX_CASE_INSENSITIVE;
3319    }
3320    if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3321        bflags |= UREGEX_COMMENTS;
3322    }
3323    if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3324        bflags |= UREGEX_DOTALL;
3325    }
3326    if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3327        bflags |= UREGEX_MULTILINE;
3328    }
3329
3330    if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3331        bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3332    }
3333    if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3334        bflags |= UREGEX_UNIX_LINES;
3335    }
3336    if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3337        bflags |= UREGEX_LITERAL;
3338    }
3339
3340
3341    callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3342    if (status != U_ZERO_ERROR) {
3343        #if UCONFIG_NO_BREAK_ITERATION==1
3344        // 'v' test flag means that the test pattern should not compile if ICU was configured
3345        //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3346        if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3347            goto cleanupAndReturn;
3348        }
3349        #endif
3350        if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3351            // Expected pattern compilation error.
3352            if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3353                logln("Pattern Compile returns \"%s\"", u_errorName(status));
3354            }
3355            goto cleanupAndReturn;
3356        } else {
3357            // Unexpected pattern compilation error.
3358            dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3359            goto cleanupAndReturn;
3360        }
3361    }
3362
3363    UTF8Converter = ucnv_open("UTF8", &status);
3364    ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3365
3366    patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3367    status = U_ZERO_ERROR; // buffer overflow
3368    patternChars = new char[patternUTF8Length+1];
3369    pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3370    utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3371
3372    if (status == U_ZERO_ERROR) {
3373        UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3374
3375        if (status != U_ZERO_ERROR) {
3376#if UCONFIG_NO_BREAK_ITERATION==1
3377            // 'v' test flag means that the test pattern should not compile if ICU was configured
3378            //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3379            if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3380                goto cleanupAndReturn;
3381            }
3382#endif
3383            if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3384                // Expected pattern compilation error.
3385                if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3386                    logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3387                }
3388                goto cleanupAndReturn;
3389            } else {
3390                // Unexpected pattern compilation error.
3391                errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3392                goto cleanupAndReturn;
3393            }
3394        }
3395    }
3396
3397    if (UTF8Pattern == NULL) {
3398        // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3399        logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3400        status = U_ZERO_ERROR;
3401    }
3402
3403    if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3404        RegexPatternDump(callerPattern);
3405    }
3406
3407    if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3408        errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3409        goto cleanupAndReturn;
3410    }
3411
3412
3413    //
3414    // Number of times find() should be called on the test string, default to 1
3415    //
3416    numFinds = 1;
3417    for (i=2; i<=9; i++) {
3418        if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3419            if (numFinds != 1) {
3420                errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3421                goto cleanupAndReturn;
3422            }
3423            numFinds = i;
3424        }
3425    }
3426
3427    // 'M' flag.  Use matches() instead of find()
3428    if (flags.indexOf((UChar)0x4d) >= 0) {
3429        useMatchesFunc = TRUE;
3430    }
3431    if (flags.indexOf((UChar)0x4c) >= 0) {
3432        useLookingAtFunc = TRUE;
3433    }
3434
3435    //
3436    //  Find the tags in the input data, remove them, and record the group boundary
3437    //    positions.
3438    //
3439    parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3440    REGEX_CHECK_STATUS_L(line);
3441
3442    unEscapedInput = inputString.unescape();
3443    parseMatcher = parsePat->matcher(unEscapedInput, status);
3444    REGEX_CHECK_STATUS_L(line);
3445    while(parseMatcher->find()) {
3446        parseMatcher->appendReplacement(deTaggedInput, "", status);
3447        REGEX_CHECK_STATUS;
3448        UnicodeString groupNum = parseMatcher->group(2, status);
3449        if (groupNum == "r") {
3450            // <r> or </r>, a region specification within the string
3451            if (parseMatcher->group(1, status) == "/") {
3452                regionEnd = deTaggedInput.length();
3453            } else {
3454                regionStart = deTaggedInput.length();
3455            }
3456        } else {
3457            // <digits> or </digits>, a group match boundary tag.
3458            if (parseMatcher->group(1, status) == "/") {
3459                set(groupEnds, deTaggedInput.length(), groupNum);
3460            } else {
3461                set(groupStarts, deTaggedInput.length(), groupNum);
3462            }
3463        }
3464    }
3465    parseMatcher->appendTail(deTaggedInput);
3466    REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3467    if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3468      errln("mismatched <r> tags");
3469      failed = TRUE;
3470      goto cleanupAndReturn;
3471    }
3472
3473    //
3474    //  Configure the matcher according to the flags specified with this test.
3475    //
3476    matcher = callerPattern->matcher(deTaggedInput, status);
3477    REGEX_CHECK_STATUS_L(line);
3478    if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3479        matcher->setTrace(TRUE);
3480    }
3481
3482    if (UTF8Pattern != NULL) {
3483        inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3484        status = U_ZERO_ERROR; // buffer overflow
3485        inputChars = new char[inputUTF8Length+1];
3486        deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3487        utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3488
3489        if (status == U_ZERO_ERROR) {
3490            UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3491            REGEX_CHECK_STATUS_L(line);
3492        }
3493
3494        if (UTF8Matcher == NULL) {
3495            // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3496          logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3497            status = U_ZERO_ERROR;
3498        }
3499    }
3500
3501    //
3502    //  Generate native indices for UTF8 versions of region and capture group info
3503    //
3504    if (UTF8Matcher != NULL) {
3505        if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3506        if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3507
3508        //  Fill out the native index UVector info.
3509        //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3510        for (i=0; i<groupStarts.size(); i++) {
3511            int32_t  start = groupStarts.elementAti(i);
3512            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3513            if (start >= 0) {
3514                int32_t  startUTF8;
3515                if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3516                    errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3517                    failed = TRUE;
3518                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3519                }
3520                setInt(groupStartsUTF8, startUTF8, i);
3521            }
3522
3523            int32_t  end = groupEnds.elementAti(i);
3524            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3525            if (end >= 0) {
3526                int32_t  endUTF8;
3527                if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3528                    errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3529                    failed = TRUE;
3530                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3531                }
3532                setInt(groupEndsUTF8, endUTF8, i);
3533            }
3534        }
3535    }
3536
3537    if (regionStart>=0) {
3538       matcher->region(regionStart, regionEnd, status);
3539       REGEX_CHECK_STATUS_L(line);
3540       if (UTF8Matcher != NULL) {
3541           UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3542           REGEX_CHECK_STATUS_L(line);
3543       }
3544    }
3545    if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3546        matcher->useAnchoringBounds(FALSE);
3547        if (UTF8Matcher != NULL) {
3548            UTF8Matcher->useAnchoringBounds(FALSE);
3549        }
3550    }
3551    if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3552        matcher->useTransparentBounds(TRUE);
3553        if (UTF8Matcher != NULL) {
3554            UTF8Matcher->useTransparentBounds(TRUE);
3555        }
3556    }
3557
3558
3559
3560    //
3561    // Do a find on the de-tagged input using the caller's pattern
3562    //     TODO: error on count>1 and not find().
3563    //           error on both matches() and lookingAt().
3564    //
3565    for (i=0; i<numFinds; i++) {
3566        if (useMatchesFunc) {
3567            isMatch = matcher->matches(status);
3568            if (UTF8Matcher != NULL) {
3569               isUTF8Match = UTF8Matcher->matches(status);
3570            }
3571        } else  if (useLookingAtFunc) {
3572            isMatch = matcher->lookingAt(status);
3573            if (UTF8Matcher != NULL) {
3574                isUTF8Match = UTF8Matcher->lookingAt(status);
3575            }
3576        } else {
3577            isMatch = matcher->find();
3578            if (UTF8Matcher != NULL) {
3579                isUTF8Match = UTF8Matcher->find();
3580            }
3581        }
3582    }
3583    matcher->setTrace(FALSE);
3584
3585    //
3586    // Match up the groups from the find() with the groups from the tags
3587    //
3588
3589    // number of tags should match number of groups from find operation.
3590    // matcher->groupCount does not include group 0, the entire match, hence the +1.
3591    //   G option in test means that capture group data is not available in the
3592    //     expected results, so the check needs to be suppressed.
3593    if (isMatch == FALSE && groupStarts.size() != 0) {
3594        dataerrln("Error at line %d:  Match expected, but none found.", line);
3595        failed = TRUE;
3596        goto cleanupAndReturn;
3597    } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3598        errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3599        failed = TRUE;
3600        goto cleanupAndReturn;
3601    }
3602
3603    if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3604        // Only check for match / no match.  Don't check capture groups.
3605        if (isMatch && groupStarts.size() == 0) {
3606            errln("Error at line %d:  No match expected, but one found.", line);
3607            failed = TRUE;
3608        } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3609            errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
3610            failed = TRUE;
3611        }
3612        goto cleanupAndReturn;
3613    }
3614
3615    REGEX_CHECK_STATUS_L(line);
3616    for (i=0; i<=matcher->groupCount(); i++) {
3617        int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3618        int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3619        if (matcher->start(i, status) != expectedStart) {
3620            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3621                line, i, expectedStart, matcher->start(i, status));
3622            failed = TRUE;
3623            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3624        } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3625            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3626                  line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3627            failed = TRUE;
3628            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3629        }
3630
3631        int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3632        int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3633        if (matcher->end(i, status) != expectedEnd) {
3634            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3635                line, i, expectedEnd, matcher->end(i, status));
3636            failed = TRUE;
3637            // Error on end position;  keep going; real error is probably yet to come as group
3638            //   end positions work from end of the input data towards the front.
3639        } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3640            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3641                  line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3642            failed = TRUE;
3643            // Error on end position;  keep going; real error is probably yet to come as group
3644            //   end positions work from end of the input data towards the front.
3645        }
3646    }
3647    if ( matcher->groupCount()+1 < groupStarts.size()) {
3648        errln("Error at line %d: Expected %d capture groups, found %d.",
3649            line, groupStarts.size()-1, matcher->groupCount());
3650        failed = TRUE;
3651        }
3652    else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3653        errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3654              line, groupStarts.size()-1, UTF8Matcher->groupCount());
3655        failed = TRUE;
3656    }
3657
3658    if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3659        matcher->requireEnd() == TRUE) {
3660        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3661        failed = TRUE;
3662    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3663        UTF8Matcher->requireEnd() == TRUE) {
3664        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3665        failed = TRUE;
3666    }
3667
3668    if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3669        matcher->requireEnd() == FALSE) {
3670        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3671        failed = TRUE;
3672    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3673        UTF8Matcher->requireEnd() == FALSE) {
3674        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3675        failed = TRUE;
3676    }
3677
3678    if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3679        matcher->hitEnd() == TRUE) {
3680        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3681        failed = TRUE;
3682    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3683               UTF8Matcher->hitEnd() == TRUE) {
3684        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3685        failed = TRUE;
3686    }
3687
3688    if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3689        matcher->hitEnd() == FALSE) {
3690        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3691        failed = TRUE;
3692    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3693               UTF8Matcher->hitEnd() == FALSE) {
3694        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3695        failed = TRUE;
3696    }
3697
3698
3699cleanupAndReturn:
3700    if (failed) {
3701        infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3702            +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3703        // callerPattern->dump();
3704    }
3705    delete parseMatcher;
3706    delete parsePat;
3707    delete UTF8Matcher;
3708    delete UTF8Pattern;
3709    delete matcher;
3710    delete callerPattern;
3711
3712    utext_close(&inputText);
3713    delete[] inputChars;
3714    utext_close(&patternText);
3715    delete[] patternChars;
3716    ucnv_close(UTF8Converter);
3717}
3718
3719
3720
3721
3722//---------------------------------------------------------------------------
3723//
3724//      Errors     Check for error handling in patterns.
3725//
3726//---------------------------------------------------------------------------
3727void RegexTest::Errors() {
3728    // \escape sequences that aren't implemented yet.
3729    //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3730
3731    // Missing close parentheses
3732    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3733    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3734    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3735
3736    // Extra close paren
3737    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3738    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3739    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3740
3741    // Look-ahead, Look-behind
3742    //  TODO:  add tests for unbounded length look-behinds.
3743    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3744
3745    // Attempt to use non-default flags
3746    {
3747        UParseError   pe;
3748        UErrorCode    status = U_ZERO_ERROR;
3749        int32_t       flags  = UREGEX_CANON_EQ |
3750                               UREGEX_COMMENTS         | UREGEX_DOTALL   |
3751                               UREGEX_MULTILINE;
3752        RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3753        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3754        delete pat1;
3755    }
3756
3757
3758    // Quantifiers are allowed only after something that can be quantified.
3759    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3760    REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3761    REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3762
3763    // Mal-formed {min,max} quantifiers
3764    REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3765    REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3766    REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3767    REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3768    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3769    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3770    REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3771    REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3772    REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3773
3774    // Ticket 5389
3775    REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3776
3777    // Invalid Back Reference \0
3778    //    For ICU 3.8 and earlier
3779    //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3780    //
3781    REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3782
3783}
3784
3785
3786//-------------------------------------------------------------------------------
3787//
3788//  Read a text data file, convert it to UChars, and return the data
3789//    in one big UChar * buffer, which the caller must delete.
3790//
3791//--------------------------------------------------------------------------------
3792UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3793                                     const char *defEncoding, UErrorCode &status) {
3794    UChar       *retPtr  = NULL;
3795    char        *fileBuf = NULL;
3796    UConverter* conv     = NULL;
3797    FILE        *f       = NULL;
3798
3799    ulen = 0;
3800    if (U_FAILURE(status)) {
3801        return retPtr;
3802    }
3803
3804    //
3805    //  Open the file.
3806    //
3807    f = fopen(fileName, "rb");
3808    if (f == 0) {
3809        dataerrln("Error opening test data file %s\n", fileName);
3810        status = U_FILE_ACCESS_ERROR;
3811        return NULL;
3812    }
3813    //
3814    //  Read it in
3815    //
3816    int32_t            fileSize;
3817    int32_t            amt_read;
3818
3819    fseek( f, 0, SEEK_END);
3820    fileSize = ftell(f);
3821    fileBuf = new char[fileSize];
3822    fseek(f, 0, SEEK_SET);
3823    amt_read = fread(fileBuf, 1, fileSize, f);
3824    if (amt_read != fileSize || fileSize <= 0) {
3825        errln("Error reading test data file.");
3826        goto cleanUpAndReturn;
3827    }
3828
3829    //
3830    // Look for a Unicode Signature (BOM) on the data just read
3831    //
3832    int32_t        signatureLength;
3833    const char *   fileBufC;
3834    const char*    encoding;
3835
3836    fileBufC = fileBuf;
3837    encoding = ucnv_detectUnicodeSignature(
3838        fileBuf, fileSize, &signatureLength, &status);
3839    if(encoding!=NULL ){
3840        fileBufC  += signatureLength;
3841        fileSize  -= signatureLength;
3842    } else {
3843        encoding = defEncoding;
3844        if (strcmp(encoding, "utf-8") == 0) {
3845            errln("file %s is missing its BOM", fileName);
3846        }
3847    }
3848
3849    //
3850    // Open a converter to take the rule file to UTF-16
3851    //
3852    conv = ucnv_open(encoding, &status);
3853    if (U_FAILURE(status)) {
3854        goto cleanUpAndReturn;
3855    }
3856
3857    //
3858    // Convert the rules to UChar.
3859    //  Preflight first to determine required buffer size.
3860    //
3861    ulen = ucnv_toUChars(conv,
3862        NULL,           //  dest,
3863        0,              //  destCapacity,
3864        fileBufC,
3865        fileSize,
3866        &status);
3867    if (status == U_BUFFER_OVERFLOW_ERROR) {
3868        // Buffer Overflow is expected from the preflight operation.
3869        status = U_ZERO_ERROR;
3870
3871        retPtr = new UChar[ulen+1];
3872        ucnv_toUChars(conv,
3873            retPtr,       //  dest,
3874            ulen+1,
3875            fileBufC,
3876            fileSize,
3877            &status);
3878    }
3879
3880cleanUpAndReturn:
3881    fclose(f);
3882    delete[] fileBuf;
3883    ucnv_close(conv);
3884    if (U_FAILURE(status)) {
3885        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3886        delete []retPtr;
3887        retPtr = 0;
3888        ulen   = 0;
3889    };
3890    return retPtr;
3891}
3892
3893
3894//-------------------------------------------------------------------------------
3895//
3896//   PerlTests  - Run Perl's regular expression tests
3897//                The input file for this test is re_tests, the standard regular
3898//                expression test data distributed with the Perl source code.
3899//
3900//                Here is Perl's description of the test data file:
3901//
3902//        # The tests are in a separate file 't/op/re_tests'.
3903//        # Each line in that file is a separate test.
3904//        # There are five columns, separated by tabs.
3905//        #
3906//        # Column 1 contains the pattern, optionally enclosed in C<''>.
3907//        # Modifiers can be put after the closing C<'>.
3908//        #
3909//        # Column 2 contains the string to be matched.
3910//        #
3911//        # Column 3 contains the expected result:
3912//        #     y   expect a match
3913//        #     n   expect no match
3914//        #     c   expect an error
3915//        # B   test exposes a known bug in Perl, should be skipped
3916//        # b   test exposes a known bug in Perl, should be skipped if noamp
3917//        #
3918//        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3919//        #
3920//        # Column 4 contains a string, usually C<$&>.
3921//        #
3922//        # Column 5 contains the expected result of double-quote
3923//        # interpolating that string after the match, or start of error message.
3924//        #
3925//        # Column 6, if present, contains a reason why the test is skipped.
3926//        # This is printed with "skipped", for harness to pick up.
3927//        #
3928//        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3929//        #
3930//        # If you want to add a regular expression test that can't be expressed
3931//        # in this format, don't add it here: put it in op/pat.t instead.
3932//
3933//        For ICU, if field 3 contains an 'i', the test will be skipped.
//        The test exposes some known incompatibility between ICU and Perl regexps.
3935//        (The i is in addition to whatever was there before.)
3936//
3937//-------------------------------------------------------------------------------
3938void RegexTest::PerlTests() {
3939    char tdd[2048];
3940    const char *srcPath;
3941    UErrorCode  status = U_ZERO_ERROR;
3942    UParseError pe;
3943
3944    //
3945    //  Open and read the test data file.
3946    //
3947    srcPath=getPath(tdd, "re_tests.txt");
3948    if(srcPath==NULL) {
3949        return; /* something went wrong, error already output */
3950    }
3951
3952    int32_t    len;
3953    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3954    if (U_FAILURE(status)) {
3955        return; /* something went wrong, error already output */
3956    }
3957
3958    //
3959    //  Put the test data into a UnicodeString
3960    //
3961    UnicodeString testDataString(FALSE, testData, len);
3962
3963    //
3964    //  Regex to break the input file into lines, and strip the new lines.
3965    //     One line per match, capture group one is the desired data.
3966    //
3967    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3968    if (U_FAILURE(status)) {
3969        dataerrln("RegexPattern::compile() error");
3970        return;
3971    }
3972    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3973
3974    //
3975    //  Regex to split a test file line into fields.
3976    //    There are six fields, separated by tabs.
3977    //
3978    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3979
3980    //
3981    //  Regex to identify test patterns with flag settings, and to separate them.
3982    //    Test patterns with flags look like 'pattern'i
3983    //    Test patterns without flags are not quoted:   pattern
3984    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3985    //
3986    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3987    RegexMatcher* flagMat = flagPat->matcher(status);
3988
3989    //
3990    // The Perl tests reference several perl-isms, which are evaluated/substituted
3991    //   in the test data.  Not being perl, this must be done explicitly.  Here
3992    //   are string constants and REs for these constructs.
3993    //
3994    UnicodeString nulnulSrc("${nulnul}");
3995    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3996    nulnul = nulnul.unescape();
3997
3998    UnicodeString ffffSrc("${ffff}");
3999    UnicodeString ffff("\\uffff", -1, US_INV);
4000    ffff = ffff.unescape();
4001
4002    //  regexp for $-[0], $+[2], etc.
4003    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4004    RegexMatcher *groupsMat = groupsPat->matcher(status);
4005
4006    //  regexp for $0, $1, $2, etc.
4007    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4008    RegexMatcher *cgMat = cgPat->matcher(status);
4009
4010
4011    //
4012    // Main Loop for the Perl Tests, runs once per line from the
4013    //   test data file.
4014    //
4015    int32_t  lineNum = 0;
4016    int32_t  skippedUnimplementedCount = 0;
4017    while (lineMat->find()) {
4018        lineNum++;
4019
4020        //
4021        //  Get a line, break it into its fields, do the Perl
4022        //    variable substitutions.
4023        //
4024        UnicodeString line = lineMat->group(1, status);
4025        UnicodeString fields[7];
4026        fieldPat->split(line, fields, 7, status);
4027
4028        flagMat->reset(fields[0]);
4029        flagMat->matches(status);
4030        UnicodeString pattern  = flagMat->group(2, status);
4031        pattern.findAndReplace("${bang}", "!");
4032        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4033        pattern.findAndReplace(ffffSrc, ffff);
4034
4035        //
4036        //  Identify patterns that include match flag settings,
4037        //    split off the flags, remove the extra quotes.
4038        //
4039        UnicodeString flagStr = flagMat->group(3, status);
4040        if (U_FAILURE(status)) {
4041            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4042            return;
4043        }
4044        int32_t flags = 0;
4045        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4046        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4047        const UChar UChar_m = 0x6d;
4048        const UChar UChar_x = 0x78;
4049        const UChar UChar_y = 0x79;
4050        if (flagStr.indexOf(UChar_i) != -1) {
4051            flags |= UREGEX_CASE_INSENSITIVE;
4052        }
4053        if (flagStr.indexOf(UChar_m) != -1) {
4054            flags |= UREGEX_MULTILINE;
4055        }
4056        if (flagStr.indexOf(UChar_x) != -1) {
4057            flags |= UREGEX_COMMENTS;
4058        }
4059
4060        //
4061        // Compile the test pattern.
4062        //
4063        status = U_ZERO_ERROR;
4064        RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4065        if (status == U_REGEX_UNIMPLEMENTED) {
4066            //
4067            // Test of a feature that is planned for ICU, but not yet implemented.
4068            //   skip the test.
4069            skippedUnimplementedCount++;
4070            delete testPat;
4071            status = U_ZERO_ERROR;
4072            continue;
4073        }
4074
4075        if (U_FAILURE(status)) {
4076            // Some tests are supposed to generate errors.
4077            //   Only report an error for tests that are supposed to succeed.
4078            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4079                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4080            {
4081                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4082            }
4083            status = U_ZERO_ERROR;
4084            delete testPat;
4085            continue;
4086        }
4087
4088        if (fields[2].indexOf(UChar_i) >= 0) {
4089            // ICU should skip this test.
4090            delete testPat;
4091            continue;
4092        }
4093
4094        if (fields[2].indexOf(UChar_c) >= 0) {
4095            // This pattern should have caused a compilation error, but didn't/
4096            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4097            delete testPat;
4098            continue;
4099        }
4100
4101        //
4102        // replace the Perl variables that appear in some of the
4103        //   match data strings.
4104        //
4105        UnicodeString matchString = fields[1];
4106        matchString.findAndReplace(nulnulSrc, nulnul);
4107        matchString.findAndReplace(ffffSrc,   ffff);
4108
4109        // Replace any \n in the match string with an actual new-line char.
4110        //  Don't do full unescape, as this unescapes more than Perl does, which
4111        //  causes other spurious failures in the tests.
4112        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4113
4114
4115
4116        //
4117        // Run the test, check for expected match/don't match result.
4118        //
4119        RegexMatcher *testMat = testPat->matcher(matchString, status);
4120        UBool found = testMat->find();
4121        UBool expected = FALSE;
4122        if (fields[2].indexOf(UChar_y) >=0) {
4123            expected = TRUE;
4124        }
4125        if (expected != found) {
4126            errln("line %d: Expected %smatch, got %smatch",
4127                lineNum, expected?"":"no ", found?"":"no " );
4128            continue;
4129        }
4130
4131        // Don't try to check expected results if there is no match.
4132        //   (Some have stuff in the expected fields)
4133        if (!found) {
4134            delete testMat;
4135            delete testPat;
4136            continue;
4137        }
4138
4139        //
4140        // Interpret the Perl expression from the fourth field of the data file,
4141        // building up an ICU string from the results of the ICU match.
4142        //   The Perl expression will contain references to the results of
4143        //     a regex match, including the matched string, capture group strings,
4144        //     group starting and ending indicies, etc.
4145        //
4146        UnicodeString resultString;
4147        UnicodeString perlExpr = fields[3];
4148#if SUPPORT_MUTATING_INPUT_STRING
4149        groupsMat->reset(perlExpr);
4150        cgMat->reset(perlExpr);
4151#endif
4152
4153        while (perlExpr.length() > 0) {
4154#if !SUPPORT_MUTATING_INPUT_STRING
4155            //  Perferred usage.  Reset after any modification to input string.
4156            groupsMat->reset(perlExpr);
4157            cgMat->reset(perlExpr);
4158#endif
4159
4160            if (perlExpr.startsWith("$&")) {
4161                resultString.append(testMat->group(status));
4162                perlExpr.remove(0, 2);
4163            }
4164
4165            else if (groupsMat->lookingAt(status)) {
4166                // $-[0]   $+[2]  etc.
4167                UnicodeString digitString = groupsMat->group(2, status);
4168                int32_t t = 0;
4169                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4170                UnicodeString plusOrMinus = groupsMat->group(1, status);
4171                int32_t matchPosition;
4172                if (plusOrMinus.compare("+") == 0) {
4173                    matchPosition = testMat->end(groupNum, status);
4174                } else {
4175                    matchPosition = testMat->start(groupNum, status);
4176                }
4177                if (matchPosition != -1) {
4178                    ICU_Utility::appendNumber(resultString, matchPosition);
4179                }
4180                perlExpr.remove(0, groupsMat->end(status));
4181            }
4182
4183            else if (cgMat->lookingAt(status)) {
4184                // $1, $2, $3, etc.
4185                UnicodeString digitString = cgMat->group(1, status);
4186                int32_t t = 0;
4187                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4188                if (U_SUCCESS(status)) {
4189                    resultString.append(testMat->group(groupNum, status));
4190                    status = U_ZERO_ERROR;
4191                }
4192                perlExpr.remove(0, cgMat->end(status));
4193            }
4194
4195            else if (perlExpr.startsWith("@-")) {
4196                int32_t i;
4197                for (i=0; i<=testMat->groupCount(); i++) {
4198                    if (i>0) {
4199                        resultString.append(" ");
4200                    }
4201                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4202                }
4203                perlExpr.remove(0, 2);
4204            }
4205
4206            else if (perlExpr.startsWith("@+")) {
4207                int32_t i;
4208                for (i=0; i<=testMat->groupCount(); i++) {
4209                    if (i>0) {
4210                        resultString.append(" ");
4211                    }
4212                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4213                }
4214                perlExpr.remove(0, 2);
4215            }
4216
4217            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4218                                                     //           or as an escaped sequence (e.g. \n)
4219                if (perlExpr.length() > 1) {
4220                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4221                }
4222                UChar c = perlExpr.charAt(0);
4223                switch (c) {
4224                case 'n':   c = '\n'; break;
4225                // add any other escape sequences that show up in the test expected results.
4226                }
4227                resultString.append(c);
4228                perlExpr.remove(0, 1);
4229            }
4230
4231            else  {
4232                // Any characters from the perl expression that we don't explicitly
4233                //  recognize before here are assumed to be literals and copied
4234                //  as-is to the expected results.
4235                resultString.append(perlExpr.charAt(0));
4236                perlExpr.remove(0, 1);
4237            }
4238
4239            if (U_FAILURE(status)) {
4240                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4241                break;
4242            }
4243        }
4244
4245        //
4246        // Expected Results Compare
4247        //
4248        UnicodeString expectedS(fields[4]);
4249        expectedS.findAndReplace(nulnulSrc, nulnul);
4250        expectedS.findAndReplace(ffffSrc,   ffff);
4251        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4252
4253
4254        if (expectedS.compare(resultString) != 0) {
4255            err("Line %d: Incorrect perl expression results.", lineNum);
4256            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4257        }
4258
4259        delete testMat;
4260        delete testPat;
4261    }
4262
4263    //
4264    // All done.  Clean up allocated stuff.
4265    //
4266    delete cgMat;
4267    delete cgPat;
4268
4269    delete groupsMat;
4270    delete groupsPat;
4271
4272    delete flagMat;
4273    delete flagPat;
4274
4275    delete lineMat;
4276    delete linePat;
4277
4278    delete fieldPat;
4279    delete [] testData;
4280
4281
4282    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4283
4284}
4285
4286
4287//-------------------------------------------------------------------------------
4288//
4289//   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4290//                  (instead of using UnicodeStrings) to test the alternate engine.
4291//                  The input file for this test is re_tests, the standard regular
4292//                  expression test data distributed with the Perl source code.
4293//                  See PerlTests() for more information.
4294//
4295//-------------------------------------------------------------------------------
4296void RegexTest::PerlTestsUTF8() {
4297    char tdd[2048];
4298    const char *srcPath;
4299    UErrorCode  status = U_ZERO_ERROR;
4300    UParseError pe;
4301    LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4302    UText       patternText = UTEXT_INITIALIZER;
4303    char       *patternChars = NULL;
4304    int32_t     patternLength;
4305    int32_t     patternCapacity = 0;
4306    UText       inputText = UTEXT_INITIALIZER;
4307    char       *inputChars = NULL;
4308    int32_t     inputLength;
4309    int32_t     inputCapacity = 0;
4310
4311    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4312
4313    //
4314    //  Open and read the test data file.
4315    //
4316    srcPath=getPath(tdd, "re_tests.txt");
4317    if(srcPath==NULL) {
4318        return; /* something went wrong, error already output */
4319    }
4320
4321    int32_t    len;
4322    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4323    if (U_FAILURE(status)) {
4324        return; /* something went wrong, error already output */
4325    }
4326
4327    //
4328    //  Put the test data into a UnicodeString
4329    //
4330    UnicodeString testDataString(FALSE, testData, len);
4331
4332    //
4333    //  Regex to break the input file into lines, and strip the new lines.
4334    //     One line per match, capture group one is the desired data.
4335    //
4336    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4337    if (U_FAILURE(status)) {
4338        dataerrln("RegexPattern::compile() error");
4339        return;
4340    }
4341    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4342
4343    //
4344    //  Regex to split a test file line into fields.
4345    //    There are six fields, separated by tabs.
4346    //
4347    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4348
4349    //
4350    //  Regex to identify test patterns with flag settings, and to separate them.
4351    //    Test patterns with flags look like 'pattern'i
4352    //    Test patterns without flags are not quoted:   pattern
4353    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4354    //
4355    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4356    RegexMatcher* flagMat = flagPat->matcher(status);
4357
4358    //
4359    // The Perl tests reference several perl-isms, which are evaluated/substituted
4360    //   in the test data.  Not being perl, this must be done explicitly.  Here
4361    //   are string constants and REs for these constructs.
4362    //
4363    UnicodeString nulnulSrc("${nulnul}");
4364    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4365    nulnul = nulnul.unescape();
4366
4367    UnicodeString ffffSrc("${ffff}");
4368    UnicodeString ffff("\\uffff", -1, US_INV);
4369    ffff = ffff.unescape();
4370
4371    //  regexp for $-[0], $+[2], etc.
4372    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4373    RegexMatcher *groupsMat = groupsPat->matcher(status);
4374
4375    //  regexp for $0, $1, $2, etc.
4376    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4377    RegexMatcher *cgMat = cgPat->matcher(status);
4378
4379
4380    //
4381    // Main Loop for the Perl Tests, runs once per line from the
4382    //   test data file.
4383    //
4384    int32_t  lineNum = 0;
4385    int32_t  skippedUnimplementedCount = 0;
4386    while (lineMat->find()) {
4387        lineNum++;
4388
4389        //
4390        //  Get a line, break it into its fields, do the Perl
4391        //    variable substitutions.
4392        //
4393        UnicodeString line = lineMat->group(1, status);
4394        UnicodeString fields[7];
4395        fieldPat->split(line, fields, 7, status);
4396
4397        flagMat->reset(fields[0]);
4398        flagMat->matches(status);
4399        UnicodeString pattern  = flagMat->group(2, status);
4400        pattern.findAndReplace("${bang}", "!");
4401        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4402        pattern.findAndReplace(ffffSrc, ffff);
4403
4404        //
4405        //  Identify patterns that include match flag settings,
4406        //    split off the flags, remove the extra quotes.
4407        //
4408        UnicodeString flagStr = flagMat->group(3, status);
4409        if (U_FAILURE(status)) {
4410            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4411            return;
4412        }
4413        int32_t flags = 0;
4414        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4415        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4416        const UChar UChar_m = 0x6d;
4417        const UChar UChar_x = 0x78;
4418        const UChar UChar_y = 0x79;
4419        if (flagStr.indexOf(UChar_i) != -1) {
4420            flags |= UREGEX_CASE_INSENSITIVE;
4421        }
4422        if (flagStr.indexOf(UChar_m) != -1) {
4423            flags |= UREGEX_MULTILINE;
4424        }
4425        if (flagStr.indexOf(UChar_x) != -1) {
4426            flags |= UREGEX_COMMENTS;
4427        }
4428
4429        //
4430        // Put the pattern in a UTF-8 UText
4431        //
4432        status = U_ZERO_ERROR;
4433        patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4434        if (status == U_BUFFER_OVERFLOW_ERROR) {
4435            status = U_ZERO_ERROR;
4436            delete[] patternChars;
4437            patternCapacity = patternLength + 1;
4438            patternChars = new char[patternCapacity];
4439            pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4440        }
4441        utext_openUTF8(&patternText, patternChars, patternLength, &status);
4442
4443        //
4444        // Compile the test pattern.
4445        //
4446        RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4447        if (status == U_REGEX_UNIMPLEMENTED) {
4448            //
4449            // Test of a feature that is planned for ICU, but not yet implemented.
4450            //   skip the test.
4451            skippedUnimplementedCount++;
4452            delete testPat;
4453            status = U_ZERO_ERROR;
4454            continue;
4455        }
4456
4457        if (U_FAILURE(status)) {
4458            // Some tests are supposed to generate errors.
4459            //   Only report an error for tests that are supposed to succeed.
4460            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4461                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4462            {
4463                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4464            }
4465            status = U_ZERO_ERROR;
4466            delete testPat;
4467            continue;
4468        }
4469
4470        if (fields[2].indexOf(UChar_i) >= 0) {
4471            // ICU should skip this test.
4472            delete testPat;
4473            continue;
4474        }
4475
4476        if (fields[2].indexOf(UChar_c) >= 0) {
4477            // This pattern should have caused a compilation error, but didn't/
4478            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4479            delete testPat;
4480            continue;
4481        }
4482
4483
4484        //
4485        // replace the Perl variables that appear in some of the
4486        //   match data strings.
4487        //
4488        UnicodeString matchString = fields[1];
4489        matchString.findAndReplace(nulnulSrc, nulnul);
4490        matchString.findAndReplace(ffffSrc,   ffff);
4491
4492        // Replace any \n in the match string with an actual new-line char.
4493        //  Don't do full unescape, as this unescapes more than Perl does, which
4494        //  causes other spurious failures in the tests.
4495        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4496
4497        //
4498        // Put the input in a UTF-8 UText
4499        //
4500        status = U_ZERO_ERROR;
4501        inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4502        if (status == U_BUFFER_OVERFLOW_ERROR) {
4503            status = U_ZERO_ERROR;
4504            delete[] inputChars;
4505            inputCapacity = inputLength + 1;
4506            inputChars = new char[inputCapacity];
4507            matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4508        }
4509        utext_openUTF8(&inputText, inputChars, inputLength, &status);
4510
4511        //
4512        // Run the test, check for expected match/don't match result.
4513        //
4514        RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4515        UBool found = testMat->find();
4516        UBool expected = FALSE;
4517        if (fields[2].indexOf(UChar_y) >=0) {
4518            expected = TRUE;
4519        }
4520        if (expected != found) {
4521            errln("line %d: Expected %smatch, got %smatch",
4522                lineNum, expected?"":"no ", found?"":"no " );
4523            continue;
4524        }
4525
4526        // Don't try to check expected results if there is no match.
4527        //   (Some have stuff in the expected fields)
4528        if (!found) {
4529            delete testMat;
4530            delete testPat;
4531            continue;
4532        }
4533
4534        //
4535        // Interpret the Perl expression from the fourth field of the data file,
4536        // building up an ICU string from the results of the ICU match.
4537        //   The Perl expression will contain references to the results of
4538        //     a regex match, including the matched string, capture group strings,
4539        //     group starting and ending indicies, etc.
4540        //
4541        UnicodeString resultString;
4542        UnicodeString perlExpr = fields[3];
4543
4544        while (perlExpr.length() > 0) {
4545            groupsMat->reset(perlExpr);
4546            cgMat->reset(perlExpr);
4547
4548            if (perlExpr.startsWith("$&")) {
4549                resultString.append(testMat->group(status));
4550                perlExpr.remove(0, 2);
4551            }
4552
4553            else if (groupsMat->lookingAt(status)) {
4554                // $-[0]   $+[2]  etc.
4555                UnicodeString digitString = groupsMat->group(2, status);
4556                int32_t t = 0;
4557                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4558                UnicodeString plusOrMinus = groupsMat->group(1, status);
4559                int32_t matchPosition;
4560                if (plusOrMinus.compare("+") == 0) {
4561                    matchPosition = testMat->end(groupNum, status);
4562                } else {
4563                    matchPosition = testMat->start(groupNum, status);
4564                }
4565                if (matchPosition != -1) {
4566                    ICU_Utility::appendNumber(resultString, matchPosition);
4567                }
4568                perlExpr.remove(0, groupsMat->end(status));
4569            }
4570
4571            else if (cgMat->lookingAt(status)) {
4572                // $1, $2, $3, etc.
4573                UnicodeString digitString = cgMat->group(1, status);
4574                int32_t t = 0;
4575                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4576                if (U_SUCCESS(status)) {
4577                    resultString.append(testMat->group(groupNum, status));
4578                    status = U_ZERO_ERROR;
4579                }
4580                perlExpr.remove(0, cgMat->end(status));
4581            }
4582
4583            else if (perlExpr.startsWith("@-")) {
4584                int32_t i;
4585                for (i=0; i<=testMat->groupCount(); i++) {
4586                    if (i>0) {
4587                        resultString.append(" ");
4588                    }
4589                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4590                }
4591                perlExpr.remove(0, 2);
4592            }
4593
4594            else if (perlExpr.startsWith("@+")) {
4595                int32_t i;
4596                for (i=0; i<=testMat->groupCount(); i++) {
4597                    if (i>0) {
4598                        resultString.append(" ");
4599                    }
4600                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4601                }
4602                perlExpr.remove(0, 2);
4603            }
4604
4605            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4606                                                     //           or as an escaped sequence (e.g. \n)
4607                if (perlExpr.length() > 1) {
4608                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4609                }
4610                UChar c = perlExpr.charAt(0);
4611                switch (c) {
4612                case 'n':   c = '\n'; break;
4613                // add any other escape sequences that show up in the test expected results.
4614                }
4615                resultString.append(c);
4616                perlExpr.remove(0, 1);
4617            }
4618
4619            else  {
4620                // Any characters from the perl expression that we don't explicitly
4621                //  recognize before here are assumed to be literals and copied
4622                //  as-is to the expected results.
4623                resultString.append(perlExpr.charAt(0));
4624                perlExpr.remove(0, 1);
4625            }
4626
4627            if (U_FAILURE(status)) {
4628                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4629                break;
4630            }
4631        }
4632
4633        //
4634        // Expected Results Compare
4635        //
4636        UnicodeString expectedS(fields[4]);
4637        expectedS.findAndReplace(nulnulSrc, nulnul);
4638        expectedS.findAndReplace(ffffSrc,   ffff);
4639        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4640
4641
4642        if (expectedS.compare(resultString) != 0) {
4643            err("Line %d: Incorrect perl expression results.", lineNum);
4644            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4645        }
4646
4647        delete testMat;
4648        delete testPat;
4649    }
4650
4651    //
4652    // All done.  Clean up allocated stuff.
4653    //
4654    delete cgMat;
4655    delete cgPat;
4656
4657    delete groupsMat;
4658    delete groupsPat;
4659
4660    delete flagMat;
4661    delete flagPat;
4662
4663    delete lineMat;
4664    delete linePat;
4665
4666    delete fieldPat;
4667    delete [] testData;
4668
4669    utext_close(&patternText);
4670    utext_close(&inputText);
4671
4672    delete [] patternChars;
4673    delete [] inputChars;
4674
4675
4676    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4677
4678}
4679
4680
4681//--------------------------------------------------------------
4682//
4683//  Bug6149   Verify limits to heap expansion for backtrack stack.
4684//             Use this pattern,
4685//                 "(a?){1,}"
4686//             The zero-length match will repeat forever.
4687//                (That this goes into a loop is another bug)
4688//
4689//---------------------------------------------------------------
void RegexTest::Bug6149() {
    // Regression test for bug 6149: matching "(a?){1,}" against "xyz" is expected
    //   to end with U_REGEX_STACK_OVERFLOW rather than run without limit.
    UnicodeString pattern("(a?){1,}");
    UnicodeString s("xyz");
    uint32_t flags = 0;
    UErrorCode status = U_ZERO_ERROR;

    RegexMatcher  matcher(pattern, s, flags, status);
    UBool result = false;
    // NOTE(review): REGEX_ASSERT_FAIL and REGEX_ASSERT are defined outside this
    //   section; presumably the first checks that evaluating the expression
    //   leaves the given error code in 'status' - confirm against the macro
    //   definitions before renaming any of the locals used here.
    REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
    // The aborted match must not be reported as a successful match.
    REGEX_ASSERT(result == FALSE);
 }
4701
4702
4703//
4704//   Callbacks()    Test the callback function.
4705//                  When set, callbacks occur periodically during matching operations,
4706//                  giving the application code the ability to abort the operation
//                  before its normal completion.
4708//
4709
4710struct callBackContext {
4711    RegexTest        *test;
4712    int32_t          maxCalls;
4713    int32_t          numCalls;
4714    int32_t          lastSteps;
4715    void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4716};
4717
4718U_CDECL_BEGIN
4719static UBool U_CALLCONV
4720testCallBackFn(const void *context, int32_t steps) {
4721    callBackContext  *info = (callBackContext *)context;
4722    if (info->lastSteps+1 != steps) {
4723        info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4724    }
4725    info->lastSteps = steps;
4726    info->numCalls++;
4727    return (info->numCalls < info->maxCalls);
4728}
4729U_CDECL_END
4730
4731void RegexTest::Callbacks() {
4732   {
4733        // Getter returns NULLs if no callback has been set
4734
4735        //   The variables that the getter will fill in.
4736        //   Init to non-null values so that the action of the getter can be seen.
4737        const void          *returnedContext = &returnedContext;
4738        URegexMatchCallback *returnedFn = &testCallBackFn;
4739
4740        UErrorCode status = U_ZERO_ERROR;
4741        RegexMatcher matcher("x", 0, status);
4742        REGEX_CHECK_STATUS;
4743        matcher.getMatchCallback(returnedFn, returnedContext, status);
4744        REGEX_CHECK_STATUS;
4745        REGEX_ASSERT(returnedFn == NULL);
4746        REGEX_ASSERT(returnedContext == NULL);
4747    }
4748
4749   {
4750        // Set and Get work
4751        callBackContext cbInfo = {this, 0, 0, 0};
4752        const void          *returnedContext;
4753        URegexMatchCallback *returnedFn;
4754        UErrorCode status = U_ZERO_ERROR;
4755        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4756        REGEX_CHECK_STATUS;
4757        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4758        REGEX_CHECK_STATUS;
4759        matcher.getMatchCallback(returnedFn, returnedContext, status);
4760        REGEX_CHECK_STATUS;
4761        REGEX_ASSERT(returnedFn == testCallBackFn);
4762        REGEX_ASSERT(returnedContext == &cbInfo);
4763
4764        // A short-running match shouldn't invoke the callback
4765        status = U_ZERO_ERROR;
4766        cbInfo.reset(1);
4767        UnicodeString s = "xxx";
4768        matcher.reset(s);
4769        REGEX_ASSERT(matcher.matches(status));
4770        REGEX_CHECK_STATUS;
4771        REGEX_ASSERT(cbInfo.numCalls == 0);
4772
4773        // A medium-length match that runs long enough to invoke the
4774        //   callback, but not so long that the callback aborts it.
4775        status = U_ZERO_ERROR;
4776        cbInfo.reset(4);
4777        s = "aaaaaaaaaaaaaaaaaaab";
4778        matcher.reset(s);
4779        REGEX_ASSERT(matcher.matches(status)==FALSE);
4780        REGEX_CHECK_STATUS;
4781        REGEX_ASSERT(cbInfo.numCalls > 0);
4782
4783        // A longer running match that the callback function will abort.
4784        status = U_ZERO_ERROR;
4785        cbInfo.reset(4);
4786        s = "aaaaaaaaaaaaaaaaaaaaaaab";
4787        matcher.reset(s);
4788        REGEX_ASSERT(matcher.matches(status)==FALSE);
4789        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4790        REGEX_ASSERT(cbInfo.numCalls == 4);
4791    }
4792
4793
4794}
4795
4796
4797//
4798//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during a find operation
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
4802//
4803
4804struct progressCallBackContext {
4805    RegexTest        *test;
4806    int64_t          lastIndex;
4807    int32_t          maxCalls;
4808    int32_t          numCalls;
4809    void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4810};
4811
4812U_CDECL_BEGIN
4813static UBool U_CALLCONV
4814testProgressCallBackFn(const void *context, int64_t matchIndex) {
4815    progressCallBackContext  *info = (progressCallBackContext *)context;
4816    info->numCalls++;
4817    info->lastIndex = matchIndex;
4818//    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4819    return (info->numCalls < info->maxCalls);
4820}
4821U_CDECL_END
4822
4823void RegexTest::FindProgressCallbacks() {
4824   {
4825        // Getter returns NULLs if no callback has been set
4826
4827        //   The variables that the getter will fill in.
4828        //   Init to non-null values so that the action of the getter can be seen.
4829        const void                  *returnedContext = &returnedContext;
4830        URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4831
4832        UErrorCode status = U_ZERO_ERROR;
4833        RegexMatcher matcher("x", 0, status);
4834        REGEX_CHECK_STATUS;
4835        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4836        REGEX_CHECK_STATUS;
4837        REGEX_ASSERT(returnedFn == NULL);
4838        REGEX_ASSERT(returnedContext == NULL);
4839    }
4840
4841   {
4842        // Set and Get work
4843        progressCallBackContext cbInfo = {this, 0, 0, 0};
4844        const void                  *returnedContext;
4845        URegexFindProgressCallback  *returnedFn;
4846        UErrorCode status = U_ZERO_ERROR;
4847        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4848        REGEX_CHECK_STATUS;
4849        matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4850        REGEX_CHECK_STATUS;
4851        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4852        REGEX_CHECK_STATUS;
4853        REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4854        REGEX_ASSERT(returnedContext == &cbInfo);
4855
4856        // A short-running match should NOT invoke the callback.
4857        status = U_ZERO_ERROR;
4858        cbInfo.reset(100);
4859        UnicodeString s = "abxxx";
4860        matcher.reset(s);
4861#if 0
4862        matcher.setTrace(TRUE);
4863#endif
4864        REGEX_ASSERT(matcher.find(0, status));
4865        REGEX_CHECK_STATUS;
4866        REGEX_ASSERT(cbInfo.numCalls == 0);
4867
4868        // A medium running match that causes matcher.find() to invoke our callback for each index.
4869        status = U_ZERO_ERROR;
4870        s = "aaaaaaaaaaaaaaaaaaab";
4871        cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4872        matcher.reset(s);
4873        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4874        REGEX_CHECK_STATUS;
4875        REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4876
4877        // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4878        status = U_ZERO_ERROR;
4879        UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4880        cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4881        matcher.reset(s1);
4882        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4883        REGEX_CHECK_STATUS;
4884        REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4885
4886#if 0
4887        // Now a match that will succeed, but after an interruption
4888        status = U_ZERO_ERROR;
4889        UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4890        cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4891        matcher.reset(s2);
4892        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4893        REGEX_CHECK_STATUS;
4894        // Now retry the match from where left off
4895        cbInfo.maxCalls = 100; //  No callback limit
4896        REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4897        REGEX_CHECK_STATUS;
4898#endif
4899    }
4900
4901
4902}
4903
4904
4905//---------------------------------------------------------------------------
4906//
4907//    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4908//                             UTexts. The pure-C implementation of UText
4909//                             has no mutable backing stores, but we can
4910//                             use UnicodeString here to test the functionality.
4911//
4912//---------------------------------------------------------------------------
4913void RegexTest::PreAllocatedUTextCAPI () {
4914    UErrorCode           status = U_ZERO_ERROR;
4915    URegularExpression  *re;
4916    UText                patternText = UTEXT_INITIALIZER;
4917    UnicodeString        buffer;
4918    UText                bufferText = UTEXT_INITIALIZER;
4919
4920    utext_openUnicodeString(&bufferText, &buffer, &status);
4921
4922    /*
4923     *  getText() and getUText()
4924     */
4925    {
4926        UText  text1 = UTEXT_INITIALIZER;
4927        UText  text2 = UTEXT_INITIALIZER;
4928        UChar  text2Chars[20];
4929        UText  *resultText;
4930
4931        status = U_ZERO_ERROR;
4932        regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4933        regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4934        u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4935        utext_openUChars(&text2, text2Chars, -1, &status);
4936
4937        regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4938        re = uregex_openUText(&patternText, 0, NULL, &status);
4939
4940        /* First set a UText */
4941        uregex_setUText(re, &text1, &status);
4942        resultText = uregex_getUText(re, &bufferText, &status);
4943        REGEX_CHECK_STATUS;
4944        REGEX_ASSERT(resultText == &bufferText);
4945        utext_setNativeIndex(resultText, 0);
4946        utext_setNativeIndex(&text1, 0);
4947        REGEX_ASSERT(testUTextEqual(resultText, &text1));
4948
4949        resultText = uregex_getUText(re, &bufferText, &status);
4950        REGEX_CHECK_STATUS;
4951        REGEX_ASSERT(resultText == &bufferText);
4952        utext_setNativeIndex(resultText, 0);
4953        utext_setNativeIndex(&text1, 0);
4954        REGEX_ASSERT(testUTextEqual(resultText, &text1));
4955
4956        /* Then set a UChar * */
4957        uregex_setText(re, text2Chars, 7, &status);
4958        resultText = uregex_getUText(re, &bufferText, &status);
4959        REGEX_CHECK_STATUS;
4960        REGEX_ASSERT(resultText == &bufferText);
4961        utext_setNativeIndex(resultText, 0);
4962        utext_setNativeIndex(&text2, 0);
4963        REGEX_ASSERT(testUTextEqual(resultText, &text2));
4964
4965        uregex_close(re);
4966        utext_close(&text1);
4967        utext_close(&text2);
4968    }
4969
4970    /*
4971     *  group()
4972     */
4973    {
4974        UChar    text1[80];
4975        UText   *actual;
4976        UBool    result;
4977        u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
4978
4979        status = U_ZERO_ERROR;
4980        re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4981        REGEX_CHECK_STATUS;
4982
4983        uregex_setText(re, text1, -1, &status);
4984        result = uregex_find(re, 0, &status);
4985        REGEX_ASSERT(result==TRUE);
4986
4987        /*  Capture Group 0, the full match.  Should succeed.  */
4988        status = U_ZERO_ERROR;
4989        actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
4990        REGEX_CHECK_STATUS;
4991        REGEX_ASSERT(actual == &bufferText);
4992        REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
4993
4994        /*  Capture group #1.  Should succeed. */
4995        status = U_ZERO_ERROR;
4996        actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
4997        REGEX_CHECK_STATUS;
4998        REGEX_ASSERT(actual == &bufferText);
4999        REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
5000
5001        /*  Capture group out of range.  Error. */
5002        status = U_ZERO_ERROR;
5003        actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
5004        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5005        REGEX_ASSERT(actual == &bufferText);
5006
5007        uregex_close(re);
5008
5009    }
5010
5011    /*
5012     *  replaceFirst()
5013     */
5014    {
5015        UChar    text1[80];
5016        UChar    text2[80];
5017        UText    replText = UTEXT_INITIALIZER;
5018        UText   *result;
5019
5020        status = U_ZERO_ERROR;
5021        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5022        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5023        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5024
5025        re = uregex_openC("x(.*?)x", 0, NULL, &status);
5026        REGEX_CHECK_STATUS;
5027
5028        /*  Normal case, with match */
5029        uregex_setText(re, text1, -1, &status);
5030        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5031        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5032        REGEX_CHECK_STATUS;
5033        REGEX_ASSERT(result == &bufferText);
5034        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5035
5036        /* No match.  Text should copy to output with no changes.  */
5037        uregex_setText(re, text2, -1, &status);
5038        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5039        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5040        REGEX_CHECK_STATUS;
5041        REGEX_ASSERT(result == &bufferText);
5042        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5043
5044        /* Unicode escapes */
5045        uregex_setText(re, text1, -1, &status);
5046        regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
5047        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5048        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5049        REGEX_CHECK_STATUS;
5050        REGEX_ASSERT(result == &bufferText);
5051        REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5052
5053        uregex_close(re);
5054        utext_close(&replText);
5055    }
5056
5057
5058    /*
5059     *  replaceAll()
5060     */
5061    {
5062        UChar    text1[80];
5063        UChar    text2[80];
5064        UText    replText = UTEXT_INITIALIZER;
5065        UText   *result;
5066
5067        status = U_ZERO_ERROR;
5068        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5069        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5070        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5071
5072        re = uregex_openC("x(.*?)x", 0, NULL, &status);
5073        REGEX_CHECK_STATUS;
5074
5075        /*  Normal case, with match */
5076        uregex_setText(re, text1, -1, &status);
5077        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5078        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5079        REGEX_CHECK_STATUS;
5080        REGEX_ASSERT(result == &bufferText);
5081        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5082
5083        /* No match.  Text should copy to output with no changes.  */
5084        uregex_setText(re, text2, -1, &status);
5085        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5086        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5087        REGEX_CHECK_STATUS;
5088        REGEX_ASSERT(result == &bufferText);
5089        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5090
5091        uregex_close(re);
5092        utext_close(&replText);
5093    }
5094
5095
5096    /*
5097     *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5098     *   so we don't need to test it here.
5099     */
5100
5101    utext_close(&bufferText);
5102    utext_close(&patternText);
5103}
5104
5105//--------------------------------------------------------------
5106//
5107//  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5108//
5109//---------------------------------------------------------------
5110void RegexTest::Bug7651() {
5111    UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5112    //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5113    //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5114    UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5115    UnicodeString s("#ff @abcd This is test");
5116    RegexPattern  *REPattern = NULL;
5117    RegexMatcher  *REMatcher = NULL;
5118    UErrorCode status = U_ZERO_ERROR;
5119    UParseError pe;
5120
5121    REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5122    REGEX_CHECK_STATUS;
5123    REMatcher = REPattern->matcher(s, status);
5124    REGEX_CHECK_STATUS;
5125    REGEX_ASSERT(REMatcher->find());
5126    REGEX_ASSERT(REMatcher->start(status) == 0);
5127    delete REPattern;
5128    delete REMatcher;
5129    status = U_ZERO_ERROR;
5130
5131    REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5132    REGEX_CHECK_STATUS;
5133    REMatcher = REPattern->matcher(s, status);
5134    REGEX_CHECK_STATUS;
5135    REGEX_ASSERT(REMatcher->find());
5136    REGEX_ASSERT(REMatcher->start(status) == 0);
5137    delete REPattern;
5138    delete REMatcher;
5139    status = U_ZERO_ERROR;
5140 }
5141
5142void RegexTest::Bug7740() {
5143    UErrorCode status = U_ZERO_ERROR;
5144    UnicodeString pattern = "(a)";
5145    UnicodeString text = "abcdef";
5146    RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5147    REGEX_CHECK_STATUS;
5148    REGEX_ASSERT(m->lookingAt(status));
5149    REGEX_CHECK_STATUS;
5150    status = U_ILLEGAL_ARGUMENT_ERROR;
5151    UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5152    REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5153    REGEX_ASSERT(s == "");
5154    delete m;
5155}
5156
// Bug 8479:  was crashing with a Bogus UnicodeString as input.
5158
5159void RegexTest::Bug8479() {
5160    UErrorCode status = U_ZERO_ERROR;
5161
5162    RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5163    REGEX_CHECK_STATUS;
5164    if (U_SUCCESS(status))
5165    {
5166        UnicodeString str;
5167        str.setToBogus();
5168        pMatcher->reset(str);
5169        status = U_ZERO_ERROR;
5170        pMatcher->matches(status);
5171        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5172        delete pMatcher;
5173    }
5174}
5175
5176
5177// Bug 7029
5178void RegexTest::Bug7029() {
5179    UErrorCode status = U_ZERO_ERROR;
5180
5181    RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5182    UnicodeString text = "abc.def";
5183    UnicodeString splits[10];
5184    REGEX_CHECK_STATUS;
5185    int32_t numFields = pMatcher->split(text, splits, 10, status);
5186    REGEX_CHECK_STATUS;
5187    REGEX_ASSERT(numFields == 8);
5188    delete pMatcher;
5189}
5190
5191// Bug 9283
//   This test is checking for the existence of any supplemental characters that case-fold
//   to a bmp character.
//
//   At the time of this writing there are none. If any should appear in a subsequent release
//   of Unicode, the code in regular expressions compilation that determines the longest
//   possible match for a literal string  will need to be enhanced.
5198//
5199//   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5200//   for details on what to do in case of a failure of this test.
5201//
5202void RegexTest::Bug9283() {
5203    UErrorCode status = U_ZERO_ERROR;
5204    UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5205    REGEX_CHECK_STATUS;
5206    int32_t index;
5207    UChar32 c;
5208    for (index=0; ; index++) {
5209        c = supplementalsWithCaseFolding.charAt(index);
5210        if (c == -1) {
5211            break;
5212        }
5213        UnicodeString cf = UnicodeString(c).foldCase();
5214        REGEX_ASSERT(cf.length() >= 2);
5215    }
5216}
5217
5218
5219void RegexTest::CheckInvBufSize() {
5220  if(inv_next>=INV_BUFSIZ) {
5221    errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5222          __FILE__, INV_BUFSIZ, inv_next);
5223  } else {
5224    logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5225  }
5226}
5227
5228#endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5229
5230