1/*
2**************************************************************************
3*   Copyright (C) 2002-2012 International Business Machines Corporation  *
4*   and others. All rights reserved.                                     *
5**************************************************************************
6*/
7//
8//  file:  rematch.cpp
9//
10//         Contains the implementation of class RegexMatcher,
11//         which is one of the main API classes for the ICU regular expression package.
12//
13
14#include "unicode/utypes.h"
15#if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17#include "unicode/regex.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/ustring.h"
21#include "unicode/rbbi.h"
22#include "unicode/utf.h"
23#include "unicode/utf16.h"
24#include "uassert.h"
25#include "cmemory.h"
26#include "uvector.h"
27#include "uvectr32.h"
28#include "uvectr64.h"
29#include "regeximp.h"
30#include "regexst.h"
31#include "regextxt.h"
32#include "ucase.h"
33
34// #include <malloc.h>        // Needed for heapcheck testing
35
36
37// Find progress callback
38// ----------------------
39// Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
40//
41#define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
42    (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
43
44
45// Smart Backtracking
46// ------------------
47// When a failure would go back to a LOOP_C instruction,
48// strings, characters, and setrefs scan backwards for a valid start
49// character themselves, pop the stack, and save state, emulating the
50// LOOP_C's effect but assured that the next character of input is a
51// possible matching character.
52//
53// Good idea in theory; unfortunately it only helps out a few specific
54// cases and slows the engine down a little in the rest.
55
56U_NAMESPACE_BEGIN
57
58// Default limit for the size of the back track stack, to avoid system
59//    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
60// This value puts ICU's limits higher than most other regexp implementations,
61//    which use recursion rather than the heap, and take more storage per
62//    backtrack point.
63//
64static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65
66// Time limit counter constant.
67//   Time limits for expression evaluation are in terms of quanta of work by
68//   the engine, each of which is 10,000 state saves.
69//   This constant determines that state saves per tick number.
70static const int32_t TIMER_INITIAL_VALUE = 10000;
71
72//-----------------------------------------------------------------------------
73//
74//   Constructor and Destructor
75//
76//-----------------------------------------------------------------------------
77RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
78    fDeferredStatus = U_ZERO_ERROR;
79    init(fDeferredStatus);
80    if (U_FAILURE(fDeferredStatus)) {
81        return;
82    }
83    if (pat==NULL) {
84        fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85        return;
86    }
87    fPattern = pat;
88    init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89}
90
91
92
93RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
94                           uint32_t flags, UErrorCode &status) {
95    init(status);
96    if (U_FAILURE(status)) {
97        return;
98    }
99    UParseError    pe;
100    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
101    fPattern           = fPatternOwned;
102
103    UText inputText = UTEXT_INITIALIZER;
104    utext_openConstUnicodeString(&inputText, &input, &status);
105    init2(&inputText, status);
106    utext_close(&inputText);
107
108    fInputUniStrMaybeMutable = TRUE;
109}
110
111
112RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113                           uint32_t flags, UErrorCode &status) {
114    init(status);
115    if (U_FAILURE(status)) {
116        return;
117    }
118    UParseError    pe;
119    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
120    if (U_FAILURE(status)) {
121        return;
122    }
123
124    fPattern           = fPatternOwned;
125    init2(input, status);
126}
127
128
129RegexMatcher::RegexMatcher(const UnicodeString &regexp,
130                           uint32_t flags, UErrorCode &status) {
131    init(status);
132    if (U_FAILURE(status)) {
133        return;
134    }
135    UParseError    pe;
136    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
137    if (U_FAILURE(status)) {
138        return;
139    }
140    fPattern           = fPatternOwned;
141    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142}
143
144RegexMatcher::RegexMatcher(UText *regexp,
145                           uint32_t flags, UErrorCode &status) {
146    init(status);
147    if (U_FAILURE(status)) {
148        return;
149    }
150    UParseError    pe;
151    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
152        if (U_FAILURE(status)) {
153        return;
154    }
155
156    fPattern           = fPatternOwned;
157    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158}
159
160
161
162
163RegexMatcher::~RegexMatcher() {
164    delete fStack;
165    if (fData != fSmallData) {
166        uprv_free(fData);
167        fData = NULL;
168    }
169    if (fPatternOwned) {
170        delete fPatternOwned;
171        fPatternOwned = NULL;
172        fPattern = NULL;
173    }
174
175    if (fInput) {
176        delete fInput;
177    }
178    if (fInputText) {
179        utext_close(fInputText);
180    }
181    if (fAltInputText) {
182        utext_close(fAltInputText);
183    }
184
185    #if UCONFIG_NO_BREAK_ITERATION==0
186    delete fWordBreakItr;
187    #endif
188}
189
190//
191//   init()   common initialization for use by all constructors.
192//            Initialize all fields, get the object into a consistent state.
193//            This must be done even when the initial status shows an error,
194//            so that the object is initialized sufficiently well for the destructor
195//            to run safely.
196//
197void RegexMatcher::init(UErrorCode &status) {
198    fPattern           = NULL;
199    fPatternOwned      = NULL;
200    fFrameSize         = 0;
201    fRegionStart       = 0;
202    fRegionLimit       = 0;
203    fAnchorStart       = 0;
204    fAnchorLimit       = 0;
205    fLookStart         = 0;
206    fLookLimit         = 0;
207    fActiveStart       = 0;
208    fActiveLimit       = 0;
209    fTransparentBounds = FALSE;
210    fAnchoringBounds   = TRUE;
211    fMatch             = FALSE;
212    fMatchStart        = 0;
213    fMatchEnd          = 0;
214    fLastMatchEnd      = -1;
215    fAppendPosition    = 0;
216    fHitEnd            = FALSE;
217    fRequireEnd        = FALSE;
218    fStack             = NULL;
219    fFrame             = NULL;
220    fTimeLimit         = 0;
221    fTime              = 0;
222    fTickCounter       = 0;
223    fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
224    fCallbackFn        = NULL;
225    fCallbackContext   = NULL;
226    fFindProgressCallbackFn      = NULL;
227    fFindProgressCallbackContext = NULL;
228    fTraceDebug        = FALSE;
229    fDeferredStatus    = status;
230    fData              = fSmallData;
231    fWordBreakItr      = NULL;
232
233    fStack             = NULL;
234    fInputText         = NULL;
235    fAltInputText      = NULL;
236    fInput             = NULL;
237    fInputLength       = 0;
238    fInputUniStrMaybeMutable = FALSE;
239
240    if (U_FAILURE(status)) {
241        fDeferredStatus = status;
242    }
243}
244
245//
246//  init2()   Common initialization for use by RegexMatcher constructors, part 2.
247//            This handles the common setup to be done after the Pattern is available.
248//
249void RegexMatcher::init2(UText *input, UErrorCode &status) {
250    if (U_FAILURE(status)) {
251        fDeferredStatus = status;
252        return;
253    }
254
255    if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256        fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257        if (fData == NULL) {
258            status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259            return;
260        }
261    }
262
263    fStack = new UVector64(status);
264    if (fStack == NULL) {
265        status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
266        return;
267    }
268
269    reset(input);
270    setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
271    if (U_FAILURE(status)) {
272        fDeferredStatus = status;
273        return;
274    }
275}
276
277
278static const UChar BACKSLASH  = 0x5c;
279static const UChar DOLLARSIGN = 0x24;
280//--------------------------------------------------------------------------------
281//
282//    appendReplacement
283//
284//--------------------------------------------------------------------------------
285RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
286                                              const UnicodeString &replacement,
287                                              UErrorCode &status) {
288    UText replacementText = UTEXT_INITIALIZER;
289
290    utext_openConstUnicodeString(&replacementText, &replacement, &status);
291    if (U_SUCCESS(status)) {
292        UText resultText = UTEXT_INITIALIZER;
293        utext_openUnicodeString(&resultText, &dest, &status);
294
295        if (U_SUCCESS(status)) {
296            appendReplacement(&resultText, &replacementText, status);
297            utext_close(&resultText);
298        }
299        utext_close(&replacementText);
300    }
301
302    return *this;
303}
304
305//
306//    appendReplacement, UText mode
307//
308RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
309                                              UText *replacement,
310                                              UErrorCode &status) {
311    if (U_FAILURE(status)) {
312        return *this;
313    }
314    if (U_FAILURE(fDeferredStatus)) {
315        status = fDeferredStatus;
316        return *this;
317    }
318    if (fMatch == FALSE) {
319        status = U_REGEX_INVALID_STATE;
320        return *this;
321    }
322
323    // Copy input string from the end of previous match to start of current match
324    int64_t  destLen = utext_nativeLength(dest);
325    if (fMatchStart > fAppendPosition) {
326        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
327            destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
328                                     (int32_t)(fMatchStart-fAppendPosition), &status);
329        } else {
330            int32_t len16;
331            if (UTEXT_USES_U16(fInputText)) {
332                len16 = (int32_t)(fMatchStart-fAppendPosition);
333            } else {
334                UErrorCode lengthStatus = U_ZERO_ERROR;
335                len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
336            }
337            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
338            if (inputChars == NULL) {
339                status = U_MEMORY_ALLOCATION_ERROR;
340                return *this;
341            }
342            utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
343            destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
344            uprv_free(inputChars);
345        }
346    }
347    fAppendPosition = fMatchEnd;
348
349
350    // scan the replacement text, looking for substitutions ($n) and \escapes.
351    //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
352    //         move entire ranges not containing substitutions.
353    UTEXT_SETNATIVEINDEX(replacement, 0);
354    UChar32 c = UTEXT_NEXT32(replacement);
355    while (c != U_SENTINEL) {
356        if (c == BACKSLASH) {
357            // Backslash Escape.  Copy the following char out without further checks.
358            //                    Note:  Surrogate pairs don't need any special handling
359            //                           The second half wont be a '$' or a '\', and
360            //                           will move to the dest normally on the next
361            //                           loop iteration.
362            c = UTEXT_CURRENT32(replacement);
363            if (c == U_SENTINEL) {
364                break;
365            }
366
367            if (c==0x55/*U*/ || c==0x75/*u*/) {
368                // We have a \udddd or \Udddddddd escape sequence.
369                int32_t offset = 0;
370                struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
371                UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
372                if (escapedChar != (UChar32)0xFFFFFFFF) {
373                    if (U_IS_BMP(escapedChar)) {
374                        UChar c16 = (UChar)escapedChar;
375                        destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
376                    } else {
377                        UChar surrogate[2];
378                        surrogate[0] = U16_LEAD(escapedChar);
379                        surrogate[1] = U16_TRAIL(escapedChar);
380                        if (U_SUCCESS(status)) {
381                            destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
382                        }
383                    }
384                    // TODO:  Report errors for mal-formed \u escapes?
385                    //        As this is, the original sequence is output, which may be OK.
386                    if (context.lastOffset == offset) {
387                        (void)UTEXT_PREVIOUS32(replacement);
388                    } else if (context.lastOffset != offset-1) {
389                        utext_moveIndex32(replacement, offset - context.lastOffset - 1);
390                    }
391                }
392            } else {
393                (void)UTEXT_NEXT32(replacement);
394                // Plain backslash escape.  Just put out the escaped character.
395                if (U_IS_BMP(c)) {
396                    UChar c16 = (UChar)c;
397                    destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
398                } else {
399                    UChar surrogate[2];
400                    surrogate[0] = U16_LEAD(c);
401                    surrogate[1] = U16_TRAIL(c);
402                    if (U_SUCCESS(status)) {
403                        destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
404                    }
405                }
406            }
407        } else if (c != DOLLARSIGN) {
408            // Normal char, not a $.  Copy it out without further checks.
409            if (U_IS_BMP(c)) {
410                UChar c16 = (UChar)c;
411                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
412            } else {
413                UChar surrogate[2];
414                surrogate[0] = U16_LEAD(c);
415                surrogate[1] = U16_TRAIL(c);
416                if (U_SUCCESS(status)) {
417                    destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
418                }
419            }
420        } else {
421            // We've got a $.  Pick up a capture group number if one follows.
422            // Consume at most the number of digits necessary for the largest capture
423            // number that is valid for this pattern.
424
425            int32_t numDigits = 0;
426            int32_t groupNum  = 0;
427            UChar32 digitC;
428            for (;;) {
429                digitC = UTEXT_CURRENT32(replacement);
430                if (digitC == U_SENTINEL) {
431                    break;
432                }
433                if (u_isdigit(digitC) == FALSE) {
434                    break;
435                }
436                (void)UTEXT_NEXT32(replacement);
437                groupNum=groupNum*10 + u_charDigitValue(digitC);
438                numDigits++;
439                if (numDigits >= fPattern->fMaxCaptureDigits) {
440                    break;
441                }
442            }
443
444
445            if (numDigits == 0) {
446                // The $ didn't introduce a group number at all.
447                // Treat it as just part of the substitution text.
448                UChar c16 = DOLLARSIGN;
449                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
450            } else {
451                // Finally, append the capture group data to the destination.
452                destLen += appendGroup(groupNum, dest, status);
453                if (U_FAILURE(status)) {
454                    // Can fail if group number is out of range.
455                    break;
456                }
457            }
458        }
459
460        if (U_FAILURE(status)) {
461            break;
462        } else {
463            c = UTEXT_NEXT32(replacement);
464        }
465    }
466
467    return *this;
468}
469
470
471
472//--------------------------------------------------------------------------------
473//
474//    appendTail     Intended to be used in conjunction with appendReplacement()
475//                   To the destination string, append everything following
476//                   the last match position from the input string.
477//
478//                   Note:  Match ranges do not affect appendTail or appendReplacement
479//
480//--------------------------------------------------------------------------------
481UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
482    UErrorCode status = U_ZERO_ERROR;
483    UText resultText = UTEXT_INITIALIZER;
484    utext_openUnicodeString(&resultText, &dest, &status);
485
486    if (U_SUCCESS(status)) {
487        appendTail(&resultText, status);
488        utext_close(&resultText);
489    }
490
491    return dest;
492}
493
494//
495//   appendTail, UText mode
496//
497UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
498    UBool bailOut = FALSE;
499    if (U_FAILURE(status)) {
500        bailOut = TRUE;
501    }
502    if (U_FAILURE(fDeferredStatus)) {
503        status = fDeferredStatus;
504        bailOut = TRUE;
505    }
506
507    if (bailOut) {
508        //  dest must not be NULL
509        if (dest) {
510            utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
511            return dest;
512        }
513    }
514
515    if (fInputLength > fAppendPosition) {
516        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517            int64_t destLen = utext_nativeLength(dest);
518            utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519                          (int32_t)(fInputLength-fAppendPosition), &status);
520        } else {
521            int32_t len16;
522            if (UTEXT_USES_U16(fInputText)) {
523                len16 = (int32_t)(fInputLength-fAppendPosition);
524            } else {
525                len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526                status = U_ZERO_ERROR; // buffer overflow
527            }
528
529            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530            if (inputChars == NULL) {
531                fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532            } else {
533                utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534                int64_t destLen = utext_nativeLength(dest);
535                utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536                uprv_free(inputChars);
537            }
538        }
539    }
540    return dest;
541}
542
543
544
545//--------------------------------------------------------------------------------
546//
547//   end
548//
549//--------------------------------------------------------------------------------
550int32_t RegexMatcher::end(UErrorCode &err) const {
551    return end(0, err);
552}
553
554int64_t RegexMatcher::end64(UErrorCode &err) const {
555    return end64(0, err);
556}
557
558int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559    if (U_FAILURE(err)) {
560        return -1;
561    }
562    if (fMatch == FALSE) {
563        err = U_REGEX_INVALID_STATE;
564        return -1;
565    }
566    if (group < 0 || group > fPattern->fGroupMap->size()) {
567        err = U_INDEX_OUTOFBOUNDS_ERROR;
568        return -1;
569    }
570    int64_t e = -1;
571    if (group == 0) {
572        e = fMatchEnd;
573    } else {
574        // Get the position within the stack frame of the variables for
575        //    this capture group.
576        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577        U_ASSERT(groupOffset < fPattern->fFrameSize);
578        U_ASSERT(groupOffset >= 0);
579        e = fFrame->fExtra[groupOffset + 1];
580    }
581
582        return e;
583}
584
585int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586    return (int32_t)end64(group, err);
587}
588
589
590//--------------------------------------------------------------------------------
591//
592//   find()
593//
594//--------------------------------------------------------------------------------
595UBool RegexMatcher::find() {
596    // Start at the position of the last match end.  (Will be zero if the
597    //   matcher has been reset.)
598    //
599    if (U_FAILURE(fDeferredStatus)) {
600        return FALSE;
601    }
602
603    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
604        return findUsingChunk();
605    }
606
607    int64_t startPos = fMatchEnd;
608    if (startPos==0) {
609        startPos = fActiveStart;
610    }
611
612    if (fMatch) {
613        // Save the position of any previous successful match.
614        fLastMatchEnd = fMatchEnd;
615
616        if (fMatchStart == fMatchEnd) {
617            // Previous match had zero length.  Move start position up one position
618            //  to avoid sending find() into a loop on zero-length matches.
619            if (startPos >= fActiveLimit) {
620                fMatch = FALSE;
621                fHitEnd = TRUE;
622                return FALSE;
623            }
624            UTEXT_SETNATIVEINDEX(fInputText, startPos);
625            (void)UTEXT_NEXT32(fInputText);
626            startPos = UTEXT_GETNATIVEINDEX(fInputText);
627        }
628    } else {
629        if (fLastMatchEnd >= 0) {
630            // A previous find() failed to match.  Don't try again.
631            //   (without this test, a pattern with a zero-length match
632            //    could match again at the end of an input string.)
633            fHitEnd = TRUE;
634            return FALSE;
635        }
636    }
637
638
639    // Compute the position in the input string beyond which a match can not begin, because
640    //   the minimum length match would extend past the end of the input.
641    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
642    //          Be aware of possible overflows if making changes here.
643    int64_t testStartLimit;
644    if (UTEXT_USES_U16(fInputText)) {
645        testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
646        if (startPos > testStartLimit) {
647            fMatch = FALSE;
648            fHitEnd = TRUE;
649            return FALSE;
650        }
651    } else {
652        // For now, let the matcher discover that it can't match on its own
653        // We don't know how long the match len is in native characters
654        testStartLimit = fActiveLimit;
655    }
656
657    UChar32  c;
658    U_ASSERT(startPos >= 0);
659
660    switch (fPattern->fStartType) {
661    case START_NO_INFO:
662        // No optimization was found.
663        //  Try a match at each input position.
664        for (;;) {
665            MatchAt(startPos, FALSE, fDeferredStatus);
666            if (U_FAILURE(fDeferredStatus)) {
667                return FALSE;
668            }
669            if (fMatch) {
670                return TRUE;
671            }
672            if (startPos >= testStartLimit) {
673                fHitEnd = TRUE;
674                return FALSE;
675            }
676            UTEXT_SETNATIVEINDEX(fInputText, startPos);
677            (void)UTEXT_NEXT32(fInputText);
678            startPos = UTEXT_GETNATIVEINDEX(fInputText);
679            // Note that it's perfectly OK for a pattern to have a zero-length
680            //   match at the end of a string, so we must make sure that the loop
681            //   runs with startPos == testStartLimit the last time through.
682            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
683                return FALSE;
684        }
685        U_ASSERT(FALSE);
686
687    case START_START:
688        // Matches are only possible at the start of the input string
689        //   (pattern begins with ^ or \A)
690        if (startPos > fActiveStart) {
691            fMatch = FALSE;
692            return FALSE;
693        }
694        MatchAt(startPos, FALSE, fDeferredStatus);
695        if (U_FAILURE(fDeferredStatus)) {
696            return FALSE;
697        }
698        return fMatch;
699
700
701    case START_SET:
702        {
703            // Match may start on any char from a pre-computed set.
704            U_ASSERT(fPattern->fMinMatchLen > 0);
705            int64_t pos;
706            UTEXT_SETNATIVEINDEX(fInputText, startPos);
707            for (;;) {
708                c = UTEXT_NEXT32(fInputText);
709                pos = UTEXT_GETNATIVEINDEX(fInputText);
710                // c will be -1 (U_SENTINEL) at end of text, in which case we
711                // skip this next block (so we don't have a negative array index)
712                // and handle end of text in the following block.
713                if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
714                              (c>=256 && fPattern->fInitialChars->contains(c)))) {
715                    MatchAt(startPos, FALSE, fDeferredStatus);
716                    if (U_FAILURE(fDeferredStatus)) {
717                        return FALSE;
718                    }
719                    if (fMatch) {
720                        return TRUE;
721                    }
722                    UTEXT_SETNATIVEINDEX(fInputText, pos);
723                }
724                if (startPos >= testStartLimit) {
725                    fMatch = FALSE;
726                    fHitEnd = TRUE;
727                    return FALSE;
728                }
729                startPos = pos;
730	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
731                    return FALSE;
732            }
733        }
734        U_ASSERT(FALSE);
735
736    case START_STRING:
737    case START_CHAR:
738        {
739            // Match starts on exactly one char.
740            U_ASSERT(fPattern->fMinMatchLen > 0);
741            UChar32 theChar = fPattern->fInitialChar;
742            int64_t pos;
743            UTEXT_SETNATIVEINDEX(fInputText, startPos);
744            for (;;) {
745                c = UTEXT_NEXT32(fInputText);
746                pos = UTEXT_GETNATIVEINDEX(fInputText);
747                if (c == theChar) {
748                    MatchAt(startPos, FALSE, fDeferredStatus);
749                    if (U_FAILURE(fDeferredStatus)) {
750                        return FALSE;
751                    }
752                    if (fMatch) {
753                        return TRUE;
754                    }
755                    UTEXT_SETNATIVEINDEX(fInputText, pos);
756                }
757                if (startPos >= testStartLimit) {
758                    fMatch = FALSE;
759                    fHitEnd = TRUE;
760                    return FALSE;
761                }
762                startPos = pos;
763	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
764                    return FALSE;
765           }
766        }
767        U_ASSERT(FALSE);
768
769    case START_LINE:
770        {
771            UChar32  c;
772            if (startPos == fAnchorStart) {
773                MatchAt(startPos, FALSE, fDeferredStatus);
774                if (U_FAILURE(fDeferredStatus)) {
775                    return FALSE;
776                }
777                if (fMatch) {
778                    return TRUE;
779                }
780                UTEXT_SETNATIVEINDEX(fInputText, startPos);
781                c = UTEXT_NEXT32(fInputText);
782                startPos = UTEXT_GETNATIVEINDEX(fInputText);
783            } else {
784                UTEXT_SETNATIVEINDEX(fInputText, startPos);
785                c = UTEXT_PREVIOUS32(fInputText);
786                UTEXT_SETNATIVEINDEX(fInputText, startPos);
787            }
788
789            if (fPattern->fFlags & UREGEX_UNIX_LINES) {
790                for (;;) {
791                    if (c == 0x0a) {
792                            MatchAt(startPos, FALSE, fDeferredStatus);
793                            if (U_FAILURE(fDeferredStatus)) {
794                                return FALSE;
795                            }
796                            if (fMatch) {
797                                return TRUE;
798                            }
799                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
800                    }
801                    if (startPos >= testStartLimit) {
802                        fMatch = FALSE;
803                        fHitEnd = TRUE;
804                        return FALSE;
805                    }
806                    c = UTEXT_NEXT32(fInputText);
807                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
808                    // Note that it's perfectly OK for a pattern to have a zero-length
809                    //   match at the end of a string, so we must make sure that the loop
810                    //   runs with startPos == testStartLimit the last time through.
811		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
812                        return FALSE;
813                }
814            } else {
815                for (;;) {
816                    if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
817                        ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
818                            if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
819                                (void)UTEXT_NEXT32(fInputText);
820                                startPos = UTEXT_GETNATIVEINDEX(fInputText);
821                            }
822                            MatchAt(startPos, FALSE, fDeferredStatus);
823                            if (U_FAILURE(fDeferredStatus)) {
824                                return FALSE;
825                            }
826                            if (fMatch) {
827                                return TRUE;
828                            }
829                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
830                    }
831                    if (startPos >= testStartLimit) {
832                        fMatch = FALSE;
833                        fHitEnd = TRUE;
834                        return FALSE;
835                    }
836                    c = UTEXT_NEXT32(fInputText);
837                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
838                    // Note that it's perfectly OK for a pattern to have a zero-length
839                    //   match at the end of a string, so we must make sure that the loop
840                    //   runs with startPos == testStartLimit the last time through.
841		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
842                        return FALSE;
843                }
844            }
845        }
846
847    default:
848        U_ASSERT(FALSE);
849    }
850
851    U_ASSERT(FALSE);
852    return FALSE;
853}
854
855
856
857UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
858    if (U_FAILURE(status)) {
859        return FALSE;
860    }
861    if (U_FAILURE(fDeferredStatus)) {
862        status = fDeferredStatus;
863        return FALSE;
864    }
865    this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
866                                          //        This will reset the region to be the full input length.
867    if (start < 0) {
868        status = U_INDEX_OUTOFBOUNDS_ERROR;
869        return FALSE;
870    }
871
872    int64_t nativeStart = start;
873    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
874        status = U_INDEX_OUTOFBOUNDS_ERROR;
875        return FALSE;
876    }
877    fMatchEnd = nativeStart;
878    return find();
879}
880
881
882//--------------------------------------------------------------------------------
883//
884//   findUsingChunk() -- like find(), but with the advance knowledge that the
885//                       entire string is available in the UText's chunk buffer.
886//
887//--------------------------------------------------------------------------------
888UBool RegexMatcher::findUsingChunk() {
889    // Start at the position of the last match end.  (Will be zero if the
890    //   matcher has been reset.
891    //
892
893    int32_t startPos = (int32_t)fMatchEnd;
894    if (startPos==0) {
895        startPos = (int32_t)fActiveStart;
896    }
897
898    const UChar *inputBuf = fInputText->chunkContents;
899
900    if (fMatch) {
901        // Save the position of any previous successful match.
902        fLastMatchEnd = fMatchEnd;
903
904        if (fMatchStart == fMatchEnd) {
905            // Previous match had zero length.  Move start position up one position
906            //  to avoid sending find() into a loop on zero-length matches.
907            if (startPos >= fActiveLimit) {
908                fMatch = FALSE;
909                fHitEnd = TRUE;
910                return FALSE;
911            }
912            U16_FWD_1(inputBuf, startPos, fInputLength);
913        }
914    } else {
915        if (fLastMatchEnd >= 0) {
916            // A previous find() failed to match.  Don't try again.
917            //   (without this test, a pattern with a zero-length match
918            //    could match again at the end of an input string.)
919            fHitEnd = TRUE;
920            return FALSE;
921        }
922    }
923
924
925    // Compute the position in the input string beyond which a match can not begin, because
926    //   the minimum length match would extend past the end of the input.
927    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
928    //          Be aware of possible overflows if making changes here.
929    int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
930    if (startPos > testLen) {
931        fMatch = FALSE;
932        fHitEnd = TRUE;
933        return FALSE;
934    }
935
936    UChar32  c;
937    U_ASSERT(startPos >= 0);
938
939    switch (fPattern->fStartType) {
940    case START_NO_INFO:
941        // No optimization was found.
942        //  Try a match at each input position.
943        for (;;) {
944            MatchChunkAt(startPos, FALSE, fDeferredStatus);
945            if (U_FAILURE(fDeferredStatus)) {
946                return FALSE;
947            }
948            if (fMatch) {
949                return TRUE;
950            }
951            if (startPos >= testLen) {
952                fHitEnd = TRUE;
953                return FALSE;
954            }
955            U16_FWD_1(inputBuf, startPos, fActiveLimit);
956            // Note that it's perfectly OK for a pattern to have a zero-length
957            //   match at the end of a string, so we must make sure that the loop
958            //   runs with startPos == testLen the last time through.
959            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
960                return FALSE;
961        }
962        U_ASSERT(FALSE);
963
964    case START_START:
965        // Matches are only possible at the start of the input string
966        //   (pattern begins with ^ or \A)
967        if (startPos > fActiveStart) {
968            fMatch = FALSE;
969            return FALSE;
970        }
971        MatchChunkAt(startPos, FALSE, fDeferredStatus);
972        if (U_FAILURE(fDeferredStatus)) {
973            return FALSE;
974        }
975        return fMatch;
976
977
978    case START_SET:
979    {
980        // Match may start on any char from a pre-computed set.
981        U_ASSERT(fPattern->fMinMatchLen > 0);
982        for (;;) {
983            int32_t pos = startPos;
984            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
985            if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
986                (c>=256 && fPattern->fInitialChars->contains(c))) {
987                MatchChunkAt(pos, FALSE, fDeferredStatus);
988                if (U_FAILURE(fDeferredStatus)) {
989                    return FALSE;
990                }
991                if (fMatch) {
992                    return TRUE;
993                }
994            }
995            if (pos >= testLen) {
996                fMatch = FALSE;
997                fHitEnd = TRUE;
998                return FALSE;
999            }
1000            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1001                return FALSE;
1002        }
1003    }
1004        U_ASSERT(FALSE);
1005
1006    case START_STRING:
1007    case START_CHAR:
1008    {
1009        // Match starts on exactly one char.
1010        U_ASSERT(fPattern->fMinMatchLen > 0);
1011        UChar32 theChar = fPattern->fInitialChar;
1012        for (;;) {
1013            int32_t pos = startPos;
1014            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1015            if (c == theChar) {
1016                MatchChunkAt(pos, FALSE, fDeferredStatus);
1017                if (U_FAILURE(fDeferredStatus)) {
1018                    return FALSE;
1019                }
1020                if (fMatch) {
1021                    return TRUE;
1022                }
1023            }
1024            if (pos >= testLen) {
1025                fMatch = FALSE;
1026                fHitEnd = TRUE;
1027                return FALSE;
1028            }
1029            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1030                return FALSE;
1031        }
1032    }
1033        U_ASSERT(FALSE);
1034
1035    case START_LINE:
1036    {
1037        UChar32  c;
1038        if (startPos == fAnchorStart) {
1039            MatchChunkAt(startPos, FALSE, fDeferredStatus);
1040            if (U_FAILURE(fDeferredStatus)) {
1041                return FALSE;
1042            }
1043            if (fMatch) {
1044                return TRUE;
1045            }
1046            U16_FWD_1(inputBuf, startPos, fActiveLimit);
1047        }
1048
1049        if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1050            for (;;) {
1051                c = inputBuf[startPos-1];
1052                if (c == 0x0a) {
1053                    MatchChunkAt(startPos, FALSE, fDeferredStatus);
1054                    if (U_FAILURE(fDeferredStatus)) {
1055                        return FALSE;
1056                    }
1057                    if (fMatch) {
1058                        return TRUE;
1059                    }
1060                }
1061                if (startPos >= testLen) {
1062                    fMatch = FALSE;
1063                    fHitEnd = TRUE;
1064                    return FALSE;
1065                }
1066                U16_FWD_1(inputBuf, startPos, fActiveLimit);
1067                // Note that it's perfectly OK for a pattern to have a zero-length
1068                //   match at the end of a string, so we must make sure that the loop
1069                //   runs with startPos == testLen the last time through.
1070	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1071                    return FALSE;
1072            }
1073        } else {
1074            for (;;) {
1075                c = inputBuf[startPos-1];
1076                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
1077                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1078                    if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1079                        startPos++;
1080                    }
1081                    MatchChunkAt(startPos, FALSE, fDeferredStatus);
1082                    if (U_FAILURE(fDeferredStatus)) {
1083                        return FALSE;
1084                    }
1085                    if (fMatch) {
1086                        return TRUE;
1087                    }
1088                }
1089                if (startPos >= testLen) {
1090                    fMatch = FALSE;
1091                    fHitEnd = TRUE;
1092                    return FALSE;
1093                }
1094                U16_FWD_1(inputBuf, startPos, fActiveLimit);
1095                // Note that it's perfectly OK for a pattern to have a zero-length
1096                //   match at the end of a string, so we must make sure that the loop
1097                //   runs with startPos == testLen the last time through.
1098	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1099                    return FALSE;
1100            }
1101        }
1102    }
1103
1104    default:
1105        U_ASSERT(FALSE);
1106    }
1107
1108    U_ASSERT(FALSE);
1109    return FALSE;
1110}
1111
1112
1113
1114//--------------------------------------------------------------------------------
1115//
1116//  group()
1117//
1118//--------------------------------------------------------------------------------
1119UnicodeString RegexMatcher::group(UErrorCode &status) const {
1120    return group(0, status);
1121}
1122
1123//  Return immutable shallow clone
1124UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1125    return group(0, dest, group_len, status);
1126}
1127
1128//  Return immutable shallow clone
1129UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1130    group_len = 0;
1131    UBool bailOut = FALSE;
1132    if (U_FAILURE(status)) {
1133        return dest;
1134    }
1135    if (U_FAILURE(fDeferredStatus)) {
1136        status = fDeferredStatus;
1137        bailOut = TRUE;
1138    }
1139    if (fMatch == FALSE) {
1140        status = U_REGEX_INVALID_STATE;
1141        bailOut = TRUE;
1142    }
1143    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1144        status = U_INDEX_OUTOFBOUNDS_ERROR;
1145        bailOut = TRUE;
1146    }
1147
1148    if (bailOut) {
1149        return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
1150    }
1151
1152    int64_t s, e;
1153    if (groupNum == 0) {
1154        s = fMatchStart;
1155        e = fMatchEnd;
1156    } else {
1157        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1158        U_ASSERT(groupOffset < fPattern->fFrameSize);
1159        U_ASSERT(groupOffset >= 0);
1160        s = fFrame->fExtra[groupOffset];
1161        e = fFrame->fExtra[groupOffset+1];
1162    }
1163
1164    if (s < 0) {
1165        // A capture group wasn't part of the match
1166        return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1167    }
1168    U_ASSERT(s <= e);
1169    group_len = e - s;
1170
1171    dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1172    if (dest)
1173        UTEXT_SETNATIVEINDEX(dest, s);
1174    return dest;
1175}
1176
1177UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1178    UnicodeString result;
1179    if (U_FAILURE(status)) {
1180        return result;
1181    }
1182    UText resultText = UTEXT_INITIALIZER;
1183    utext_openUnicodeString(&resultText, &result, &status);
1184    group(groupNum, &resultText, status);
1185    utext_close(&resultText);
1186    return result;
1187}
1188
1189
1190//  Return deep (mutable) clone
1191//		Technology Preview (as an API), but note that the UnicodeString API is implemented
1192//		using this function.
1193UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1194    UBool bailOut = FALSE;
1195    if (U_FAILURE(status)) {
1196        return dest;
1197    }
1198    if (U_FAILURE(fDeferredStatus)) {
1199        status = fDeferredStatus;
1200        bailOut = TRUE;
1201    }
1202
1203    if (fMatch == FALSE) {
1204        status = U_REGEX_INVALID_STATE;
1205        bailOut = TRUE;
1206    }
1207    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1208        status = U_INDEX_OUTOFBOUNDS_ERROR;
1209        bailOut = TRUE;
1210    }
1211
1212    if (bailOut) {
1213        if (dest) {
1214            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1215            return dest;
1216        } else {
1217            return utext_openUChars(NULL, NULL, 0, &status);
1218        }
1219    }
1220
1221    int64_t s, e;
1222    if (groupNum == 0) {
1223        s = fMatchStart;
1224        e = fMatchEnd;
1225    } else {
1226        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1227        U_ASSERT(groupOffset < fPattern->fFrameSize);
1228        U_ASSERT(groupOffset >= 0);
1229        s = fFrame->fExtra[groupOffset];
1230        e = fFrame->fExtra[groupOffset+1];
1231    }
1232
1233    if (s < 0) {
1234        // A capture group wasn't part of the match
1235        if (dest) {
1236            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1237            return dest;
1238        } else {
1239            return utext_openUChars(NULL, NULL, 0, &status);
1240        }
1241    }
1242    U_ASSERT(s <= e);
1243
1244    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1245        U_ASSERT(e <= fInputLength);
1246        if (dest) {
1247            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1248        } else {
1249            UText groupText = UTEXT_INITIALIZER;
1250            utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1251            dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1252            utext_close(&groupText);
1253        }
1254    } else {
1255        int32_t len16;
1256        if (UTEXT_USES_U16(fInputText)) {
1257            len16 = (int32_t)(e-s);
1258        } else {
1259            UErrorCode lengthStatus = U_ZERO_ERROR;
1260            len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1261        }
1262        UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1263        if (groupChars == NULL) {
1264            status = U_MEMORY_ALLOCATION_ERROR;
1265            return dest;
1266        }
1267        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1268
1269        if (dest) {
1270            utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1271        } else {
1272            UText groupText = UTEXT_INITIALIZER;
1273            utext_openUChars(&groupText, groupChars, len16, &status);
1274            dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1275            utext_close(&groupText);
1276        }
1277
1278        uprv_free(groupChars);
1279    }
1280    return dest;
1281}
1282
1283//--------------------------------------------------------------------------------
1284//
1285//  appendGroup() -- currently internal only, appends a group to a UText rather
1286//                   than replacing its contents
1287//
1288//--------------------------------------------------------------------------------
1289
1290int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1291    if (U_FAILURE(status)) {
1292        return 0;
1293    }
1294    if (U_FAILURE(fDeferredStatus)) {
1295        status = fDeferredStatus;
1296        return 0;
1297    }
1298    int64_t destLen = utext_nativeLength(dest);
1299
1300    if (fMatch == FALSE) {
1301        status = U_REGEX_INVALID_STATE;
1302        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1303    }
1304    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1305        status = U_INDEX_OUTOFBOUNDS_ERROR;
1306        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1307    }
1308
1309    int64_t s, e;
1310    if (groupNum == 0) {
1311        s = fMatchStart;
1312        e = fMatchEnd;
1313    } else {
1314        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1315        U_ASSERT(groupOffset < fPattern->fFrameSize);
1316        U_ASSERT(groupOffset >= 0);
1317        s = fFrame->fExtra[groupOffset];
1318        e = fFrame->fExtra[groupOffset+1];
1319    }
1320
1321    if (s < 0) {
1322        // A capture group wasn't part of the match
1323        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1324    }
1325    U_ASSERT(s <= e);
1326
1327    int64_t deltaLen;
1328    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1329        U_ASSERT(e <= fInputLength);
1330        deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1331    } else {
1332        int32_t len16;
1333        if (UTEXT_USES_U16(fInputText)) {
1334            len16 = (int32_t)(e-s);
1335        } else {
1336            UErrorCode lengthStatus = U_ZERO_ERROR;
1337            len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1338        }
1339        UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1340        if (groupChars == NULL) {
1341            status = U_MEMORY_ALLOCATION_ERROR;
1342            return 0;
1343        }
1344        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1345
1346        deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1347        uprv_free(groupChars);
1348    }
1349    return deltaLen;
1350}
1351
1352
1353
1354//--------------------------------------------------------------------------------
1355//
1356//  groupCount()
1357//
1358//--------------------------------------------------------------------------------
1359int32_t RegexMatcher::groupCount() const {
1360    return fPattern->fGroupMap->size();
1361}
1362
1363
1364
1365//--------------------------------------------------------------------------------
1366//
1367//  hasAnchoringBounds()
1368//
1369//--------------------------------------------------------------------------------
1370UBool RegexMatcher::hasAnchoringBounds() const {
1371    return fAnchoringBounds;
1372}
1373
1374
1375//--------------------------------------------------------------------------------
1376//
1377//  hasTransparentBounds()
1378//
1379//--------------------------------------------------------------------------------
1380UBool RegexMatcher::hasTransparentBounds() const {
1381    return fTransparentBounds;
1382}
1383
1384
1385
1386//--------------------------------------------------------------------------------
1387//
1388//  hitEnd()
1389//
1390//--------------------------------------------------------------------------------
1391UBool RegexMatcher::hitEnd() const {
1392    return fHitEnd;
1393}
1394
1395
1396//--------------------------------------------------------------------------------
1397//
1398//  input()
1399//
1400//--------------------------------------------------------------------------------
1401const UnicodeString &RegexMatcher::input() const {
1402    if (!fInput) {
1403        UErrorCode status = U_ZERO_ERROR;
1404        int32_t len16;
1405        if (UTEXT_USES_U16(fInputText)) {
1406            len16 = (int32_t)fInputLength;
1407        } else {
1408            len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1409            status = U_ZERO_ERROR; // overflow, length status
1410        }
1411        UnicodeString *result = new UnicodeString(len16, 0, 0);
1412
1413        UChar *inputChars = result->getBuffer(len16);
1414        utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1415        result->releaseBuffer(len16);
1416
1417        (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1418    }
1419
1420    return *fInput;
1421}
1422
1423//--------------------------------------------------------------------------------
1424//
1425//  inputText()
1426//
1427//--------------------------------------------------------------------------------
1428UText *RegexMatcher::inputText() const {
1429    return fInputText;
1430}
1431
1432
1433//--------------------------------------------------------------------------------
1434//
1435//  getInput() -- like inputText(), but makes a clone or copies into another UText
1436//
1437//--------------------------------------------------------------------------------
1438UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1439    UBool bailOut = FALSE;
1440    if (U_FAILURE(status)) {
1441        return dest;
1442    }
1443    if (U_FAILURE(fDeferredStatus)) {
1444        status = fDeferredStatus;
1445        bailOut = TRUE;
1446    }
1447
1448    if (bailOut) {
1449        if (dest) {
1450            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1451            return dest;
1452        } else {
1453            return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1454        }
1455    }
1456
1457    if (dest) {
1458        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1459            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1460        } else {
1461            int32_t input16Len;
1462            if (UTEXT_USES_U16(fInputText)) {
1463                input16Len = (int32_t)fInputLength;
1464            } else {
1465                UErrorCode lengthStatus = U_ZERO_ERROR;
1466                input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1467            }
1468            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1469            if (inputChars == NULL) {
1470                return dest;
1471            }
1472
1473            status = U_ZERO_ERROR;
1474            utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1475            status = U_ZERO_ERROR;
1476            utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1477
1478            uprv_free(inputChars);
1479        }
1480        return dest;
1481    } else {
1482        return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1483    }
1484}
1485
1486
1487static UBool compat_SyncMutableUTextContents(UText *ut);
1488static UBool compat_SyncMutableUTextContents(UText *ut) {
1489    UBool retVal = FALSE;
1490
1491    //  In the following test, we're really only interested in whether the UText should switch
1492    //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1493    //  will still point to the correct data.
1494    if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1495        UnicodeString *us=(UnicodeString *)ut->context;
1496
1497        // Update to the latest length.
1498        // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1499        int32_t newLength = us->length();
1500
1501        // Update the chunk description.
1502        // The buffer may have switched between stack- and heap-based.
1503        ut->chunkContents    = us->getBuffer();
1504        ut->chunkLength      = newLength;
1505        ut->chunkNativeLimit = newLength;
1506        ut->nativeIndexingLimit = newLength;
1507        retVal = TRUE;
1508    }
1509
1510    return retVal;
1511}
1512
1513//--------------------------------------------------------------------------------
1514//
1515//  lookingAt()
1516//
1517//--------------------------------------------------------------------------------
1518UBool RegexMatcher::lookingAt(UErrorCode &status) {
1519    if (U_FAILURE(status)) {
1520        return FALSE;
1521    }
1522    if (U_FAILURE(fDeferredStatus)) {
1523        status = fDeferredStatus;
1524        return FALSE;
1525    }
1526
1527    if (fInputUniStrMaybeMutable) {
1528        if (compat_SyncMutableUTextContents(fInputText)) {
1529        fInputLength = utext_nativeLength(fInputText);
1530        reset();
1531        }
1532    }
1533    else {
1534        resetPreserveRegion();
1535    }
1536    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1537        MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1538    } else {
1539        MatchAt(fActiveStart, FALSE, status);
1540    }
1541    return fMatch;
1542}
1543
1544
1545UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1546    if (U_FAILURE(status)) {
1547        return FALSE;
1548    }
1549    if (U_FAILURE(fDeferredStatus)) {
1550        status = fDeferredStatus;
1551        return FALSE;
1552    }
1553    reset();
1554
1555    if (start < 0) {
1556        status = U_INDEX_OUTOFBOUNDS_ERROR;
1557        return FALSE;
1558    }
1559
1560    if (fInputUniStrMaybeMutable) {
1561        if (compat_SyncMutableUTextContents(fInputText)) {
1562        fInputLength = utext_nativeLength(fInputText);
1563        reset();
1564        }
1565    }
1566
1567    int64_t nativeStart;
1568    nativeStart = start;
1569    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1570        status = U_INDEX_OUTOFBOUNDS_ERROR;
1571        return FALSE;
1572    }
1573
1574    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1575        MatchChunkAt((int32_t)nativeStart, FALSE, status);
1576    } else {
1577        MatchAt(nativeStart, FALSE, status);
1578    }
1579    return fMatch;
1580}
1581
1582
1583
1584//--------------------------------------------------------------------------------
1585//
1586//  matches()
1587//
1588//--------------------------------------------------------------------------------
1589UBool RegexMatcher::matches(UErrorCode &status) {
1590    if (U_FAILURE(status)) {
1591        return FALSE;
1592    }
1593    if (U_FAILURE(fDeferredStatus)) {
1594        status = fDeferredStatus;
1595        return FALSE;
1596    }
1597
1598    if (fInputUniStrMaybeMutable) {
1599        if (compat_SyncMutableUTextContents(fInputText)) {
1600        fInputLength = utext_nativeLength(fInputText);
1601        reset();
1602        }
1603    }
1604    else {
1605        resetPreserveRegion();
1606    }
1607
1608    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1609        MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1610    } else {
1611        MatchAt(fActiveStart, TRUE, status);
1612    }
1613    return fMatch;
1614}
1615
1616
1617UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1618    if (U_FAILURE(status)) {
1619        return FALSE;
1620    }
1621    if (U_FAILURE(fDeferredStatus)) {
1622        status = fDeferredStatus;
1623        return FALSE;
1624    }
1625    reset();
1626
1627    if (start < 0) {
1628        status = U_INDEX_OUTOFBOUNDS_ERROR;
1629        return FALSE;
1630    }
1631
1632    if (fInputUniStrMaybeMutable) {
1633        if (compat_SyncMutableUTextContents(fInputText)) {
1634        fInputLength = utext_nativeLength(fInputText);
1635        reset();
1636        }
1637    }
1638
1639    int64_t nativeStart;
1640    nativeStart = start;
1641    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1642        status = U_INDEX_OUTOFBOUNDS_ERROR;
1643        return FALSE;
1644    }
1645
1646    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1647        MatchChunkAt((int32_t)nativeStart, TRUE, status);
1648    } else {
1649        MatchAt(nativeStart, TRUE, status);
1650    }
1651    return fMatch;
1652}
1653
1654
1655
1656//--------------------------------------------------------------------------------
1657//
1658//    pattern
1659//
1660//--------------------------------------------------------------------------------
1661const RegexPattern &RegexMatcher::pattern() const {
1662    return *fPattern;
1663}
1664
1665
1666
1667//--------------------------------------------------------------------------------
1668//
1669//    region
1670//
1671//--------------------------------------------------------------------------------
1672RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1673    if (U_FAILURE(status)) {
1674        return *this;
1675    }
1676
1677    if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1678        status = U_ILLEGAL_ARGUMENT_ERROR;
1679    }
1680
1681    int64_t nativeStart = regionStart;
1682    int64_t nativeLimit = regionLimit;
1683    if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1684      status = U_ILLEGAL_ARGUMENT_ERROR;
1685    }
1686
1687    if (startIndex == -1)
1688      this->reset();
1689    else
1690      resetPreserveRegion();
1691
1692    fRegionStart = nativeStart;
1693    fRegionLimit = nativeLimit;
1694    fActiveStart = nativeStart;
1695    fActiveLimit = nativeLimit;
1696
1697    if (startIndex != -1) {
1698      if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1699          status = U_INDEX_OUTOFBOUNDS_ERROR;
1700      }
1701      fMatchEnd = startIndex;
1702    }
1703
1704    if (!fTransparentBounds) {
1705        fLookStart = nativeStart;
1706        fLookLimit = nativeLimit;
1707    }
1708    if (fAnchoringBounds) {
1709        fAnchorStart = nativeStart;
1710        fAnchorLimit = nativeLimit;
1711    }
1712    return *this;
1713}
1714
1715RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1716  return region(start, limit, -1, status);
1717}
1718
1719//--------------------------------------------------------------------------------
1720//
1721//    regionEnd
1722//
1723//--------------------------------------------------------------------------------
1724int32_t RegexMatcher::regionEnd() const {
1725    return (int32_t)fRegionLimit;
1726}
1727
1728int64_t RegexMatcher::regionEnd64() const {
1729    return fRegionLimit;
1730}
1731
1732//--------------------------------------------------------------------------------
1733//
1734//    regionStart
1735//
1736//--------------------------------------------------------------------------------
1737int32_t RegexMatcher::regionStart() const {
1738    return (int32_t)fRegionStart;
1739}
1740
1741int64_t RegexMatcher::regionStart64() const {
1742    return fRegionStart;
1743}
1744
1745
1746//--------------------------------------------------------------------------------
1747//
1748//    replaceAll
1749//
1750//--------------------------------------------------------------------------------
1751UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1752    UText replacementText = UTEXT_INITIALIZER;
1753    UText resultText = UTEXT_INITIALIZER;
1754    UnicodeString resultString;
1755    if (U_FAILURE(status)) {
1756        return resultString;
1757    }
1758
1759    utext_openConstUnicodeString(&replacementText, &replacement, &status);
1760    utext_openUnicodeString(&resultText, &resultString, &status);
1761
1762    replaceAll(&replacementText, &resultText, status);
1763
1764    utext_close(&resultText);
1765    utext_close(&replacementText);
1766
1767    return resultString;
1768}
1769
1770
1771//
1772//    replaceAll, UText mode
1773//
1774UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1775    if (U_FAILURE(status)) {
1776        return dest;
1777    }
1778    if (U_FAILURE(fDeferredStatus)) {
1779        status = fDeferredStatus;
1780        return dest;
1781    }
1782
1783    if (dest == NULL) {
1784        UnicodeString emptyString;
1785        UText empty = UTEXT_INITIALIZER;
1786
1787        utext_openUnicodeString(&empty, &emptyString, &status);
1788        dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1789        utext_close(&empty);
1790    }
1791
1792    if (U_SUCCESS(status)) {
1793        reset();
1794        while (find()) {
1795            appendReplacement(dest, replacement, status);
1796            if (U_FAILURE(status)) {
1797                break;
1798            }
1799        }
1800        appendTail(dest, status);
1801    }
1802
1803    return dest;
1804}
1805
1806
1807//--------------------------------------------------------------------------------
1808//
1809//    replaceFirst
1810//
1811//--------------------------------------------------------------------------------
1812UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1813    UText replacementText = UTEXT_INITIALIZER;
1814    UText resultText = UTEXT_INITIALIZER;
1815    UnicodeString resultString;
1816
1817    utext_openConstUnicodeString(&replacementText, &replacement, &status);
1818    utext_openUnicodeString(&resultText, &resultString, &status);
1819
1820    replaceFirst(&replacementText, &resultText, status);
1821
1822    utext_close(&resultText);
1823    utext_close(&replacementText);
1824
1825    return resultString;
1826}
1827
1828//
1829//    replaceFirst, UText mode
1830//
1831UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1832    if (U_FAILURE(status)) {
1833        return dest;
1834    }
1835    if (U_FAILURE(fDeferredStatus)) {
1836        status = fDeferredStatus;
1837        return dest;
1838    }
1839
1840    reset();
1841    if (!find()) {
1842        return getInput(dest, status);
1843    }
1844
1845    if (dest == NULL) {
1846        UnicodeString emptyString;
1847        UText empty = UTEXT_INITIALIZER;
1848
1849        utext_openUnicodeString(&empty, &emptyString, &status);
1850        dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1851        utext_close(&empty);
1852    }
1853
1854    appendReplacement(dest, replacement, status);
1855    appendTail(dest, status);
1856
1857    return dest;
1858}
1859
1860
1861//--------------------------------------------------------------------------------
1862//
1863//     requireEnd
1864//
1865//--------------------------------------------------------------------------------
1866UBool RegexMatcher::requireEnd() const {
1867    return fRequireEnd;
1868}
1869
1870
1871//--------------------------------------------------------------------------------
1872//
1873//     reset
1874//
1875//--------------------------------------------------------------------------------
1876RegexMatcher &RegexMatcher::reset() {
1877    fRegionStart    = 0;
1878    fRegionLimit    = fInputLength;
1879    fActiveStart    = 0;
1880    fActiveLimit    = fInputLength;
1881    fAnchorStart    = 0;
1882    fAnchorLimit    = fInputLength;
1883    fLookStart      = 0;
1884    fLookLimit      = fInputLength;
1885    resetPreserveRegion();
1886    return *this;
1887}
1888
1889
1890
1891void RegexMatcher::resetPreserveRegion() {
1892    fMatchStart     = 0;
1893    fMatchEnd       = 0;
1894    fLastMatchEnd   = -1;
1895    fAppendPosition = 0;
1896    fMatch          = FALSE;
1897    fHitEnd         = FALSE;
1898    fRequireEnd     = FALSE;
1899    fTime           = 0;
1900    fTickCounter    = TIMER_INITIAL_VALUE;
1901    //resetStack(); // more expensive than it looks...
1902}
1903
1904
1905RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1906    fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1907    if (fPattern->fNeedsAltInput) {
1908        fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1909    }
1910    fInputLength = utext_nativeLength(fInputText);
1911
1912    reset();
1913    delete fInput;
1914    fInput = NULL;
1915
1916    //  Do the following for any UnicodeString.
1917    //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1918    fInputUniStrMaybeMutable = TRUE;
1919
1920    if (fWordBreakItr != NULL) {
1921#if UCONFIG_NO_BREAK_ITERATION==0
1922        UErrorCode status = U_ZERO_ERROR;
1923        fWordBreakItr->setText(fInputText, status);
1924#endif
1925    }
1926    return *this;
1927}
1928
1929
1930RegexMatcher &RegexMatcher::reset(UText *input) {
1931    if (fInputText != input) {
1932        fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1933        if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1934        fInputLength = utext_nativeLength(fInputText);
1935
1936        delete fInput;
1937        fInput = NULL;
1938
1939        if (fWordBreakItr != NULL) {
1940#if UCONFIG_NO_BREAK_ITERATION==0
1941            UErrorCode status = U_ZERO_ERROR;
1942            fWordBreakItr->setText(input, status);
1943#endif
1944        }
1945    }
1946    reset();
1947    fInputUniStrMaybeMutable = FALSE;
1948
1949    return *this;
1950}
1951
1952/*RegexMatcher &RegexMatcher::reset(const UChar *) {
1953    fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1954    return *this;
1955}*/
1956
1957RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1958    if (U_FAILURE(status)) {
1959        return *this;
1960    }
1961    reset();       // Reset also resets the region to be the entire string.
1962
1963    if (position < 0 || position > fActiveLimit) {
1964        status = U_INDEX_OUTOFBOUNDS_ERROR;
1965        return *this;
1966    }
1967    fMatchEnd = position;
1968    return *this;
1969}
1970
1971
1972//--------------------------------------------------------------------------------
1973//
1974//    refresh
1975//
1976//--------------------------------------------------------------------------------
1977RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1978    if (U_FAILURE(status)) {
1979        return *this;
1980    }
1981    if (input == NULL) {
1982        status = U_ILLEGAL_ARGUMENT_ERROR;
1983        return *this;
1984    }
1985    if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1986        status = U_ILLEGAL_ARGUMENT_ERROR;
1987        return *this;
1988    }
1989    int64_t  pos = utext_getNativeIndex(fInputText);
1990    //  Shallow read-only clone of the new UText into the existing input UText
1991    fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1992    if (U_FAILURE(status)) {
1993        return *this;
1994    }
1995    utext_setNativeIndex(fInputText, pos);
1996
1997    if (fAltInputText != NULL) {
1998        pos = utext_getNativeIndex(fAltInputText);
1999        fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
2000        if (U_FAILURE(status)) {
2001            return *this;
2002        }
2003        utext_setNativeIndex(fAltInputText, pos);
2004    }
2005    return *this;
2006}
2007
2008
2009
2010//--------------------------------------------------------------------------------
2011//
2012//    setTrace
2013//
2014//--------------------------------------------------------------------------------
2015void RegexMatcher::setTrace(UBool state) {
2016    fTraceDebug = state;
2017}
2018
2019
2020
2021//---------------------------------------------------------------------
2022//
2023//   split
2024//
2025//---------------------------------------------------------------------
2026int32_t  RegexMatcher::split(const UnicodeString &input,
2027        UnicodeString    dest[],
2028        int32_t          destCapacity,
2029        UErrorCode      &status)
2030{
2031    UText inputText = UTEXT_INITIALIZER;
2032    utext_openConstUnicodeString(&inputText, &input, &status);
2033    if (U_FAILURE(status)) {
2034        return 0;
2035    }
2036
2037    UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2038    if (destText == NULL) {
2039        status = U_MEMORY_ALLOCATION_ERROR;
2040        return 0;
2041    }
2042    int32_t i;
2043    for (i = 0; i < destCapacity; i++) {
2044        destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2045    }
2046
2047    int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2048
2049    for (i = 0; i < destCapacity; i++) {
2050        utext_close(destText[i]);
2051    }
2052
2053    uprv_free(destText);
2054    utext_close(&inputText);
2055    return fieldCount;
2056}
2057
2058//
2059//   split, UText mode
2060//
2061int32_t  RegexMatcher::split(UText *input,
2062        UText           *dest[],
2063        int32_t          destCapacity,
2064        UErrorCode      &status)
2065{
2066    //
2067    // Check arguements for validity
2068    //
2069    if (U_FAILURE(status)) {
2070        return 0;
2071    };
2072
2073    if (destCapacity < 1) {
2074        status = U_ILLEGAL_ARGUMENT_ERROR;
2075        return 0;
2076    }
2077
2078    //
2079    // Reset for the input text
2080    //
2081    reset(input);
2082    int64_t   nextOutputStringStart = 0;
2083    if (fActiveLimit == 0) {
2084        return 0;
2085    }
2086
2087    //
2088    // Loop through the input text, searching for the delimiter pattern
2089    //
2090    int32_t i;
2091    int32_t numCaptureGroups = fPattern->fGroupMap->size();
2092    for (i=0; ; i++) {
2093        if (i>=destCapacity-1) {
2094            // There is one or zero output string left.
2095            // Fill the last output string with whatever is left from the input, then exit the loop.
2096            //  ( i will be == destCapacity if we filled the output array while processing
2097            //    capture groups of the delimiter expression, in which case we will discard the
2098            //    last capture group saved in favor of the unprocessed remainder of the
2099            //    input string.)
2100            i = destCapacity-1;
2101            if (fActiveLimit > nextOutputStringStart) {
2102                if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2103                    if (dest[i]) {
2104                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2105                                      input->chunkContents+nextOutputStringStart,
2106                                      (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2107                    } else {
2108                        UText remainingText = UTEXT_INITIALIZER;
2109                        utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2110                                         fActiveLimit-nextOutputStringStart, &status);
2111                        dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2112                        utext_close(&remainingText);
2113                    }
2114                } else {
2115                    UErrorCode lengthStatus = U_ZERO_ERROR;
2116                    int32_t remaining16Length =
2117                        utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2118                    UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2119                    if (remainingChars == NULL) {
2120                        status = U_MEMORY_ALLOCATION_ERROR;
2121                        break;
2122                    }
2123
2124                    utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2125                    if (dest[i]) {
2126                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2127                    } else {
2128                        UText remainingText = UTEXT_INITIALIZER;
2129                        utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2130                        dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2131                        utext_close(&remainingText);
2132                    }
2133
2134                    uprv_free(remainingChars);
2135                }
2136            }
2137            break;
2138        }
2139        if (find()) {
2140            // We found another delimiter.  Move everything from where we started looking
2141            //  up until the start of the delimiter into the next output string.
2142            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2143                if (dest[i]) {
2144                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2145                                  input->chunkContents+nextOutputStringStart,
2146                                  (int32_t)(fMatchStart-nextOutputStringStart), &status);
2147                } else {
2148                    UText remainingText = UTEXT_INITIALIZER;
2149                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2150                                      fMatchStart-nextOutputStringStart, &status);
2151                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2152                    utext_close(&remainingText);
2153                }
2154            } else {
2155                UErrorCode lengthStatus = U_ZERO_ERROR;
2156                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2157                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2158                if (remainingChars == NULL) {
2159                    status = U_MEMORY_ALLOCATION_ERROR;
2160                    break;
2161                }
2162                utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2163                if (dest[i]) {
2164                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2165                } else {
2166                    UText remainingText = UTEXT_INITIALIZER;
2167                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2168                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2169                    utext_close(&remainingText);
2170                }
2171
2172                uprv_free(remainingChars);
2173            }
2174            nextOutputStringStart = fMatchEnd;
2175
2176            // If the delimiter pattern has capturing parentheses, the captured
2177            //  text goes out into the next n destination strings.
2178            int32_t groupNum;
2179            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2180                if (i >= destCapacity-2) {
2181                    // Never fill the last available output string with capture group text.
2182                    // It will filled with the last field, the remainder of the
2183                    //  unsplit input text.
2184                    break;
2185                }
2186                i++;
2187                dest[i] = group(groupNum, dest[i], status);
2188            }
2189
2190            if (nextOutputStringStart == fActiveLimit) {
2191                // The delimiter was at the end of the string.  We're done, but first
2192                // we output one last empty string, for the empty field following
2193                //   the delimiter at the end of input.
2194                if (i+1 < destCapacity) {
2195                    ++i;
2196                    if (dest[i] == NULL) {
2197                        dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2198                    } else {
2199                        static UChar emptyString[] = {(UChar)0};
2200                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2201                    }
2202                }
2203                break;
2204
2205            }
2206        }
2207        else
2208        {
2209            // We ran off the end of the input while looking for the next delimiter.
2210            // All the remaining text goes into the current output string.
2211            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2212                if (dest[i]) {
2213                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2214                                  input->chunkContents+nextOutputStringStart,
2215                                  (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2216                } else {
2217                    UText remainingText = UTEXT_INITIALIZER;
2218                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2219                                     fActiveLimit-nextOutputStringStart, &status);
2220                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2221                    utext_close(&remainingText);
2222                }
2223            } else {
2224                UErrorCode lengthStatus = U_ZERO_ERROR;
2225                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2226                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2227                if (remainingChars == NULL) {
2228                    status = U_MEMORY_ALLOCATION_ERROR;
2229                    break;
2230                }
2231
2232                utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2233                if (dest[i]) {
2234                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2235                } else {
2236                    UText remainingText = UTEXT_INITIALIZER;
2237                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2238                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2239                    utext_close(&remainingText);
2240                }
2241
2242                uprv_free(remainingChars);
2243            }
2244            break;
2245        }
2246        if (U_FAILURE(status)) {
2247            break;
2248        }
2249    }   // end of for loop
2250    return i+1;
2251}
2252
2253
2254//--------------------------------------------------------------------------------
2255//
2256//     start
2257//
2258//--------------------------------------------------------------------------------
2259int32_t RegexMatcher::start(UErrorCode &status) const {
2260    return start(0, status);
2261}
2262
2263int64_t RegexMatcher::start64(UErrorCode &status) const {
2264    return start64(0, status);
2265}
2266
2267//--------------------------------------------------------------------------------
2268//
2269//     start(int32_t group, UErrorCode &status)
2270//
2271//--------------------------------------------------------------------------------
2272
2273int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2274    if (U_FAILURE(status)) {
2275        return -1;
2276    }
2277    if (U_FAILURE(fDeferredStatus)) {
2278        status = fDeferredStatus;
2279        return -1;
2280    }
2281    if (fMatch == FALSE) {
2282        status = U_REGEX_INVALID_STATE;
2283        return -1;
2284    }
2285    if (group < 0 || group > fPattern->fGroupMap->size()) {
2286        status = U_INDEX_OUTOFBOUNDS_ERROR;
2287        return -1;
2288    }
2289    int64_t s;
2290    if (group == 0) {
2291        s = fMatchStart;
2292    } else {
2293        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2294        U_ASSERT(groupOffset < fPattern->fFrameSize);
2295        U_ASSERT(groupOffset >= 0);
2296        s = fFrame->fExtra[groupOffset];
2297    }
2298
2299    return s;
2300}
2301
2302
2303int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2304    return (int32_t)start64(group, status);
2305}
2306
2307//--------------------------------------------------------------------------------
2308//
2309//     useAnchoringBounds
2310//
2311//--------------------------------------------------------------------------------
2312RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2313    fAnchoringBounds = b;
2314    fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2315    fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2316    return *this;
2317}
2318
2319
2320//--------------------------------------------------------------------------------
2321//
2322//     useTransparentBounds
2323//
2324//--------------------------------------------------------------------------------
2325RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2326    fTransparentBounds = b;
2327    fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2328    fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2329    return *this;
2330}
2331
2332//--------------------------------------------------------------------------------
2333//
2334//     setTimeLimit
2335//
2336//--------------------------------------------------------------------------------
2337void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2338    if (U_FAILURE(status)) {
2339        return;
2340    }
2341    if (U_FAILURE(fDeferredStatus)) {
2342        status = fDeferredStatus;
2343        return;
2344    }
2345    if (limit < 0) {
2346        status = U_ILLEGAL_ARGUMENT_ERROR;
2347        return;
2348    }
2349    fTimeLimit = limit;
2350}
2351
2352
2353//--------------------------------------------------------------------------------
2354//
2355//     getTimeLimit
2356//
2357//--------------------------------------------------------------------------------
2358int32_t RegexMatcher::getTimeLimit() const {
2359    return fTimeLimit;
2360}
2361
2362
2363//--------------------------------------------------------------------------------
2364//
2365//     setStackLimit
2366//
2367//--------------------------------------------------------------------------------
2368void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2369    if (U_FAILURE(status)) {
2370        return;
2371    }
2372    if (U_FAILURE(fDeferredStatus)) {
2373        status = fDeferredStatus;
2374        return;
2375    }
2376    if (limit < 0) {
2377        status = U_ILLEGAL_ARGUMENT_ERROR;
2378        return;
2379    }
2380
2381    // Reset the matcher.  This is needed here in case there is a current match
2382    //    whose final stack frame (containing the match results, pointed to by fFrame)
2383    //    would be lost by resizing to a smaller stack size.
2384    reset();
2385
2386    if (limit == 0) {
2387        // Unlimited stack expansion
2388        fStack->setMaxCapacity(0);
2389    } else {
2390        // Change the units of the limit  from bytes to ints, and bump the size up
2391        //   to be big enough to hold at least one stack frame for the pattern,
2392        //   if it isn't there already.
2393        int32_t adjustedLimit = limit / sizeof(int32_t);
2394        if (adjustedLimit < fPattern->fFrameSize) {
2395            adjustedLimit = fPattern->fFrameSize;
2396        }
2397        fStack->setMaxCapacity(adjustedLimit);
2398    }
2399    fStackLimit = limit;
2400}
2401
2402
2403//--------------------------------------------------------------------------------
2404//
2405//     getStackLimit
2406//
2407//--------------------------------------------------------------------------------
2408int32_t RegexMatcher::getStackLimit() const {
2409    return fStackLimit;
2410}
2411
2412
2413//--------------------------------------------------------------------------------
2414//
2415//     setMatchCallback
2416//
2417//--------------------------------------------------------------------------------
2418void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2419                                    const void              *context,
2420                                    UErrorCode              &status) {
2421    if (U_FAILURE(status)) {
2422        return;
2423    }
2424    fCallbackFn = callback;
2425    fCallbackContext = context;
2426}
2427
2428
2429//--------------------------------------------------------------------------------
2430//
2431//     getMatchCallback
2432//
2433//--------------------------------------------------------------------------------
2434void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2435                                  const void              *&context,
2436                                  UErrorCode              &status) {
2437    if (U_FAILURE(status)) {
2438       return;
2439    }
2440    callback = fCallbackFn;
2441    context  = fCallbackContext;
2442}
2443
2444
2445//--------------------------------------------------------------------------------
2446//
2447//     setMatchCallback
2448//
2449//--------------------------------------------------------------------------------
2450void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2451                                                const void                      *context,
2452                                                UErrorCode                      &status) {
2453    if (U_FAILURE(status)) {
2454        return;
2455    }
2456    fFindProgressCallbackFn = callback;
2457    fFindProgressCallbackContext = context;
2458}
2459
2460
2461//--------------------------------------------------------------------------------
2462//
2463//     getMatchCallback
2464//
2465//--------------------------------------------------------------------------------
2466void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2467                                                const void                    *&context,
2468                                                UErrorCode                    &status) {
2469    if (U_FAILURE(status)) {
2470       return;
2471    }
2472    callback = fFindProgressCallbackFn;
2473    context  = fFindProgressCallbackContext;
2474}
2475
2476
2477//================================================================================
2478//
2479//    Code following this point in this file is the internal
2480//    Match Engine Implementation.
2481//
2482//================================================================================
2483
2484
2485//--------------------------------------------------------------------------------
2486//
2487//   resetStack
2488//           Discard any previous contents of the state save stack, and initialize a
2489//           new stack frame to all -1.  The -1s are needed for capture group limits,
2490//           where they indicate that a group has not yet matched anything.
2491//--------------------------------------------------------------------------------
2492REStackFrame *RegexMatcher::resetStack() {
2493    // Discard any previous contents of the state save stack, and initialize a
2494    //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2495    //  where they indicate that a group has not yet matched anything.
2496    fStack->removeAllElements();
2497
2498    REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2499    int32_t i;
2500    for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2501        iFrame->fExtra[i] = -1;
2502    }
2503    return iFrame;
2504}
2505
2506
2507
2508//--------------------------------------------------------------------------------
2509//
2510//   isWordBoundary
2511//                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2512//                     For us,
2513//                       If the current char is a combining mark,
2514//                          \b is FALSE.
2515//                       Else Scan backwards to the first non-combining char.
2516//                            We are at a boundary if the this char and the original chars are
2517//                               opposite in membership in \w set
2518//
2519//          parameters:   pos   - the current position in the input buffer
2520//
2521//              TODO:  double-check edge cases at region boundaries.
2522//
2523//--------------------------------------------------------------------------------
2524UBool RegexMatcher::isWordBoundary(int64_t pos) {
2525    UBool isBoundary = FALSE;
2526    UBool cIsWord    = FALSE;
2527
2528    if (pos >= fLookLimit) {
2529        fHitEnd = TRUE;
2530    } else {
2531        // Determine whether char c at current position is a member of the word set of chars.
2532        // If we're off the end of the string, behave as though we're not at a word char.
2533        UTEXT_SETNATIVEINDEX(fInputText, pos);
2534        UChar32  c = UTEXT_CURRENT32(fInputText);
2535        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2536            // Current char is a combining one.  Not a boundary.
2537            return FALSE;
2538        }
2539        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2540    }
2541
2542    // Back up until we come to a non-combining char, determine whether
2543    //  that char is a word char.
2544    UBool prevCIsWord = FALSE;
2545    for (;;) {
2546        if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2547            break;
2548        }
2549        UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2550        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2551              || u_charType(prevChar) == U_FORMAT_CHAR)) {
2552            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2553            break;
2554        }
2555    }
2556    isBoundary = cIsWord ^ prevCIsWord;
2557    return isBoundary;
2558}
2559
2560UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2561    UBool isBoundary = FALSE;
2562    UBool cIsWord    = FALSE;
2563
2564    const UChar *inputBuf = fInputText->chunkContents;
2565
2566    if (pos >= fLookLimit) {
2567        fHitEnd = TRUE;
2568    } else {
2569        // Determine whether char c at current position is a member of the word set of chars.
2570        // If we're off the end of the string, behave as though we're not at a word char.
2571        UChar32 c;
2572        U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2573        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2574            // Current char is a combining one.  Not a boundary.
2575            return FALSE;
2576        }
2577        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2578    }
2579
2580    // Back up until we come to a non-combining char, determine whether
2581    //  that char is a word char.
2582    UBool prevCIsWord = FALSE;
2583    for (;;) {
2584        if (pos <= fLookStart) {
2585            break;
2586        }
2587        UChar32 prevChar;
2588        U16_PREV(inputBuf, fLookStart, pos, prevChar);
2589        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2590              || u_charType(prevChar) == U_FORMAT_CHAR)) {
2591            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2592            break;
2593        }
2594    }
2595    isBoundary = cIsWord ^ prevCIsWord;
2596    return isBoundary;
2597}
2598
2599//--------------------------------------------------------------------------------
2600//
2601//   isUWordBoundary
2602//
2603//         Test for a word boundary using RBBI word break.
2604//
2605//          parameters:   pos   - the current position in the input buffer
2606//
2607//--------------------------------------------------------------------------------
2608UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2609    UBool       returnVal = FALSE;
2610#if UCONFIG_NO_BREAK_ITERATION==0
2611
2612    // If we haven't yet created a break iterator for this matcher, do it now.
2613    if (fWordBreakItr == NULL) {
2614        fWordBreakItr =
2615            (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2616        if (U_FAILURE(fDeferredStatus)) {
2617            return FALSE;
2618        }
2619        fWordBreakItr->setText(fInputText, fDeferredStatus);
2620    }
2621
2622    if (pos >= fLookLimit) {
2623        fHitEnd = TRUE;
2624        returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2625                            //    words are not boundaries.  All non-word chars stand by themselves,
2626                            //    with word boundaries on both sides.
2627    } else {
2628        if (!UTEXT_USES_U16(fInputText)) {
2629            // !!!: Would like a better way to do this!
2630            UErrorCode status = U_ZERO_ERROR;
2631            pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2632        }
2633        returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2634    }
2635#endif
2636    return   returnVal;
2637}
2638
2639//--------------------------------------------------------------------------------
2640//
2641//   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2642//                     saves. Increment the "time" counter, and call the
2643//                     user callback function if there is one installed.
2644//
2645//                     If the match operation needs to be aborted, either for a time-out
2646//                     or because the user callback asked for it, just set an error status.
2647//                     The engine will pick that up and stop in its outer loop.
2648//
2649//--------------------------------------------------------------------------------
2650void RegexMatcher::IncrementTime(UErrorCode &status) {
2651    fTickCounter = TIMER_INITIAL_VALUE;
2652    fTime++;
2653    if (fCallbackFn != NULL) {
2654        if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2655            status = U_REGEX_STOPPED_BY_CALLER;
2656            return;
2657        }
2658    }
2659    if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2660        status = U_REGEX_TIME_OUT;
2661    }
2662}
2663
2664//--------------------------------------------------------------------------------
2665//
2666//   ReportFindProgress     This function is called once for each advance in the target
2667//                          string from the find() function, and calls the user progress callback
2668//                          function if there is one installed.
2669//
2670//                          NOTE:
2671//
2672//                          If the match operation needs to be aborted because the user
2673//                          callback asked for it, just set an error status.
2674//                          The engine will pick that up and stop in its outer loop.
2675//
2676//--------------------------------------------------------------------------------
2677UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2678    if (fFindProgressCallbackFn != NULL) {
2679        if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2680            status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2681            return FALSE;
2682        }
2683    }
2684    return TRUE;
2685}
2686
2687//--------------------------------------------------------------------------------
2688//
2689//   StateSave
2690//       Make a new stack frame, initialized as a copy of the current stack frame.
2691//       Set the pattern index in the original stack frame from the operand value
2692//       in the opcode.  Execution of the engine continues with the state in
2693//       the newly created stack frame
2694//
2695//       Note that reserveBlock() may grow the stack, resulting in the
2696//       whole thing being relocated in memory.
2697//
2698//    Parameters:
2699//       fp           The top frame pointer when called.  At return, a new
2700//                    fame will be present
2701//       savePatIdx   An index into the compiled pattern.  Goes into the original
2702//                    (not new) frame.  If execution ever back-tracks out of the
2703//                    new frame, this will be where we continue from in the pattern.
2704//    Return
2705//                    The new frame pointer.
2706//
2707//--------------------------------------------------------------------------------
2708inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2709    // push storage for a new frame.
2710    int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2711    if (newFP == NULL) {
2712        // Failure on attempted stack expansion.
2713        //   Stack function set some other error code, change it to a more
2714        //   specific one for regular expressions.
2715        status = U_REGEX_STACK_OVERFLOW;
2716        // We need to return a writable stack frame, so just return the
2717        //    previous frame.  The match operation will stop quickly
2718        //    because of the error status, after which the frame will never
2719        //    be looked at again.
2720        return fp;
2721    }
2722    fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2723
2724    // New stack frame = copy of old top frame.
2725    int64_t *source = (int64_t *)fp;
2726    int64_t *dest   = newFP;
2727    for (;;) {
2728        *dest++ = *source++;
2729        if (source == newFP) {
2730            break;
2731        }
2732    }
2733
2734    fTickCounter--;
2735    if (fTickCounter <= 0) {
2736       IncrementTime(status);    // Re-initializes fTickCounter
2737    }
2738    fp->fPatIdx = savePatIdx;
2739    return (REStackFrame *)newFP;
2740}
2741
2742
2743//--------------------------------------------------------------------------------
2744//
2745//   MatchAt      This is the actual matching engine.
2746//
2747//                  startIdx:    begin matching a this index.
2748//                  toEnd:       if true, match must extend to end of the input region
2749//
2750//--------------------------------------------------------------------------------
2751void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2752    UBool       isMatch  = FALSE;      // True if the we have a match.
2753
2754    int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2755
2756    int32_t     op;                    // Operation from the compiled pattern, split into
2757    int32_t     opType;                //    the opcode
2758    int32_t     opValue;               //    and the operand value.
2759
2760    #ifdef REGEX_RUN_DEBUG
2761    if (fTraceDebug)
2762    {
2763        printf("MatchAt(startIdx=%ld)\n", startIdx);
2764        printf("Original Pattern: ");
2765        UChar32 c = utext_next32From(fPattern->fPattern, 0);
2766        while (c != U_SENTINEL) {
2767            if (c<32 || c>256) {
2768                c = '.';
2769            }
2770            REGEX_DUMP_DEBUG_PRINTF(("%c", c));
2771
2772            c = UTEXT_NEXT32(fPattern->fPattern);
2773        }
2774        printf("\n");
2775        printf("Input String: ");
2776        c = utext_next32From(fInputText, 0);
2777        while (c != U_SENTINEL) {
2778            if (c<32 || c>256) {
2779                c = '.';
2780            }
2781            printf("%c", c);
2782
2783            c = UTEXT_NEXT32(fInputText);
2784        }
2785        printf("\n");
2786        printf("\n");
2787    }
2788    #endif
2789
2790    if (U_FAILURE(status)) {
2791        return;
2792    }
2793
2794    //  Cache frequently referenced items from the compiled pattern
2795    //
2796    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2797
2798    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2799    UVector             *sets          = fPattern->fSets;
2800
2801    fFrameSize = fPattern->fFrameSize;
2802    REStackFrame        *fp            = resetStack();
2803
2804    fp->fPatIdx   = 0;
2805    fp->fInputIdx = startIdx;
2806
2807    // Zero out the pattern's static data
2808    int32_t i;
2809    for (i = 0; i<fPattern->fDataSize; i++) {
2810        fData[i] = 0;
2811    }
2812
2813    //
2814    //  Main loop for interpreting the compiled pattern.
2815    //  One iteration of the loop per pattern operation performed.
2816    //
2817    for (;;) {
2818#if 0
2819        if (_heapchk() != _HEAPOK) {
2820            fprintf(stderr, "Heap Trouble\n");
2821        }
2822#endif
2823
2824        op      = (int32_t)pat[fp->fPatIdx];
2825        opType  = URX_TYPE(op);
2826        opValue = URX_VAL(op);
2827        #ifdef REGEX_RUN_DEBUG
2828        if (fTraceDebug) {
2829            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2830            printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
2831                UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2832            fPattern->dumpOp(fp->fPatIdx);
2833        }
2834        #endif
2835        fp->fPatIdx++;
2836
2837        switch (opType) {
2838
2839
2840        case URX_NOP:
2841            break;
2842
2843
2844        case URX_BACKTRACK:
2845            // Force a backtrack.  In some circumstances, the pattern compiler
2846            //   will notice that the pattern can't possibly match anything, and will
2847            //   emit one of these at that point.
2848            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2849            break;
2850
2851
2852        case URX_ONECHAR:
2853            if (fp->fInputIdx < fActiveLimit) {
2854                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2855                UChar32 c = UTEXT_NEXT32(fInputText);
2856                if (c == opValue) {
2857                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2858                    break;
2859                }
2860            } else {
2861                fHitEnd = TRUE;
2862            }
2863            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2864            break;
2865
2866
2867        case URX_STRING:
2868            {
2869                // Test input against a literal string.
2870                // Strings require two slots in the compiled pattern, one for the
2871                //   offset to the string text, and one for the length.
2872
2873                int32_t   stringStartIdx = opValue;
2874                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2875                fp->fPatIdx++;
2876                opType    = URX_TYPE(op);
2877                int32_t stringLen = URX_VAL(op);
2878                U_ASSERT(opType == URX_STRING_LEN);
2879                U_ASSERT(stringLen >= 2);
2880
2881                const UChar *patternString = litText+stringStartIdx;
2882                int32_t patternStringIndex = 0;
2883                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2884                UChar32 inputChar;
2885                UChar32 patternChar;
2886                UBool success = TRUE;
2887                while (patternStringIndex < stringLen) {
2888                    if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2889                        success = FALSE;
2890                        fHitEnd = TRUE;
2891                        break;
2892                    }
2893                    inputChar = UTEXT_NEXT32(fInputText);
2894                    U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2895                    if (patternChar != inputChar) {
2896                        success = FALSE;
2897                        break;
2898                    }
2899                }
2900
2901                if (success) {
2902                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2903                } else {
2904                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2905                }
2906            }
2907            break;
2908
2909
2910        case URX_STATE_SAVE:
2911            fp = StateSave(fp, opValue, status);
2912            break;
2913
2914
2915        case URX_END:
2916            // The match loop will exit via this path on a successful match,
2917            //   when we reach the end of the pattern.
2918            if (toEnd && fp->fInputIdx != fActiveLimit) {
2919                // The pattern matched, but not to the end of input.  Try some more.
2920                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2921                break;
2922            }
2923            isMatch = TRUE;
2924            goto  breakFromLoop;
2925
2926        // Start and End Capture stack frame variables are laid out out like this:
2927            //  fp->fExtra[opValue]  - The start of a completed capture group
2928            //             opValue+1 - The end   of a completed capture group
2929            //             opValue+2 - the start of a capture group whose end
2930            //                          has not yet been reached (and might not ever be).
2931        case URX_START_CAPTURE:
2932            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2933            fp->fExtra[opValue+2] = fp->fInputIdx;
2934            break;
2935
2936
2937        case URX_END_CAPTURE:
2938            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2939            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2940            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2941            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
2942            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2943            break;
2944
2945
2946        case URX_DOLLAR:                   //  $, test for End of line
2947                                           //     or for position before new line at end of input
2948            {
2949                if (fp->fInputIdx >= fAnchorLimit) {
2950                    // We really are at the end of input.  Success.
2951                    fHitEnd = TRUE;
2952                    fRequireEnd = TRUE;
2953                    break;
2954                }
2955
2956                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2957
2958                // If we are positioned just before a new-line that is located at the
2959                //   end of input, succeed.
2960                UChar32 c = UTEXT_NEXT32(fInputText);
2961                if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2962                    if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
2963                        // If not in the middle of a CR/LF sequence
2964                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2965                            // At new-line at end of input. Success
2966                            fHitEnd = TRUE;
2967                            fRequireEnd = TRUE;
2968
2969                            break;
2970                        }
2971                    }
2972                } else {
2973                    UChar32 nextC = UTEXT_NEXT32(fInputText);
2974                    if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2975                        fHitEnd = TRUE;
2976                        fRequireEnd = TRUE;
2977                        break;                         // At CR/LF at end of input.  Success
2978                    }
2979                }
2980
2981                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2982            }
2983            break;
2984
2985
2986         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
2987            if (fp->fInputIdx >= fAnchorLimit) {
2988                // Off the end of input.  Success.
2989                fHitEnd = TRUE;
2990                fRequireEnd = TRUE;
2991                break;
2992            } else {
2993                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2994                UChar32 c = UTEXT_NEXT32(fInputText);
2995                // Either at the last character of input, or off the end.
2996                if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2997                    fHitEnd = TRUE;
2998                    fRequireEnd = TRUE;
2999                    break;
3000                }
3001            }
3002
3003            // Not at end of input.  Back-track out.
3004            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3005            break;
3006
3007
3008         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
3009             {
3010                 if (fp->fInputIdx >= fAnchorLimit) {
3011                     // We really are at the end of input.  Success.
3012                     fHitEnd = TRUE;
3013                     fRequireEnd = TRUE;
3014                     break;
3015                 }
3016                 // If we are positioned just before a new-line, succeed.
3017                 // It makes no difference where the new-line is within the input.
3018                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3019                 UChar32 c = UTEXT_CURRENT32(fInputText);
3020                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
3021                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
3022                     //  In multi-line mode, hitting a new-line just before the end of input does not
3023                     //   set the hitEnd or requireEnd flags
3024                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3025                        break;
3026                     }
3027                 }
3028                 // not at a new line.  Fail.
3029                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3030             }
3031             break;
3032
3033
3034         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3035             {
3036                 if (fp->fInputIdx >= fAnchorLimit) {
3037                     // We really are at the end of input.  Success.
3038                     fHitEnd = TRUE;
3039                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
3040                     break;               //   adding a new-line would not lose the match.
3041                 }
3042                 // If we are not positioned just before a new-line, the test fails; backtrack out.
3043                 // It makes no difference where the new-line is within the input.
3044                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3045                 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3046                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3047                 }
3048             }
3049             break;
3050
3051
3052       case URX_CARET:                    //  ^, test for start of line
3053            if (fp->fInputIdx != fAnchorStart) {
3054                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3055            }
3056            break;
3057
3058
3059       case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3060           {
3061               if (fp->fInputIdx == fAnchorStart) {
3062                   // We are at the start input.  Success.
3063                   break;
3064               }
3065               // Check whether character just before the current pos is a new-line
3066               //   unless we are at the end of input
3067               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3068               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3069               if ((fp->fInputIdx < fAnchorLimit) &&
3070                   ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3071                   //  It's a new-line.  ^ is true.  Success.
3072                   //  TODO:  what should be done with positions between a CR and LF?
3073                   break;
3074               }
3075               // Not at the start of a line.  Fail.
3076               fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3077           }
3078           break;
3079
3080
3081       case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3082           {
3083               U_ASSERT(fp->fInputIdx >= fAnchorStart);
3084               if (fp->fInputIdx <= fAnchorStart) {
3085                   // We are at the start input.  Success.
3086                   break;
3087               }
3088               // Check whether character just before the current pos is a new-line
3089               U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3090               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3091               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3092               if (c != 0x0a) {
3093                   // Not at the start of a line.  Back-track out.
3094                   fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3095               }
3096           }
3097           break;
3098
3099        case URX_BACKSLASH_B:          // Test for word boundaries
3100            {
3101                UBool success = isWordBoundary(fp->fInputIdx);
3102                success ^= (UBool)(opValue != 0);     // flip sense for \B
3103                if (!success) {
3104                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3105                }
3106            }
3107            break;
3108
3109
3110        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3111            {
3112                UBool success = isUWordBoundary(fp->fInputIdx);
3113                success ^= (UBool)(opValue != 0);     // flip sense for \B
3114                if (!success) {
3115                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3116                }
3117            }
3118            break;
3119
3120
3121        case URX_BACKSLASH_D:            // Test for decimal digit
3122            {
3123                if (fp->fInputIdx >= fActiveLimit) {
3124                    fHitEnd = TRUE;
3125                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3126                    break;
3127                }
3128
3129                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3130
3131                UChar32 c = UTEXT_NEXT32(fInputText);
3132                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3133                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3134                success ^= (UBool)(opValue != 0);        // flip sense for \D
3135                if (success) {
3136                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3137                } else {
3138                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3139                }
3140            }
3141            break;
3142
3143
3144        case URX_BACKSLASH_G:          // Test for position at end of previous match
3145            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3146                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3147            }
3148            break;
3149
3150
3151        case URX_BACKSLASH_X:
3152            //  Match a Grapheme, as defined by Unicode TR 29.
3153            //  Differs slightly from Perl, which consumes combining marks independently
3154            //    of context.
3155            {
3156
3157                // Fail if at end of input
3158                if (fp->fInputIdx >= fActiveLimit) {
3159                    fHitEnd = TRUE;
3160                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3161                    break;
3162                }
3163
3164                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3165
3166                // Examine (and consume) the current char.
3167                //   Dispatch into a little state machine, based on the char.
3168                UChar32  c;
3169                c = UTEXT_NEXT32(fInputText);
3170                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3171                UnicodeSet **sets = fPattern->fStaticSets;
3172                if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
3173                if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3174                if (sets[URX_GC_L]->contains(c))       goto GC_L;
3175                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3176                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3177                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3178                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3179                goto GC_Extend;
3180
3181
3182
3183GC_L:
3184                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3185                c = UTEXT_NEXT32(fInputText);
3186                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3187                if (sets[URX_GC_L]->contains(c))       goto GC_L;
3188                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3189                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3190                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3191                (void)UTEXT_PREVIOUS32(fInputText);
3192                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3193                goto GC_Extend;
3194
3195GC_V:
3196                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3197                c = UTEXT_NEXT32(fInputText);
3198                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3199                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3200                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3201                (void)UTEXT_PREVIOUS32(fInputText);
3202                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3203                goto GC_Extend;
3204
3205GC_T:
3206                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3207                c = UTEXT_NEXT32(fInputText);
3208                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3209                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3210                (void)UTEXT_PREVIOUS32(fInputText);
3211                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3212                goto GC_Extend;
3213
3214GC_Extend:
3215                // Combining characters are consumed here
3216                for (;;) {
3217                    if (fp->fInputIdx >= fActiveLimit) {
3218                        break;
3219                    }
3220                    c = UTEXT_CURRENT32(fInputText);
3221                    if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3222                        break;
3223                    }
3224                    (void)UTEXT_NEXT32(fInputText);
3225                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3226                }
3227                goto GC_Done;
3228
3229GC_Control:
3230                // Most control chars stand alone (don't combine with combining chars),
3231                //   except for that CR/LF sequence is a single grapheme cluster.
3232                if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3233                    c = UTEXT_NEXT32(fInputText);
3234                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3235                }
3236
3237GC_Done:
3238                if (fp->fInputIdx >= fActiveLimit) {
3239                    fHitEnd = TRUE;
3240                }
3241                break;
3242            }
3243
3244
3245
3246
3247        case URX_BACKSLASH_Z:          // Test for end of Input
3248            if (fp->fInputIdx < fAnchorLimit) {
3249                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3250            } else {
3251                fHitEnd = TRUE;
3252                fRequireEnd = TRUE;
3253            }
3254            break;
3255
3256
3257
3258        case URX_STATIC_SETREF:
3259            {
3260                // Test input character against one of the predefined sets
3261                //    (Word Characters, for example)
3262                // The high bit of the op value is a flag for the match polarity.
3263                //    0:   success if input char is in set.
3264                //    1:   success if input char is not in set.
3265                if (fp->fInputIdx >= fActiveLimit) {
3266                    fHitEnd = TRUE;
3267                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3268                    break;
3269                }
3270
3271                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3272                opValue &= ~URX_NEG_SET;
3273                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3274
3275                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3276                UChar32 c = UTEXT_NEXT32(fInputText);
3277                if (c < 256) {
3278                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3279                    if (s8->contains(c)) {
3280                        success = !success;
3281                    }
3282                } else {
3283                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3284                    if (s->contains(c)) {
3285                        success = !success;
3286                    }
3287                }
3288                if (success) {
3289                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3290                } else {
3291                    // the character wasn't in the set.
3292                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3293                }
3294            }
3295            break;
3296
3297
3298        case URX_STAT_SETREF_N:
3299            {
3300                // Test input character for NOT being a member of  one of
3301                //    the predefined sets (Word Characters, for example)
3302                if (fp->fInputIdx >= fActiveLimit) {
3303                    fHitEnd = TRUE;
3304                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3305                    break;
3306                }
3307
3308                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3309
3310                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3311
3312                UChar32 c = UTEXT_NEXT32(fInputText);
3313                if (c < 256) {
3314                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3315                    if (s8->contains(c) == FALSE) {
3316                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3317                        break;
3318                    }
3319                } else {
3320                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3321                    if (s->contains(c) == FALSE) {
3322                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3323                        break;
3324                    }
3325                }
3326                // the character wasn't in the set.
3327                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3328            }
3329            break;
3330
3331
3332        case URX_SETREF:
3333            if (fp->fInputIdx >= fActiveLimit) {
3334                fHitEnd = TRUE;
3335                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3336                break;
3337            } else {
3338                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3339
3340                // There is input left.  Pick up one char and test it for set membership.
3341                UChar32 c = UTEXT_NEXT32(fInputText);
3342                U_ASSERT(opValue > 0 && opValue < sets->size());
3343                if (c<256) {
3344                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3345                    if (s8->contains(c)) {
3346                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3347                        break;
3348                    }
3349                } else {
3350                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3351                    if (s->contains(c)) {
3352                        // The character is in the set.  A Match.
3353                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3354                        break;
3355                    }
3356                }
3357
3358                // the character wasn't in the set.
3359                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3360            }
3361            break;
3362
3363
3364        case URX_DOTANY:
3365            {
3366                // . matches anything, but stops at end-of-line.
3367                if (fp->fInputIdx >= fActiveLimit) {
3368                    // At end of input.  Match failed.  Backtrack out.
3369                    fHitEnd = TRUE;
3370                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3371                    break;
3372                }
3373
3374                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3375
3376                // There is input left.  Advance over one char, unless we've hit end-of-line
3377                UChar32 c = UTEXT_NEXT32(fInputText);
3378                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
3379                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3380                    // End of line in normal mode.   . does not match.
3381                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3382                    break;
3383                }
3384                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3385            }
3386            break;
3387
3388
3389        case URX_DOTANY_ALL:
3390            {
3391                // ., in dot-matches-all (including new lines) mode
3392                if (fp->fInputIdx >= fActiveLimit) {
3393                    // At end of input.  Match failed.  Backtrack out.
3394                    fHitEnd = TRUE;
3395                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3396                    break;
3397                }
3398
3399                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3400
3401                // There is input left.  Advance over one char, except if we are
3402                //   at a cr/lf, advance over both of them.
3403                UChar32 c;
3404                c = UTEXT_NEXT32(fInputText);
3405                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3406                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3407                    // In the case of a CR/LF, we need to advance over both.
3408                    UChar32 nextc = UTEXT_CURRENT32(fInputText);
3409                    if (nextc == 0x0a) {
3410                        (void)UTEXT_NEXT32(fInputText);
3411                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3412                    }
3413                }
3414            }
3415            break;
3416
3417
3418        case URX_DOTANY_UNIX:
3419            {
3420                // '.' operator, matches all, but stops at end-of-line.
3421                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3422                if (fp->fInputIdx >= fActiveLimit) {
3423                    // At end of input.  Match failed.  Backtrack out.
3424                    fHitEnd = TRUE;
3425                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3426                    break;
3427                }
3428
3429                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3430
3431                // There is input left.  Advance over one char, unless we've hit end-of-line
3432                UChar32 c = UTEXT_NEXT32(fInputText);
3433                if (c == 0x0a) {
3434                    // End of line in normal mode.   '.' does not match the \n
3435                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3436                } else {
3437                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3438                }
3439            }
3440            break;
3441
3442
3443        case URX_JMP:
3444            fp->fPatIdx = opValue;
3445            break;
3446
3447        case URX_FAIL:
3448            isMatch = FALSE;
3449            goto breakFromLoop;
3450
3451        case URX_JMP_SAV:
3452            U_ASSERT(opValue < fPattern->fCompiledPat->size());
3453            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3454            fp->fPatIdx = opValue;                         // Then JMP.
3455            break;
3456
3457        case URX_JMP_SAV_X:
3458            // This opcode is used with (x)+, when x can match a zero length string.
3459            // Same as JMP_SAV, except conditional on the match having made forward progress.
3460            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3461            //   data address of the input position at the start of the loop.
3462            {
3463                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3464                int32_t  stoOp = (int32_t)pat[opValue-1];
3465                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3466                int32_t  frameLoc = URX_VAL(stoOp);
3467                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3468                int64_t prevInputIdx = fp->fExtra[frameLoc];
3469                U_ASSERT(prevInputIdx <= fp->fInputIdx);
3470                if (prevInputIdx < fp->fInputIdx) {
3471                    // The match did make progress.  Repeat the loop.
3472                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3473                    fp->fPatIdx = opValue;
3474                    fp->fExtra[frameLoc] = fp->fInputIdx;
3475                }
3476                // If the input position did not advance, we do nothing here,
3477                //   execution will fall out of the loop.
3478            }
3479            break;
3480
3481        case URX_CTR_INIT:
3482            {
3483                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3484                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3485
3486                // Pick up the three extra operands that CTR_INIT has, and
3487                //    skip the pattern location counter past
3488                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3489                fp->fPatIdx += 3;
3490                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3491                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3492                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3493                U_ASSERT(minCount>=0);
3494                U_ASSERT(maxCount>=minCount || maxCount==-1);
3495                U_ASSERT(loopLoc>fp->fPatIdx);
3496
3497                if (minCount == 0) {
3498                    fp = StateSave(fp, loopLoc+1, status);
3499                }
3500                if (maxCount == 0) {
3501                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3502                }
3503            }
3504            break;
3505
3506        case URX_CTR_LOOP:
3507            {
3508                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3509                int32_t initOp = (int32_t)pat[opValue];
3510                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3511                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3512                int32_t minCount  = (int32_t)pat[opValue+2];
3513                int32_t maxCount  = (int32_t)pat[opValue+3];
3514                // Increment the counter.  Note: we DIDN'T worry about counter
3515                //   overflow, since the data comes from UnicodeStrings, which
3516                //   stores its length in an int32_t. Do we have to think about
3517                //   this now that we're using UText? Probably not, since the length
3518                //   in UChar32s is still an int32_t.
3519                (*pCounter)++;
3520                U_ASSERT(*pCounter > 0);
3521                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3522                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
3523                    break;
3524                }
3525                if (*pCounter >= minCount) {
3526                    fp = StateSave(fp, fp->fPatIdx, status);
3527                }
3528                fp->fPatIdx = opValue + 4;    // Loop back.
3529            }
3530            break;
3531
3532        case URX_CTR_INIT_NG:
3533            {
3534                // Initialize a non-greedy loop
3535                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3536                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3537
3538                // Pick up the three extra operands that CTR_INIT has, and
3539                //    skip the pattern location counter past
3540                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3541                fp->fPatIdx += 3;
3542                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3543                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3544                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3545                U_ASSERT(minCount>=0);
3546                U_ASSERT(maxCount>=minCount || maxCount==-1);
3547                U_ASSERT(loopLoc>fp->fPatIdx);
3548
3549                if (minCount == 0) {
3550                    if (maxCount != 0) {
3551                        fp = StateSave(fp, fp->fPatIdx, status);
3552                    }
3553                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3554                }
3555            }
3556            break;
3557
3558        case URX_CTR_LOOP_NG:
3559            {
3560                // Non-greedy {min, max} loops
3561                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3562                int32_t initOp = (int32_t)pat[opValue];
3563                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3564                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3565                int32_t minCount  = (int32_t)pat[opValue+2];
3566                int32_t maxCount  = (int32_t)pat[opValue+3];
3567                // Increment the counter.  Note: we DIDN'T worry about counter
3568                //   overflow, since the data comes from UnicodeStrings, which
3569                //   stores its length in an int32_t. Do we have to think about
3570                //   this now that we're using UText? Probably not, since the length
3571                //   in UChar32s is still an int32_t.
3572                (*pCounter)++;
3573                U_ASSERT(*pCounter > 0);
3574
3575                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3576                    // The loop has matched the maximum permitted number of times.
3577                    //   Break out of here with no action.  Matching will
3578                    //   continue with the following pattern.
3579                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
3580                    break;
3581                }
3582
3583                if (*pCounter < minCount) {
3584                    // We haven't met the minimum number of matches yet.
3585                    //   Loop back for another one.
3586                    fp->fPatIdx = opValue + 4;    // Loop back.
3587                } else {
3588                    // We do have the minimum number of matches.
3589                    //   Fall into the following pattern, but first do
3590                    //   a state save to the top of the loop, so that a failure
3591                    //   in the following pattern will try another iteration of the loop.
3592                    fp = StateSave(fp, opValue + 4, status);
3593                }
3594            }
3595            break;
3596
3597        case URX_STO_SP:
3598            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3599            fData[opValue] = fStack->size();
3600            break;
3601
3602        case URX_LD_SP:
3603            {
3604                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3605                int32_t newStackSize = (int32_t)fData[opValue];
3606                U_ASSERT(newStackSize <= fStack->size());
3607                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3608                if (newFP == (int64_t *)fp) {
3609                    break;
3610                }
3611                int32_t i;
3612                for (i=0; i<fFrameSize; i++) {
3613                    newFP[i] = ((int64_t *)fp)[i];
3614                }
3615                fp = (REStackFrame *)newFP;
3616                fStack->setSize(newStackSize);
3617            }
3618            break;
3619
3620        case URX_BACKREF:
3621            {
3622                U_ASSERT(opValue < fFrameSize);
3623                int64_t groupStartIdx = fp->fExtra[opValue];
3624                int64_t groupEndIdx   = fp->fExtra[opValue+1];
3625                U_ASSERT(groupStartIdx <= groupEndIdx);
3626                if (groupStartIdx < 0) {
3627                    // This capture group has not participated in the match thus far,
3628                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3629                    break;
3630                }
3631                UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3632                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3633
3634                //   Note: if the capture group match was of an empty string the backref
3635                //         match succeeds.  Verified by testing:  Perl matches succeed
3636                //         in this case, so we do too.
3637
3638                UBool success = TRUE;
3639                for (;;) {
3640                    if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3641                        success = TRUE;
3642                        break;
3643                    }
3644                    if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3645                        success = FALSE;
3646                        fHitEnd = TRUE;
3647                        break;
3648                    }
3649                    UChar32 captureGroupChar = utext_next32(fAltInputText);
3650                    UChar32 inputChar = utext_next32(fInputText);
3651                    if (inputChar != captureGroupChar) {
3652                        success = FALSE;
3653                        break;
3654                    }
3655                }
3656
3657                if (success) {
3658                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3659                } else {
3660                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3661                }
3662            }
3663            break;
3664
3665
3666
3667        case URX_BACKREF_I:
3668            {
3669                U_ASSERT(opValue < fFrameSize);
3670                int64_t groupStartIdx = fp->fExtra[opValue];
3671                int64_t groupEndIdx   = fp->fExtra[opValue+1];
3672                U_ASSERT(groupStartIdx <= groupEndIdx);
3673                if (groupStartIdx < 0) {
3674                    // This capture group has not participated in the match thus far,
3675                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3676                    break;
3677                }
3678                utext_setNativeIndex(fAltInputText, groupStartIdx);
3679                utext_setNativeIndex(fInputText, fp->fInputIdx);
3680                CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3681                CaseFoldingUTextIterator inputItr(*fInputText);
3682
3683                //   Note: if the capture group match was of an empty string the backref
3684                //         match succeeds.  Verified by testing:  Perl matches succeed
3685                //         in this case, so we do too.
3686
3687                UBool success = TRUE;
3688                for (;;) {
3689                    if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3690                        success = TRUE;
3691                        break;
3692                    }
3693                    if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3694                        success = FALSE;
3695                        fHitEnd = TRUE;
3696                        break;
3697                    }
3698                    UChar32 captureGroupChar = captureGroupItr.next();
3699                    UChar32 inputChar = inputItr.next();
3700                    if (inputChar != captureGroupChar) {
3701                        success = FALSE;
3702                        break;
3703                    }
3704                }
3705
3706                if (success && inputItr.inExpansion()) {
3707                    // We otained a match by consuming part of a string obtained from
3708                    // case-folding a single code point of the input text.
3709                    // This does not count as an overall match.
3710                    success = FALSE;
3711                }
3712
3713                if (success) {
3714                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3715                } else {
3716                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3717                }
3718
3719            }
3720            break;
3721
3722        case URX_STO_INP_LOC:
3723            {
3724                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3725                fp->fExtra[opValue] = fp->fInputIdx;
3726            }
3727            break;
3728
3729        case URX_JMPX:
3730            {
3731                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3732                fp->fPatIdx += 1;
3733                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3734                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3735                int64_t savedInputIdx = fp->fExtra[dataLoc];
3736                U_ASSERT(savedInputIdx <= fp->fInputIdx);
3737                if (savedInputIdx < fp->fInputIdx) {
3738                    fp->fPatIdx = opValue;                               // JMP
3739                } else {
3740                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3741                }
3742            }
3743            break;
3744
3745        case URX_LA_START:
3746            {
3747                // Entering a lookahead block.
3748                // Save Stack Ptr, Input Pos.
3749                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3750                fData[opValue]   = fStack->size();
3751                fData[opValue+1] = fp->fInputIdx;
3752                fActiveStart     = fLookStart;          // Set the match region change for
3753                fActiveLimit     = fLookLimit;          //   transparent bounds.
3754            }
3755            break;
3756
3757        case URX_LA_END:
3758            {
3759                // Leaving a look-ahead block.
3760                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3761                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3762                int32_t stackSize = fStack->size();
3763                int32_t newStackSize =(int32_t)fData[opValue];
3764                U_ASSERT(stackSize >= newStackSize);
3765                if (stackSize > newStackSize) {
3766                    // Copy the current top frame back to the new (cut back) top frame.
3767                    //   This makes the capture groups from within the look-ahead
3768                    //   expression available.
3769                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3770                    int32_t i;
3771                    for (i=0; i<fFrameSize; i++) {
3772                        newFP[i] = ((int64_t *)fp)[i];
3773                    }
3774                    fp = (REStackFrame *)newFP;
3775                    fStack->setSize(newStackSize);
3776                }
3777                fp->fInputIdx = fData[opValue+1];
3778
3779                // Restore the active region bounds in the input string; they may have
3780                //    been changed because of transparent bounds on a Region.
3781                fActiveStart = fRegionStart;
3782                fActiveLimit = fRegionLimit;
3783            }
3784            break;
3785
3786        case URX_ONECHAR_I:
3787            // Case insensitive one char.  The char from the pattern is already case folded.
3788            // Input text is not, but case folding the input can not reduce two or more code
3789            // points to one.
3790            if (fp->fInputIdx < fActiveLimit) {
3791                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3792
3793                UChar32 c = UTEXT_NEXT32(fInputText);
3794                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3795                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3796                    break;
3797                }
3798            } else {
3799                fHitEnd = TRUE;
3800            }
3801
3802            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3803            break;
3804
3805        case URX_STRING_I:
3806            {
3807                // Case-insensitive test input against a literal string.
3808                // Strings require two slots in the compiled pattern, one for the
3809                //   offset to the string text, and one for the length.
3810                //   The compiled string has already been case folded.
3811                {
3812                    const UChar *patternString = litText + opValue;
3813                    int32_t      patternStringIdx  = 0;
3814
3815                    op      = (int32_t)pat[fp->fPatIdx];
3816                    fp->fPatIdx++;
3817                    opType  = URX_TYPE(op);
3818                    opValue = URX_VAL(op);
3819                    U_ASSERT(opType == URX_STRING_LEN);
3820                    int32_t patternStringLen = opValue;  // Length of the string from the pattern.
3821
3822
3823                    UChar32   cPattern;
3824                    UChar32   cText;
3825                    UBool     success = TRUE;
3826
3827                    UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3828                    CaseFoldingUTextIterator inputIterator(*fInputText);
3829                    while (patternStringIdx < patternStringLen) {
3830                        if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3831                            success = FALSE;
3832                            fHitEnd = TRUE;
3833                            break;
3834                        }
3835                        U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3836                        cText = inputIterator.next();
3837                        if (cText != cPattern) {
3838                            success = FALSE;
3839                            break;
3840                        }
3841                    }
3842                    if (inputIterator.inExpansion()) {
3843                        success = FALSE;
3844                    }
3845
3846                    if (success) {
3847                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3848                    } else {
3849                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3850                    }
3851                }
3852            }
3853            break;
3854
3855        case URX_LB_START:
3856            {
3857                // Entering a look-behind block.
3858                // Save Stack Ptr, Input Pos.
3859                //   TODO:  implement transparent bounds.  Ticket #6067
3860                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3861                fData[opValue]   = fStack->size();
3862                fData[opValue+1] = fp->fInputIdx;
3863                // Init the variable containing the start index for attempted matches.
3864                fData[opValue+2] = -1;
3865                // Save input string length, then reset to pin any matches to end at
3866                //   the current position.
3867                fData[opValue+3] = fActiveLimit;
3868                fActiveLimit     = fp->fInputIdx;
3869            }
3870            break;
3871
3872
3873        case URX_LB_CONT:
3874            {
3875                // Positive Look-Behind, at top of loop checking for matches of LB expression
3876                //    at all possible input starting positions.
3877
3878                // Fetch the min and max possible match lengths.  They are the operands
3879                //   of this op in the pattern.
3880                int32_t minML = (int32_t)pat[fp->fPatIdx++];
3881                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3882                U_ASSERT(minML <= maxML);
3883                U_ASSERT(minML >= 0);
3884
3885                // Fetch (from data) the last input index where a match was attempted.
3886                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3887                int64_t  *lbStartIdx = &fData[opValue+2];
3888                if (*lbStartIdx < 0) {
3889                    // First time through loop.
3890                    *lbStartIdx = fp->fInputIdx - minML;
3891                } else {
3892                    // 2nd through nth time through the loop.
3893                    // Back up start position for match by one.
3894                    if (*lbStartIdx == 0) {
3895                        (*lbStartIdx)--;
3896                    } else {
3897                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3898                        (void)UTEXT_PREVIOUS32(fInputText);
3899                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3900                    }
3901                }
3902
3903                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3904                    // We have tried all potential match starting points without
3905                    //  getting a match.  Backtrack out, and out of the
3906                    //   Look Behind altogether.
3907                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3908                    int64_t restoreInputLen = fData[opValue+3];
3909                    U_ASSERT(restoreInputLen >= fActiveLimit);
3910                    U_ASSERT(restoreInputLen <= fInputLength);
3911                    fActiveLimit = restoreInputLen;
3912                    break;
3913                }
3914
3915                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3916                //      (successful match will fall off the end of the loop.)
3917                fp = StateSave(fp, fp->fPatIdx-3, status);
3918                fp->fInputIdx = *lbStartIdx;
3919            }
3920            break;
3921
3922        case URX_LB_END:
3923            // End of a look-behind block, after a successful match.
3924            {
3925                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3926                if (fp->fInputIdx != fActiveLimit) {
3927                    //  The look-behind expression matched, but the match did not
3928                    //    extend all the way to the point that we are looking behind from.
3929                    //  FAIL out of here, which will take us back to the LB_CONT, which
3930                    //     will retry the match starting at another position or fail
3931                    //     the look-behind altogether, whichever is appropriate.
3932                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3933                    break;
3934                }
3935
3936                // Look-behind match is good.  Restore the orignal input string length,
3937                //   which had been truncated to pin the end of the lookbehind match to the
3938                //   position being looked-behind.
3939                int64_t originalInputLen = fData[opValue+3];
3940                U_ASSERT(originalInputLen >= fActiveLimit);
3941                U_ASSERT(originalInputLen <= fInputLength);
3942                fActiveLimit = originalInputLen;
3943            }
3944            break;
3945
3946
3947        case URX_LBN_CONT:
3948            {
3949                // Negative Look-Behind, at top of loop checking for matches of LB expression
3950                //    at all possible input starting positions.
3951
3952                // Fetch the extra parameters of this op.
3953                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
3954                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
3955                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
3956                        continueLoc = URX_VAL(continueLoc);
3957                U_ASSERT(minML <= maxML);
3958                U_ASSERT(minML >= 0);
3959                U_ASSERT(continueLoc > fp->fPatIdx);
3960
3961                // Fetch (from data) the last input index where a match was attempted.
3962                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3963                int64_t  *lbStartIdx = &fData[opValue+2];
3964                if (*lbStartIdx < 0) {
3965                    // First time through loop.
3966                    *lbStartIdx = fp->fInputIdx - minML;
3967                } else {
3968                    // 2nd through nth time through the loop.
3969                    // Back up start position for match by one.
3970                    if (*lbStartIdx == 0) {
3971                        (*lbStartIdx)--;
3972                    } else {
3973                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3974                        (void)UTEXT_PREVIOUS32(fInputText);
3975                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3976                    }
3977                }
3978
3979                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3980                    // We have tried all potential match starting points without
3981                    //  getting a match, which means that the negative lookbehind as
3982                    //  a whole has succeeded.  Jump forward to the continue location
3983                    int64_t restoreInputLen = fData[opValue+3];
3984                    U_ASSERT(restoreInputLen >= fActiveLimit);
3985                    U_ASSERT(restoreInputLen <= fInputLength);
3986                    fActiveLimit = restoreInputLen;
3987                    fp->fPatIdx = continueLoc;
3988                    break;
3989                }
3990
3991                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3992                //      (successful match will cause a FAIL out of the loop altogether.)
3993                fp = StateSave(fp, fp->fPatIdx-4, status);
3994                fp->fInputIdx = *lbStartIdx;
3995            }
3996            break;
3997
3998        case URX_LBN_END:
3999            // End of a negative look-behind block, after a successful match.
4000            {
4001                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4002                if (fp->fInputIdx != fActiveLimit) {
4003                    //  The look-behind expression matched, but the match did not
4004                    //    extend all the way to the point that we are looking behind from.
4005                    //  FAIL out of here, which will take us back to the LB_CONT, which
4006                    //     will retry the match starting at another position or succeed
4007                    //     the look-behind altogether, whichever is appropriate.
4008                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4009                    break;
4010                }
4011
4012                // Look-behind expression matched, which means look-behind test as
4013                //   a whole Fails
4014
4015                //   Restore the orignal input string length, which had been truncated
4016                //   inorder to pin the end of the lookbehind match
4017                //   to the position being looked-behind.
4018                int64_t originalInputLen = fData[opValue+3];
4019                U_ASSERT(originalInputLen >= fActiveLimit);
4020                U_ASSERT(originalInputLen <= fInputLength);
4021                fActiveLimit = originalInputLen;
4022
4023                // Restore original stack position, discarding any state saved
4024                //   by the successful pattern match.
4025                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4026                int32_t newStackSize = (int32_t)fData[opValue];
4027                U_ASSERT(fStack->size() > newStackSize);
4028                fStack->setSize(newStackSize);
4029
4030                //  FAIL, which will take control back to someplace
4031                //  prior to entering the look-behind test.
4032                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4033            }
4034            break;
4035
4036
4037        case URX_LOOP_SR_I:
4038            // Loop Initialization for the optimized implementation of
4039            //     [some character set]*
4040            //   This op scans through all matching input.
4041            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4042            {
4043                U_ASSERT(opValue > 0 && opValue < sets->size());
4044                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4045                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
4046
4047                // Loop through input, until either the input is exhausted or
4048                //   we reach a character that is not a member of the set.
4049                int64_t ix = fp->fInputIdx;
4050                UTEXT_SETNATIVEINDEX(fInputText, ix);
4051                for (;;) {
4052                    if (ix >= fActiveLimit) {
4053                        fHitEnd = TRUE;
4054                        break;
4055                    }
4056                    UChar32 c = UTEXT_NEXT32(fInputText);
4057                    if (c<256) {
4058                        if (s8->contains(c) == FALSE) {
4059                            break;
4060                        }
4061                    } else {
4062                        if (s->contains(c) == FALSE) {
4063                            break;
4064                        }
4065                    }
4066                    ix = UTEXT_GETNATIVEINDEX(fInputText);
4067                }
4068
4069                // If there were no matching characters, skip over the loop altogether.
4070                //   The loop doesn't run at all, a * op always succeeds.
4071                if (ix == fp->fInputIdx) {
4072                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4073                    break;
4074                }
4075
4076                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4077                //   must follow.  It's operand is the stack location
4078                //   that holds the starting input index for the match of this [set]*
4079                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4080                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4081                int32_t stackLoc = URX_VAL(loopcOp);
4082                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4083                fp->fExtra[stackLoc] = fp->fInputIdx;
4084                fp->fInputIdx = ix;
4085
4086                // Save State to the URX_LOOP_C op that follows this one,
4087                //   so that match failures in the following code will return to there.
4088                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4089                fp = StateSave(fp, fp->fPatIdx, status);
4090                fp->fPatIdx++;
4091            }
4092            break;
4093
4094
4095        case URX_LOOP_DOT_I:
4096            // Loop Initialization for the optimized implementation of .*
4097            //   This op scans through all remaining input.
4098            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4099            {
4100                // Loop through input until the input is exhausted (we reach an end-of-line)
4101                // In DOTALL mode, we can just go straight to the end of the input.
4102                int64_t ix;
4103                if ((opValue & 1) == 1) {
4104                    // Dot-matches-All mode.  Jump straight to the end of the string.
4105                    ix = fActiveLimit;
4106                    fHitEnd = TRUE;
4107                } else {
4108                    // NOT DOT ALL mode.  Line endings do not match '.'
4109                    // Scan forward until a line ending or end of input.
4110                    ix = fp->fInputIdx;
4111                    UTEXT_SETNATIVEINDEX(fInputText, ix);
4112                    for (;;) {
4113                        if (ix >= fActiveLimit) {
4114                            fHitEnd = TRUE;
4115                            break;
4116                        }
4117                        UChar32 c = UTEXT_NEXT32(fInputText);
4118                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4119                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4120                               (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4121                                    (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4122                                //  char is a line ending.  Exit the scanning loop.
4123                                break;
4124                            }
4125                        }
4126                        ix = UTEXT_GETNATIVEINDEX(fInputText);
4127                    }
4128                }
4129
4130                // If there were no matching characters, skip over the loop altogether.
4131                //   The loop doesn't run at all, a * op always succeeds.
4132                if (ix == fp->fInputIdx) {
4133                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4134                    break;
4135                }
4136
4137                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4138                //   must follow.  It's operand is the stack location
4139                //   that holds the starting input index for the match of this .*
4140                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4141                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4142                int32_t stackLoc = URX_VAL(loopcOp);
4143                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4144                fp->fExtra[stackLoc] = fp->fInputIdx;
4145                fp->fInputIdx = ix;
4146
4147                // Save State to the URX_LOOP_C op that follows this one,
4148                //   so that match failures in the following code will return to there.
4149                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4150                fp = StateSave(fp, fp->fPatIdx, status);
4151                fp->fPatIdx++;
4152            }
4153            break;
4154
4155
4156        case URX_LOOP_C:
4157            {
4158                U_ASSERT(opValue>=0 && opValue<fFrameSize);
4159                backSearchIndex = fp->fExtra[opValue];
4160                U_ASSERT(backSearchIndex <= fp->fInputIdx);
4161                if (backSearchIndex == fp->fInputIdx) {
4162                    // We've backed up the input idx to the point that the loop started.
4163                    // The loop is done.  Leave here without saving state.
4164                    //  Subsequent failures won't come back here.
4165                    break;
4166                }
4167                // Set up for the next iteration of the loop, with input index
4168                //   backed up by one from the last time through,
4169                //   and a state save to this instruction in case the following code fails again.
4170                //   (We're going backwards because this loop emulates stack unwinding, not
4171                //    the initial scan forward.)
4172                U_ASSERT(fp->fInputIdx > 0);
4173                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4174                UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4175                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4176
4177                UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4178                if (prevC == 0x0a &&
4179                    fp->fInputIdx > backSearchIndex &&
4180                    twoPrevC == 0x0d) {
4181                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4182                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4183                        // .*, stepping back over CRLF pair.
4184                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4185                    }
4186                }
4187
4188
4189                fp = StateSave(fp, fp->fPatIdx-1, status);
4190            }
4191            break;
4192
4193
4194
4195        default:
4196            // Trouble.  The compiled pattern contains an entry with an
4197            //           unrecognized type tag.
4198            U_ASSERT(FALSE);
4199        }
4200
4201        if (U_FAILURE(status)) {
4202            isMatch = FALSE;
4203            break;
4204        }
4205    }
4206
4207breakFromLoop:
4208    fMatch = isMatch;
4209    if (isMatch) {
4210        fLastMatchEnd = fMatchEnd;
4211        fMatchStart   = startIdx;
4212        fMatchEnd     = fp->fInputIdx;
4213        if (fTraceDebug) {
4214            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
4215        }
4216    }
4217    else
4218    {
4219        if (fTraceDebug) {
4220            REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4221        }
4222    }
4223
4224    fFrame = fp;                // The active stack frame when the engine stopped.
4225                                //   Contains the capture group results that we need to
4226                                //    access later.
4227    return;
4228}
4229
4230
4231//--------------------------------------------------------------------------------
4232//
4233//   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4234//                  assumption that the entire string is available in the UText's
4235//                  chunk buffer. For now, that means we can use int32_t indexes,
4236//                  except for anything that needs to be saved (like group starts
4237//                  and ends).
4238//
4239//                  startIdx:    begin matching a this index.
4240//                  toEnd:       if true, match must extend to end of the input region
4241//
4242//--------------------------------------------------------------------------------
4243void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4244    UBool       isMatch  = FALSE;      // True if the we have a match.
4245
4246    int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4247
4248    int32_t     op;                    // Operation from the compiled pattern, split into
4249    int32_t     opType;                //    the opcode
4250    int32_t     opValue;               //    and the operand value.
4251
4252#ifdef REGEX_RUN_DEBUG
4253    if (fTraceDebug)
4254    {
4255        printf("MatchAt(startIdx=%ld)\n", startIdx);
4256        printf("Original Pattern: ");
4257        UChar32 c = utext_next32From(fPattern->fPattern, 0);
4258        while (c != U_SENTINEL) {
4259            if (c<32 || c>256) {
4260                c = '.';
4261            }
4262            REGEX_DUMP_DEBUG_PRINTF(("%c", c));
4263
4264            c = UTEXT_NEXT32(fPattern->fPattern);
4265        }
4266        printf("\n");
4267        printf("Input String: ");
4268        c = utext_next32From(fInputText, 0);
4269        while (c != U_SENTINEL) {
4270            if (c<32 || c>256) {
4271                c = '.';
4272            }
4273            printf("%c", c);
4274
4275            c = UTEXT_NEXT32(fInputText);
4276        }
4277        printf("\n");
4278        printf("\n");
4279    }
4280#endif
4281
4282    if (U_FAILURE(status)) {
4283        return;
4284    }
4285
4286    //  Cache frequently referenced items from the compiled pattern
4287    //
4288    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4289
4290    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4291    UVector             *sets          = fPattern->fSets;
4292
4293    const UChar         *inputBuf      = fInputText->chunkContents;
4294
4295    fFrameSize = fPattern->fFrameSize;
4296    REStackFrame        *fp            = resetStack();
4297
4298    fp->fPatIdx   = 0;
4299    fp->fInputIdx = startIdx;
4300
4301    // Zero out the pattern's static data
4302    int32_t i;
4303    for (i = 0; i<fPattern->fDataSize; i++) {
4304        fData[i] = 0;
4305    }
4306
4307    //
4308    //  Main loop for interpreting the compiled pattern.
4309    //  One iteration of the loop per pattern operation performed.
4310    //
4311    for (;;) {
4312#if 0
4313        if (_heapchk() != _HEAPOK) {
4314            fprintf(stderr, "Heap Trouble\n");
4315        }
4316#endif
4317
4318        op      = (int32_t)pat[fp->fPatIdx];
4319        opType  = URX_TYPE(op);
4320        opValue = URX_VAL(op);
4321#ifdef REGEX_RUN_DEBUG
4322        if (fTraceDebug) {
4323            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4324            printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
4325                   UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4326            fPattern->dumpOp(fp->fPatIdx);
4327        }
4328#endif
4329        fp->fPatIdx++;
4330
4331        switch (opType) {
4332
4333
4334        case URX_NOP:
4335            break;
4336
4337
4338        case URX_BACKTRACK:
4339            // Force a backtrack.  In some circumstances, the pattern compiler
4340            //   will notice that the pattern can't possibly match anything, and will
4341            //   emit one of these at that point.
4342            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4343            break;
4344
4345
4346        case URX_ONECHAR:
4347            if (fp->fInputIdx < fActiveLimit) {
4348                UChar32 c;
4349                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4350                if (c == opValue) {
4351                    break;
4352                }
4353            } else {
4354                fHitEnd = TRUE;
4355            }
4356            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4357            break;
4358
4359
4360        case URX_STRING:
4361            {
4362                // Test input against a literal string.
4363                // Strings require two slots in the compiled pattern, one for the
4364                //   offset to the string text, and one for the length.
4365                int32_t   stringStartIdx = opValue;
4366                int32_t   stringLen;
4367
4368                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4369                fp->fPatIdx++;
4370                opType    = URX_TYPE(op);
4371                stringLen = URX_VAL(op);
4372                U_ASSERT(opType == URX_STRING_LEN);
4373                U_ASSERT(stringLen >= 2);
4374
4375                const UChar * pInp = inputBuf + fp->fInputIdx;
4376                const UChar * pInpLimit = inputBuf + fActiveLimit;
4377                const UChar * pPat = litText+stringStartIdx;
4378                const UChar * pEnd = pInp + stringLen;
4379                UBool success = TRUE;
4380                while (pInp < pEnd) {
4381                    if (pInp >= pInpLimit) {
4382                        fHitEnd = TRUE;
4383                        success = FALSE;
4384                        break;
4385                    }
4386                    if (*pInp++ != *pPat++) {
4387                        success = FALSE;
4388                        break;
4389                    }
4390                }
4391
4392                if (success) {
4393                    fp->fInputIdx += stringLen;
4394                } else {
4395                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4396                }
4397            }
4398            break;
4399
4400
4401        case URX_STATE_SAVE:
4402            fp = StateSave(fp, opValue, status);
4403            break;
4404
4405
4406        case URX_END:
4407            // The match loop will exit via this path on a successful match,
4408            //   when we reach the end of the pattern.
4409            if (toEnd && fp->fInputIdx != fActiveLimit) {
4410                // The pattern matched, but not to the end of input.  Try some more.
4411                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4412                break;
4413            }
4414            isMatch = TRUE;
4415            goto  breakFromLoop;
4416
4417            // Start and End Capture stack frame variables are laid out out like this:
4418            //  fp->fExtra[opValue]  - The start of a completed capture group
4419            //             opValue+1 - The end   of a completed capture group
4420            //             opValue+2 - the start of a capture group whose end
4421            //                          has not yet been reached (and might not ever be).
4422        case URX_START_CAPTURE:
4423            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4424            fp->fExtra[opValue+2] = fp->fInputIdx;
4425            break;
4426
4427
4428        case URX_END_CAPTURE:
4429            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4430            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4431            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4432            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4433            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4434            break;
4435
4436
4437        case URX_DOLLAR:                   //  $, test for End of line
4438            //     or for position before new line at end of input
4439            if (fp->fInputIdx < fAnchorLimit-2) {
4440                // We are no where near the end of input.  Fail.
4441                //   This is the common case.  Keep it first.
4442                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4443                break;
4444            }
4445            if (fp->fInputIdx >= fAnchorLimit) {
4446                // We really are at the end of input.  Success.
4447                fHitEnd = TRUE;
4448                fRequireEnd = TRUE;
4449                break;
4450            }
4451
4452            // If we are positioned just before a new-line that is located at the
4453            //   end of input, succeed.
4454            if (fp->fInputIdx == fAnchorLimit-1) {
4455                UChar32 c;
4456                U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4457
4458                if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4459                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4460                        // At new-line at end of input. Success
4461                        fHitEnd = TRUE;
4462                        fRequireEnd = TRUE;
4463                        break;
4464                    }
4465                }
4466            } else if (fp->fInputIdx == fAnchorLimit-2 &&
4467                inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4468                    fHitEnd = TRUE;
4469                    fRequireEnd = TRUE;
4470                    break;                         // At CR/LF at end of input.  Success
4471            }
4472
4473            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4474
4475            break;
4476
4477
4478        case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4479            if (fp->fInputIdx >= fAnchorLimit-1) {
4480                // Either at the last character of input, or off the end.
4481                if (fp->fInputIdx == fAnchorLimit-1) {
4482                    // At last char of input.  Success if it's a new line.
4483                    if (inputBuf[fp->fInputIdx] == 0x0a) {
4484                        fHitEnd = TRUE;
4485                        fRequireEnd = TRUE;
4486                        break;
4487                    }
4488                } else {
4489                    // Off the end of input.  Success.
4490                    fHitEnd = TRUE;
4491                    fRequireEnd = TRUE;
4492                    break;
4493                }
4494            }
4495
4496            // Not at end of input.  Back-track out.
4497            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4498            break;
4499
4500
4501        case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4502            {
4503                if (fp->fInputIdx >= fAnchorLimit) {
4504                    // We really are at the end of input.  Success.
4505                    fHitEnd = TRUE;
4506                    fRequireEnd = TRUE;
4507                    break;
4508                }
4509                // If we are positioned just before a new-line, succeed.
4510                // It makes no difference where the new-line is within the input.
4511                UChar32 c = inputBuf[fp->fInputIdx];
4512                if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4513                    // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4514                    //  In multi-line mode, hitting a new-line just before the end of input does not
4515                    //   set the hitEnd or requireEnd flags
4516                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4517                        break;
4518                    }
4519                }
4520                // not at a new line.  Fail.
4521                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4522            }
4523            break;
4524
4525
4526        case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4527            {
4528                if (fp->fInputIdx >= fAnchorLimit) {
4529                    // We really are at the end of input.  Success.
4530                    fHitEnd = TRUE;
4531                    fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4532                    break;               //   adding a new-line would not lose the match.
4533                }
4534                // If we are not positioned just before a new-line, the test fails; backtrack out.
4535                // It makes no difference where the new-line is within the input.
4536                if (inputBuf[fp->fInputIdx] != 0x0a) {
4537                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4538                }
4539            }
4540            break;
4541
4542
4543        case URX_CARET:                    //  ^, test for start of line
4544            if (fp->fInputIdx != fAnchorStart) {
4545                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4546            }
4547            break;
4548
4549
4550        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4551            {
4552                if (fp->fInputIdx == fAnchorStart) {
4553                    // We are at the start input.  Success.
4554                    break;
4555                }
4556                // Check whether character just before the current pos is a new-line
4557                //   unless we are at the end of input
4558                UChar  c = inputBuf[fp->fInputIdx - 1];
4559                if ((fp->fInputIdx < fAnchorLimit) &&
4560                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4561                    //  It's a new-line.  ^ is true.  Success.
4562                    //  TODO:  what should be done with positions between a CR and LF?
4563                    break;
4564                }
4565                // Not at the start of a line.  Fail.
4566                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4567            }
4568            break;
4569
4570
4571        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4572            {
4573                U_ASSERT(fp->fInputIdx >= fAnchorStart);
4574                if (fp->fInputIdx <= fAnchorStart) {
4575                    // We are at the start input.  Success.
4576                    break;
4577                }
4578                // Check whether character just before the current pos is a new-line
4579                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4580                UChar  c = inputBuf[fp->fInputIdx - 1];
4581                if (c != 0x0a) {
4582                    // Not at the start of a line.  Back-track out.
4583                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4584                }
4585            }
4586            break;
4587
4588        case URX_BACKSLASH_B:          // Test for word boundaries
4589            {
4590                UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4591                success ^= (UBool)(opValue != 0);     // flip sense for \B
4592                if (!success) {
4593                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4594                }
4595            }
4596            break;
4597
4598
4599        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4600            {
4601                UBool success = isUWordBoundary(fp->fInputIdx);
4602                success ^= (UBool)(opValue != 0);     // flip sense for \B
4603                if (!success) {
4604                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4605                }
4606            }
4607            break;
4608
4609
4610        case URX_BACKSLASH_D:            // Test for decimal digit
4611            {
4612                if (fp->fInputIdx >= fActiveLimit) {
4613                    fHitEnd = TRUE;
4614                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4615                    break;
4616                }
4617
4618                UChar32 c;
4619                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4620                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4621                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4622                success ^= (UBool)(opValue != 0);        // flip sense for \D
4623                if (!success) {
4624                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4625                }
4626            }
4627            break;
4628
4629
4630        case URX_BACKSLASH_G:          // Test for position at end of previous match
4631            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4632                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4633            }
4634            break;
4635
4636
4637        case URX_BACKSLASH_X:
4638        //  Match a Grapheme, as defined by Unicode TR 29.
4639        //  Differs slightly from Perl, which consumes combining marks independently
4640        //    of context.
4641        {
4642
4643            // Fail if at end of input
4644            if (fp->fInputIdx >= fActiveLimit) {
4645                fHitEnd = TRUE;
4646                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4647                break;
4648            }
4649
4650            // Examine (and consume) the current char.
4651            //   Dispatch into a little state machine, based on the char.
4652            UChar32  c;
4653            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4654            UnicodeSet **sets = fPattern->fStaticSets;
4655            if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
4656            if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4657            if (sets[URX_GC_L]->contains(c))       goto GC_L;
4658            if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4659            if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4660            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4661            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4662            goto GC_Extend;
4663
4664
4665
4666GC_L:
4667            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4668            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4669            if (sets[URX_GC_L]->contains(c))       goto GC_L;
4670            if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4671            if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4672            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4673            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4674            goto GC_Extend;
4675
4676GC_V:
4677            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4678            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4679            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4680            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4681            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4682            goto GC_Extend;
4683
4684GC_T:
4685            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4686            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4687            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4688            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4689            goto GC_Extend;
4690
4691GC_Extend:
4692            // Combining characters are consumed here
4693            for (;;) {
4694                if (fp->fInputIdx >= fActiveLimit) {
4695                    break;
4696                }
4697                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4698                if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4699                    U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4700                    break;
4701                }
4702            }
4703            goto GC_Done;
4704
4705GC_Control:
4706            // Most control chars stand alone (don't combine with combining chars),
4707            //   except for that CR/LF sequence is a single grapheme cluster.
4708            if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4709                fp->fInputIdx++;
4710            }
4711
4712GC_Done:
4713            if (fp->fInputIdx >= fActiveLimit) {
4714                fHitEnd = TRUE;
4715            }
4716            break;
4717        }
4718
4719
4720
4721
4722        case URX_BACKSLASH_Z:          // Test for end of Input
4723            if (fp->fInputIdx < fAnchorLimit) {
4724                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4725            } else {
4726                fHitEnd = TRUE;
4727                fRequireEnd = TRUE;
4728            }
4729            break;
4730
4731
4732
4733        case URX_STATIC_SETREF:
4734            {
4735                // Test input character against one of the predefined sets
4736                //    (Word Characters, for example)
4737                // The high bit of the op value is a flag for the match polarity.
4738                //    0:   success if input char is in set.
4739                //    1:   success if input char is not in set.
4740                if (fp->fInputIdx >= fActiveLimit) {
4741                    fHitEnd = TRUE;
4742                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4743                    break;
4744                }
4745
4746                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4747                opValue &= ~URX_NEG_SET;
4748                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4749
4750                UChar32 c;
4751                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4752                if (c < 256) {
4753                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4754                    if (s8->contains(c)) {
4755                        success = !success;
4756                    }
4757                } else {
4758                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
4759                    if (s->contains(c)) {
4760                        success = !success;
4761                    }
4762                }
4763                if (!success) {
4764                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4765                }
4766            }
4767            break;
4768
4769
4770        case URX_STAT_SETREF_N:
4771            {
4772                // Test input character for NOT being a member of  one of
4773                //    the predefined sets (Word Characters, for example)
4774                if (fp->fInputIdx >= fActiveLimit) {
4775                    fHitEnd = TRUE;
4776                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4777                    break;
4778                }
4779
4780                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4781
4782                UChar32  c;
4783                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4784                if (c < 256) {
4785                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4786                    if (s8->contains(c) == FALSE) {
4787                        break;
4788                    }
4789                } else {
4790                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
4791                    if (s->contains(c) == FALSE) {
4792                        break;
4793                    }
4794                }
4795                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4796            }
4797            break;
4798
4799
4800        case URX_SETREF:
4801            {
4802                if (fp->fInputIdx >= fActiveLimit) {
4803                    fHitEnd = TRUE;
4804                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4805                    break;
4806                }
4807
4808                U_ASSERT(opValue > 0 && opValue < sets->size());
4809
4810                // There is input left.  Pick up one char and test it for set membership.
4811                UChar32  c;
4812                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4813                if (c<256) {
4814                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4815                    if (s8->contains(c)) {
4816                        // The character is in the set.  A Match.
4817                        break;
4818                    }
4819                } else {
4820                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4821                    if (s->contains(c)) {
4822                        // The character is in the set.  A Match.
4823                        break;
4824                    }
4825                }
4826
4827                // the character wasn't in the set.
4828                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4829            }
4830            break;
4831
4832
4833        case URX_DOTANY:
4834            {
4835                // . matches anything, but stops at end-of-line.
4836                if (fp->fInputIdx >= fActiveLimit) {
4837                    // At end of input.  Match failed.  Backtrack out.
4838                    fHitEnd = TRUE;
4839                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4840                    break;
4841                }
4842
4843                // There is input left.  Advance over one char, unless we've hit end-of-line
4844                UChar32  c;
4845                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4846                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
4847                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4848                    // End of line in normal mode.   . does not match.
4849                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4850                    break;
4851                }
4852            }
4853            break;
4854
4855
4856        case URX_DOTANY_ALL:
4857            {
4858                // . in dot-matches-all (including new lines) mode
4859                if (fp->fInputIdx >= fActiveLimit) {
4860                    // At end of input.  Match failed.  Backtrack out.
4861                    fHitEnd = TRUE;
4862                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4863                    break;
4864                }
4865
4866                // There is input left.  Advance over one char, except if we are
4867                //   at a cr/lf, advance over both of them.
4868                UChar32 c;
4869                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4870                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4871                    // In the case of a CR/LF, we need to advance over both.
4872                    if (inputBuf[fp->fInputIdx] == 0x0a) {
4873                        U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4874                    }
4875                }
4876            }
4877            break;
4878
4879
4880        case URX_DOTANY_UNIX:
4881            {
4882                // '.' operator, matches all, but stops at end-of-line.
4883                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
4884                if (fp->fInputIdx >= fActiveLimit) {
4885                    // At end of input.  Match failed.  Backtrack out.
4886                    fHitEnd = TRUE;
4887                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4888                    break;
4889                }
4890
4891                // There is input left.  Advance over one char, unless we've hit end-of-line
4892                UChar32 c;
4893                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4894                if (c == 0x0a) {
4895                    // End of line in normal mode.   '.' does not match the \n
4896                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4897                }
4898            }
4899            break;
4900
4901
4902        case URX_JMP:
4903            fp->fPatIdx = opValue;
4904            break;
4905
4906        case URX_FAIL:
4907            isMatch = FALSE;
4908            goto breakFromLoop;
4909
4910        case URX_JMP_SAV:
4911            U_ASSERT(opValue < fPattern->fCompiledPat->size());
4912            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
4913            fp->fPatIdx = opValue;                         // Then JMP.
4914            break;
4915
4916        case URX_JMP_SAV_X:
4917            // This opcode is used with (x)+, when x can match a zero length string.
4918            // Same as JMP_SAV, except conditional on the match having made forward progress.
4919            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4920            //   data address of the input position at the start of the loop.
4921            {
4922                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
4923                int32_t  stoOp = (int32_t)pat[opValue-1];
4924                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4925                int32_t  frameLoc = URX_VAL(stoOp);
4926                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4927                int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4928                U_ASSERT(prevInputIdx <= fp->fInputIdx);
4929                if (prevInputIdx < fp->fInputIdx) {
4930                    // The match did make progress.  Repeat the loop.
4931                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
4932                    fp->fPatIdx = opValue;
4933                    fp->fExtra[frameLoc] = fp->fInputIdx;
4934                }
4935                // If the input position did not advance, we do nothing here,
4936                //   execution will fall out of the loop.
4937            }
4938            break;
4939
4940        case URX_CTR_INIT:
4941            {
4942                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4943                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
4944
4945                // Pick up the three extra operands that CTR_INIT has, and
4946                //    skip the pattern location counter past
4947                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4948                fp->fPatIdx += 3;
4949                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
4950                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4951                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4952                U_ASSERT(minCount>=0);
4953                U_ASSERT(maxCount>=minCount || maxCount==-1);
4954                U_ASSERT(loopLoc>fp->fPatIdx);
4955
4956                if (minCount == 0) {
4957                    fp = StateSave(fp, loopLoc+1, status);
4958                }
4959                if (maxCount == 0) {
4960                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4961                }
4962            }
4963            break;
4964
4965        case URX_CTR_LOOP:
4966            {
4967                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4968                int32_t initOp = (int32_t)pat[opValue];
4969                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
4970                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4971                int32_t minCount  = (int32_t)pat[opValue+2];
4972                int32_t maxCount  = (int32_t)pat[opValue+3];
4973                // Increment the counter.  Note: we DIDN'T worry about counter
4974                //   overflow, since the data comes from UnicodeStrings, which
4975                //   stores its length in an int32_t. Do we have to think about
4976                //   this now that we're using UText? Probably not, since the length
4977                //   in UChar32s is still an int32_t.
4978                (*pCounter)++;
4979                U_ASSERT(*pCounter > 0);
4980                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
4981                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
4982                    break;
4983                }
4984                if (*pCounter >= minCount) {
4985                    fp = StateSave(fp, fp->fPatIdx, status);
4986                }
4987                fp->fPatIdx = opValue + 4;    // Loop back.
4988            }
4989            break;
4990
4991        case URX_CTR_INIT_NG:
4992            {
4993                // Initialize a non-greedy loop
4994                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4995                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
4996
4997                // Pick up the three extra operands that CTR_INIT has, and
4998                //    skip the pattern location counter past
4999                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5000                fp->fPatIdx += 3;
5001                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5002                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5003                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5004                U_ASSERT(minCount>=0);
5005                U_ASSERT(maxCount>=minCount || maxCount==-1);
5006                U_ASSERT(loopLoc>fp->fPatIdx);
5007
5008                if (minCount == 0) {
5009                    if (maxCount != 0) {
5010                        fp = StateSave(fp, fp->fPatIdx, status);
5011                    }
5012                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5013                }
5014            }
5015            break;
5016
5017        case URX_CTR_LOOP_NG:
5018            {
5019                // Non-greedy {min, max} loops
5020                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5021                int32_t initOp = (int32_t)pat[opValue];
5022                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5023                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5024                int32_t minCount  = (int32_t)pat[opValue+2];
5025                int32_t maxCount  = (int32_t)pat[opValue+3];
5026                // Increment the counter.  Note: we DIDN'T worry about counter
5027                //   overflow, since the data comes from UnicodeStrings, which
5028                //   stores its length in an int32_t. Do we have to think about
5029                //   this now that we're using UText? Probably not, since the length
5030                //   in UChar32s is still an int32_t.
5031                (*pCounter)++;
5032                U_ASSERT(*pCounter > 0);
5033
5034                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5035                    // The loop has matched the maximum permitted number of times.
5036                    //   Break out of here with no action.  Matching will
5037                    //   continue with the following pattern.
5038                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
5039                    break;
5040                }
5041
5042                if (*pCounter < minCount) {
5043                    // We haven't met the minimum number of matches yet.
5044                    //   Loop back for another one.
5045                    fp->fPatIdx = opValue + 4;    // Loop back.
5046                } else {
5047                    // We do have the minimum number of matches.
5048                    //   Fall into the following pattern, but first do
5049                    //   a state save to the top of the loop, so that a failure
5050                    //   in the following pattern will try another iteration of the loop.
5051                    fp = StateSave(fp, opValue + 4, status);
5052                }
5053            }
5054            break;
5055
5056        case URX_STO_SP:
5057            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5058            fData[opValue] = fStack->size();
5059            break;
5060
5061        case URX_LD_SP:
5062            {
5063                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5064                int32_t newStackSize = (int32_t)fData[opValue];
5065                U_ASSERT(newStackSize <= fStack->size());
5066                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5067                if (newFP == (int64_t *)fp) {
5068                    break;
5069                }
5070                int32_t i;
5071                for (i=0; i<fFrameSize; i++) {
5072                    newFP[i] = ((int64_t *)fp)[i];
5073                }
5074                fp = (REStackFrame *)newFP;
5075                fStack->setSize(newStackSize);
5076            }
5077            break;
5078
5079        case URX_BACKREF:
5080            {
5081                U_ASSERT(opValue < fFrameSize);
5082                int64_t groupStartIdx = fp->fExtra[opValue];
5083                int64_t groupEndIdx   = fp->fExtra[opValue+1];
5084                U_ASSERT(groupStartIdx <= groupEndIdx);
5085                int64_t inputIndex = fp->fInputIdx;
5086                if (groupStartIdx < 0) {
5087                    // This capture group has not participated in the match thus far,
5088                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5089                    break;
5090                }
5091                UBool success = TRUE;
5092                for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5093                    if (inputIndex >= fActiveLimit) {
5094                        success = FALSE;
5095                        fHitEnd = TRUE;
5096                        break;
5097                    }
5098                    if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5099                        success = FALSE;
5100                        break;
5101                    }
5102                }
5103                if (success) {
5104                    fp->fInputIdx = inputIndex;
5105                } else {
5106                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5107                }
5108            }
5109            break;
5110
5111        case URX_BACKREF_I:
5112            {
5113                U_ASSERT(opValue < fFrameSize);
5114                int64_t groupStartIdx = fp->fExtra[opValue];
5115                int64_t groupEndIdx   = fp->fExtra[opValue+1];
5116                U_ASSERT(groupStartIdx <= groupEndIdx);
5117                if (groupStartIdx < 0) {
5118                    // This capture group has not participated in the match thus far,
5119                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5120                    break;
5121                }
5122                CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5123                CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5124
5125                //   Note: if the capture group match was of an empty string the backref
5126                //         match succeeds.  Verified by testing:  Perl matches succeed
5127                //         in this case, so we do too.
5128
5129                UBool success = TRUE;
5130                for (;;) {
5131                    UChar32 captureGroupChar = captureGroupItr.next();
5132                    if (captureGroupChar == U_SENTINEL) {
5133                        success = TRUE;
5134                        break;
5135                    }
5136                    UChar32 inputChar = inputItr.next();
5137                    if (inputChar == U_SENTINEL) {
5138                        success = FALSE;
5139                        fHitEnd = TRUE;
5140                        break;
5141                    }
5142                    if (inputChar != captureGroupChar) {
5143                        success = FALSE;
5144                        break;
5145                    }
5146                }
5147
5148                if (success && inputItr.inExpansion()) {
5149                    // We otained a match by consuming part of a string obtained from
5150                    // case-folding a single code point of the input text.
5151                    // This does not count as an overall match.
5152                    success = FALSE;
5153                }
5154
5155                if (success) {
5156                    fp->fInputIdx = inputItr.getIndex();
5157                } else {
5158                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5159                }
5160            }
5161            break;
5162
5163        case URX_STO_INP_LOC:
5164            {
5165                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5166                fp->fExtra[opValue] = fp->fInputIdx;
5167            }
5168            break;
5169
5170        case URX_JMPX:
5171            {
5172                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5173                fp->fPatIdx += 1;
5174                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5175                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5176                int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5177                U_ASSERT(savedInputIdx <= fp->fInputIdx);
5178                if (savedInputIdx < fp->fInputIdx) {
5179                    fp->fPatIdx = opValue;                               // JMP
5180                } else {
5181                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5182                }
5183            }
5184            break;
5185
5186        case URX_LA_START:
5187            {
5188                // Entering a lookahead block.
5189                // Save Stack Ptr, Input Pos.
5190                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5191                fData[opValue]   = fStack->size();
5192                fData[opValue+1] = fp->fInputIdx;
5193                fActiveStart     = fLookStart;          // Set the match region change for
5194                fActiveLimit     = fLookLimit;          //   transparent bounds.
5195            }
5196            break;
5197
5198        case URX_LA_END:
5199            {
5200                // Leaving a look-ahead block.
5201                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5202                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5203                int32_t stackSize = fStack->size();
5204                int32_t newStackSize = (int32_t)fData[opValue];
5205                U_ASSERT(stackSize >= newStackSize);
5206                if (stackSize > newStackSize) {
5207                    // Copy the current top frame back to the new (cut back) top frame.
5208                    //   This makes the capture groups from within the look-ahead
5209                    //   expression available.
5210                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5211                    int32_t i;
5212                    for (i=0; i<fFrameSize; i++) {
5213                        newFP[i] = ((int64_t *)fp)[i];
5214                    }
5215                    fp = (REStackFrame *)newFP;
5216                    fStack->setSize(newStackSize);
5217                }
5218                fp->fInputIdx = fData[opValue+1];
5219
5220                // Restore the active region bounds in the input string; they may have
5221                //    been changed because of transparent bounds on a Region.
5222                fActiveStart = fRegionStart;
5223                fActiveLimit = fRegionLimit;
5224            }
5225            break;
5226
5227        case URX_ONECHAR_I:
5228            if (fp->fInputIdx < fActiveLimit) {
5229                UChar32 c;
5230                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5231                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5232                    break;
5233                }
5234            } else {
5235                fHitEnd = TRUE;
5236            }
5237            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5238            break;
5239
5240        case URX_STRING_I:
5241            // Case-insensitive test input against a literal string.
5242            // Strings require two slots in the compiled pattern, one for the
5243            //   offset to the string text, and one for the length.
5244            //   The compiled string has already been case folded.
5245            {
5246                const UChar *patternString = litText + opValue;
5247
5248                op      = (int32_t)pat[fp->fPatIdx];
5249                fp->fPatIdx++;
5250                opType  = URX_TYPE(op);
5251                opValue = URX_VAL(op);
5252                U_ASSERT(opType == URX_STRING_LEN);
5253                int32_t patternStringLen = opValue;  // Length of the string from the pattern.
5254
5255                UChar32      cText;
5256                UChar32      cPattern;
5257                UBool        success = TRUE;
5258                int32_t      patternStringIdx  = 0;
5259                CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5260                while (patternStringIdx < patternStringLen) {
5261                    U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5262                    cText = inputIterator.next();
5263                    if (cText != cPattern) {
5264                        success = FALSE;
5265                        if (cText == U_SENTINEL) {
5266                            fHitEnd = TRUE;
5267                        }
5268                        break;
5269                    }
5270                }
5271                if (inputIterator.inExpansion()) {
5272                    success = FALSE;
5273                }
5274
5275                if (success) {
5276                    fp->fInputIdx = inputIterator.getIndex();
5277                } else {
5278                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5279                }
5280            }
5281            break;
5282
5283        case URX_LB_START:
5284            {
5285                // Entering a look-behind block.
5286                // Save Stack Ptr, Input Pos.
5287                //   TODO:  implement transparent bounds.  Ticket #6067
5288                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5289                fData[opValue]   = fStack->size();
5290                fData[opValue+1] = fp->fInputIdx;
5291                // Init the variable containing the start index for attempted matches.
5292                fData[opValue+2] = -1;
5293                // Save input string length, then reset to pin any matches to end at
5294                //   the current position.
5295                fData[opValue+3] = fActiveLimit;
5296                fActiveLimit     = fp->fInputIdx;
5297            }
5298            break;
5299
5300
5301        case URX_LB_CONT:
5302            {
5303                // Positive Look-Behind, at top of loop checking for matches of LB expression
5304                //    at all possible input starting positions.
5305
5306                // Fetch the min and max possible match lengths.  They are the operands
5307                //   of this op in the pattern.
5308                int32_t minML = (int32_t)pat[fp->fPatIdx++];
5309                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5310                U_ASSERT(minML <= maxML);
5311                U_ASSERT(minML >= 0);
5312
5313                // Fetch (from data) the last input index where a match was attempted.
5314                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5315                int64_t  *lbStartIdx = &fData[opValue+2];
5316                if (*lbStartIdx < 0) {
5317                    // First time through loop.
5318                    *lbStartIdx = fp->fInputIdx - minML;
5319                } else {
5320                    // 2nd through nth time through the loop.
5321                    // Back up start position for match by one.
5322                    if (*lbStartIdx == 0) {
5323                        (*lbStartIdx)--;
5324                    } else {
5325                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
5326                    }
5327                }
5328
5329                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5330                    // We have tried all potential match starting points without
5331                    //  getting a match.  Backtrack out, and out of the
5332                    //   Look Behind altogether.
5333                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5334                    int64_t restoreInputLen = fData[opValue+3];
5335                    U_ASSERT(restoreInputLen >= fActiveLimit);
5336                    U_ASSERT(restoreInputLen <= fInputLength);
5337                    fActiveLimit = restoreInputLen;
5338                    break;
5339                }
5340
5341                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5342                //      (successful match will fall off the end of the loop.)
5343                fp = StateSave(fp, fp->fPatIdx-3, status);
5344                fp->fInputIdx =  *lbStartIdx;
5345            }
5346            break;
5347
5348        case URX_LB_END:
5349            // End of a look-behind block, after a successful match.
5350            {
5351                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5352                if (fp->fInputIdx != fActiveLimit) {
5353                    //  The look-behind expression matched, but the match did not
5354                    //    extend all the way to the point that we are looking behind from.
5355                    //  FAIL out of here, which will take us back to the LB_CONT, which
5356                    //     will retry the match starting at another position or fail
5357                    //     the look-behind altogether, whichever is appropriate.
5358                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5359                    break;
5360                }
5361
5362                // Look-behind match is good.  Restore the orignal input string length,
5363                //   which had been truncated to pin the end of the lookbehind match to the
5364                //   position being looked-behind.
5365                int64_t originalInputLen = fData[opValue+3];
5366                U_ASSERT(originalInputLen >= fActiveLimit);
5367                U_ASSERT(originalInputLen <= fInputLength);
5368                fActiveLimit = originalInputLen;
5369            }
5370            break;
5371
5372
5373        case URX_LBN_CONT:
5374            {
5375                // Negative Look-Behind, at top of loop checking for matches of LB expression
5376                //    at all possible input starting positions.
5377
5378                // Fetch the extra parameters of this op.
5379                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5380                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5381                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5382                continueLoc = URX_VAL(continueLoc);
5383                U_ASSERT(minML <= maxML);
5384                U_ASSERT(minML >= 0);
5385                U_ASSERT(continueLoc > fp->fPatIdx);
5386
5387                // Fetch (from data) the last input index where a match was attempted.
5388                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5389                int64_t  *lbStartIdx = &fData[opValue+2];
5390                if (*lbStartIdx < 0) {
5391                    // First time through loop.
5392                    *lbStartIdx = fp->fInputIdx - minML;
5393                } else {
5394                    // 2nd through nth time through the loop.
5395                    // Back up start position for match by one.
5396                    if (*lbStartIdx == 0) {
5397                        (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
5398                    } else {
5399                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
5400                    }
5401                }
5402
5403                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5404                    // We have tried all potential match starting points without
5405                    //  getting a match, which means that the negative lookbehind as
5406                    //  a whole has succeeded.  Jump forward to the continue location
5407                    int64_t restoreInputLen = fData[opValue+3];
5408                    U_ASSERT(restoreInputLen >= fActiveLimit);
5409                    U_ASSERT(restoreInputLen <= fInputLength);
5410                    fActiveLimit = restoreInputLen;
5411                    fp->fPatIdx = continueLoc;
5412                    break;
5413                }
5414
5415                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5416                //      (successful match will cause a FAIL out of the loop altogether.)
5417                fp = StateSave(fp, fp->fPatIdx-4, status);
5418                fp->fInputIdx =  *lbStartIdx;
5419            }
5420            break;
5421
5422        case URX_LBN_END:
5423            // End of a negative look-behind block, after a successful match.
5424            {
5425                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5426                if (fp->fInputIdx != fActiveLimit) {
5427                    //  The look-behind expression matched, but the match did not
5428                    //    extend all the way to the point that we are looking behind from.
5429                    //  FAIL out of here, which will take us back to the LB_CONT, which
5430                    //     will retry the match starting at another position or succeed
5431                    //     the look-behind altogether, whichever is appropriate.
5432                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5433                    break;
5434                }
5435
5436                // Look-behind expression matched, which means look-behind test as
5437                //   a whole Fails
5438
5439                //   Restore the orignal input string length, which had been truncated
5440                //   inorder to pin the end of the lookbehind match
5441                //   to the position being looked-behind.
5442                int64_t originalInputLen = fData[opValue+3];
5443                U_ASSERT(originalInputLen >= fActiveLimit);
5444                U_ASSERT(originalInputLen <= fInputLength);
5445                fActiveLimit = originalInputLen;
5446
5447                // Restore original stack position, discarding any state saved
5448                //   by the successful pattern match.
5449                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5450                int32_t newStackSize = (int32_t)fData[opValue];
5451                U_ASSERT(fStack->size() > newStackSize);
5452                fStack->setSize(newStackSize);
5453
5454                //  FAIL, which will take control back to someplace
5455                //  prior to entering the look-behind test.
5456                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5457            }
5458            break;
5459
5460
5461        case URX_LOOP_SR_I:
5462            // Loop Initialization for the optimized implementation of
5463            //     [some character set]*
5464            //   This op scans through all matching input.
5465            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5466            {
5467                U_ASSERT(opValue > 0 && opValue < sets->size());
5468                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5469                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
5470
5471                // Loop through input, until either the input is exhausted or
5472                //   we reach a character that is not a member of the set.
5473                int32_t ix = (int32_t)fp->fInputIdx;
5474                for (;;) {
5475                    if (ix >= fActiveLimit) {
5476                        fHitEnd = TRUE;
5477                        break;
5478                    }
5479                    UChar32   c;
5480                    U16_NEXT(inputBuf, ix, fActiveLimit, c);
5481                    if (c<256) {
5482                        if (s8->contains(c) == FALSE) {
5483                            U16_BACK_1(inputBuf, 0, ix);
5484                            break;
5485                        }
5486                    } else {
5487                        if (s->contains(c) == FALSE) {
5488                            U16_BACK_1(inputBuf, 0, ix);
5489                            break;
5490                        }
5491                    }
5492                }
5493
5494                // If there were no matching characters, skip over the loop altogether.
5495                //   The loop doesn't run at all, a * op always succeeds.
5496                if (ix == fp->fInputIdx) {
5497                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
5498                    break;
5499                }
5500
5501                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5502                //   must follow.  It's operand is the stack location
5503                //   that holds the starting input index for the match of this [set]*
5504                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5505                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5506                int32_t stackLoc = URX_VAL(loopcOp);
5507                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5508                fp->fExtra[stackLoc] = fp->fInputIdx;
5509                fp->fInputIdx = ix;
5510
5511                // Save State to the URX_LOOP_C op that follows this one,
5512                //   so that match failures in the following code will return to there.
5513                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5514                fp = StateSave(fp, fp->fPatIdx, status);
5515                fp->fPatIdx++;
5516            }
5517            break;
5518
5519
5520        case URX_LOOP_DOT_I:
5521            // Loop Initialization for the optimized implementation of .*
5522            //   This op scans through all remaining input.
5523            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5524            {
5525                // Loop through input until the input is exhausted (we reach an end-of-line)
5526                // In DOTALL mode, we can just go straight to the end of the input.
5527                int32_t ix;
5528                if ((opValue & 1) == 1) {
5529                    // Dot-matches-All mode.  Jump straight to the end of the string.
5530                    ix = (int32_t)fActiveLimit;
5531                    fHitEnd = TRUE;
5532                } else {
5533                    // NOT DOT ALL mode.  Line endings do not match '.'
5534                    // Scan forward until a line ending or end of input.
5535                    ix = (int32_t)fp->fInputIdx;
5536                    for (;;) {
5537                        if (ix >= fActiveLimit) {
5538                            fHitEnd = TRUE;
5539                            break;
5540                        }
5541                        UChar32   c;
5542                        U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
5543                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
5544                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
5545                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
5546                                   ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
5547                                //  char is a line ending.  Put the input pos back to the
5548                                //    line ending char, and exit the scanning loop.
5549                                U16_BACK_1(inputBuf, 0, ix);
5550                                break;
5551                            }
5552                        }
5553                    }
5554                }
5555
5556                // If there were no matching characters, skip over the loop altogether.
5557                //   The loop doesn't run at all, a * op always succeeds.
5558                if (ix == fp->fInputIdx) {
5559                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
5560                    break;
5561                }
5562
5563                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5564                //   must follow.  It's operand is the stack location
5565                //   that holds the starting input index for the match of this .*
5566                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5567                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5568                int32_t stackLoc = URX_VAL(loopcOp);
5569                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5570                fp->fExtra[stackLoc] = fp->fInputIdx;
5571                fp->fInputIdx = ix;
5572
5573                // Save State to the URX_LOOP_C op that follows this one,
5574                //   so that match failures in the following code will return to there.
5575                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5576                fp = StateSave(fp, fp->fPatIdx, status);
5577                fp->fPatIdx++;
5578            }
5579            break;
5580
5581
5582        case URX_LOOP_C:
5583            {
5584                U_ASSERT(opValue>=0 && opValue<fFrameSize);
5585                backSearchIndex = (int32_t)fp->fExtra[opValue];
5586                U_ASSERT(backSearchIndex <= fp->fInputIdx);
5587                if (backSearchIndex == fp->fInputIdx) {
5588                    // We've backed up the input idx to the point that the loop started.
5589                    // The loop is done.  Leave here without saving state.
5590                    //  Subsequent failures won't come back here.
5591                    break;
5592                }
5593                // Set up for the next iteration of the loop, with input index
5594                //   backed up by one from the last time through,
5595                //   and a state save to this instruction in case the following code fails again.
5596                //   (We're going backwards because this loop emulates stack unwinding, not
5597                //    the initial scan forward.)
5598                U_ASSERT(fp->fInputIdx > 0);
5599                UChar32 prevC;
5600                U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5601
5602                if (prevC == 0x0a &&
5603                    fp->fInputIdx > backSearchIndex &&
5604                    inputBuf[fp->fInputIdx-1] == 0x0d) {
5605                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5606                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5607                        // .*, stepping back over CRLF pair.
5608                        U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5609                    }
5610                }
5611
5612
5613                fp = StateSave(fp, fp->fPatIdx-1, status);
5614            }
5615            break;
5616
5617
5618
5619        default:
5620            // Trouble.  The compiled pattern contains an entry with an
5621            //           unrecognized type tag.
5622            U_ASSERT(FALSE);
5623        }
5624
5625        if (U_FAILURE(status)) {
5626            isMatch = FALSE;
5627            break;
5628        }
5629    }
5630
5631breakFromLoop:
5632    fMatch = isMatch;
5633    if (isMatch) {
5634        fLastMatchEnd = fMatchEnd;
5635        fMatchStart   = startIdx;
5636        fMatchEnd     = fp->fInputIdx;
5637        if (fTraceDebug) {
5638            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
5639        }
5640    }
5641    else
5642    {
5643        if (fTraceDebug) {
5644            REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
5645        }
5646    }
5647
5648    fFrame = fp;                // The active stack frame when the engine stopped.
5649    //   Contains the capture group results that we need to
5650    //    access later.
5651
5652    return;
5653}
5654
5655
5656UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5657
5658U_NAMESPACE_END
5659
5660#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
5661