1/*
2 * Copyright (C) 2009, 2013 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "YarrPattern.h"
29
30#include "Yarr.h"
31#include "YarrCanonicalizeUCS2.h"
32#include "YarrParser.h"
33#include <wtf/Vector.h>
34
35using namespace WTF;
36
37namespace JSC { namespace Yarr {
38
39#include "RegExpJitTables.h"
40
41class CharacterClassConstructor {
42public:
43    CharacterClassConstructor(bool isCaseInsensitive = false)
44        : m_isCaseInsensitive(isCaseInsensitive)
45    {
46    }
47
48    void reset()
49    {
50        m_matches.clear();
51        m_ranges.clear();
52        m_matchesUnicode.clear();
53        m_rangesUnicode.clear();
54    }
55
56    void append(const CharacterClass* other)
57    {
58        for (size_t i = 0; i < other->m_matches.size(); ++i)
59            addSorted(m_matches, other->m_matches[i]);
60        for (size_t i = 0; i < other->m_ranges.size(); ++i)
61            addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
62        for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
63            addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
64        for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
65            addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
66    }
67
68    void putChar(UChar ch)
69    {
70        // Handle ascii cases.
71        if (ch <= 0x7f) {
72            if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
73                addSorted(m_matches, toASCIIUpper(ch));
74                addSorted(m_matches, toASCIILower(ch));
75            } else
76                addSorted(m_matches, ch);
77            return;
78        }
79
80        // Simple case, not a case-insensitive match.
81        if (!m_isCaseInsensitive) {
82            addSorted(m_matchesUnicode, ch);
83            return;
84        }
85
86        // Add multiple matches, if necessary.
87        const UCS2CanonicalizationRange* info = rangeInfoFor(ch);
88        if (info->type == CanonicalizeUnique)
89            addSorted(m_matchesUnicode, ch);
90        else
91            putUnicodeIgnoreCase(ch, info);
92    }
93
94    void putUnicodeIgnoreCase(UChar ch, const UCS2CanonicalizationRange* info)
95    {
96        ASSERT(m_isCaseInsensitive);
97        ASSERT(ch > 0x7f);
98        ASSERT(ch >= info->begin && ch <= info->end);
99        ASSERT(info->type != CanonicalizeUnique);
100        if (info->type == CanonicalizeSet) {
101            for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
102                addSorted(m_matchesUnicode, ch);
103        } else {
104            addSorted(m_matchesUnicode, ch);
105            addSorted(m_matchesUnicode, getCanonicalPair(info, ch));
106        }
107    }
108
109    void putRange(UChar lo, UChar hi)
110    {
111        if (lo <= 0x7f) {
112            char asciiLo = lo;
113            char asciiHi = std::min(hi, (UChar)0x7f);
114            addSortedRange(m_ranges, lo, asciiHi);
115
116            if (m_isCaseInsensitive) {
117                if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
118                    addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
119                if ((asciiLo <= 'z') && (asciiHi >= 'a'))
120                    addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
121            }
122        }
123        if (hi <= 0x7f)
124            return;
125
126        lo = std::max(lo, (UChar)0x80);
127        addSortedRange(m_rangesUnicode, lo, hi);
128
129        if (!m_isCaseInsensitive)
130            return;
131
132        const UCS2CanonicalizationRange* info = rangeInfoFor(lo);
133        while (true) {
134            // Handle the range [lo .. end]
135            UChar end = std::min<UChar>(info->end, hi);
136
137            switch (info->type) {
138            case CanonicalizeUnique:
139                // Nothing to do - no canonical equivalents.
140                break;
141            case CanonicalizeSet: {
142                UChar ch;
143                for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
144                    addSorted(m_matchesUnicode, ch);
145                break;
146            }
147            case CanonicalizeRangeLo:
148                addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
149                break;
150            case CanonicalizeRangeHi:
151                addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
152                break;
153            case CanonicalizeAlternatingAligned:
154                // Use addSortedRange since there is likely an abutting range to combine with.
155                if (lo & 1)
156                    addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
157                if (!(end & 1))
158                    addSortedRange(m_rangesUnicode, end + 1, end + 1);
159                break;
160            case CanonicalizeAlternatingUnaligned:
161                // Use addSortedRange since there is likely an abutting range to combine with.
162                if (!(lo & 1))
163                    addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
164                if (end & 1)
165                    addSortedRange(m_rangesUnicode, end + 1, end + 1);
166                break;
167            }
168
169            if (hi == end)
170                return;
171
172            ++info;
173            lo = info->begin;
174        };
175
176    }
177
178    PassOwnPtr<CharacterClass> charClass()
179    {
180        OwnPtr<CharacterClass> characterClass = adoptPtr(new CharacterClass);
181
182        characterClass->m_matches.swap(m_matches);
183        characterClass->m_ranges.swap(m_ranges);
184        characterClass->m_matchesUnicode.swap(m_matchesUnicode);
185        characterClass->m_rangesUnicode.swap(m_rangesUnicode);
186
187        return characterClass.release();
188    }
189
190private:
191    void addSorted(Vector<UChar>& matches, UChar ch)
192    {
193        unsigned pos = 0;
194        unsigned range = matches.size();
195
196        // binary chop, find position to insert char.
197        while (range) {
198            unsigned index = range >> 1;
199
200            int val = matches[pos+index] - ch;
201            if (!val)
202                return;
203            else if (val > 0)
204                range = index;
205            else {
206                pos += (index+1);
207                range -= (index+1);
208            }
209        }
210
211        if (pos == matches.size())
212            matches.append(ch);
213        else
214            matches.insert(pos, ch);
215    }
216
217    void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
218    {
219        unsigned end = ranges.size();
220
221        // Simple linear scan - I doubt there are that many ranges anyway...
222        // feel free to fix this with something faster (eg binary chop).
223        for (unsigned i = 0; i < end; ++i) {
224            // does the new range fall before the current position in the array
225            if (hi < ranges[i].begin) {
226                // optional optimization: concatenate appending ranges? - may not be worthwhile.
227                if (hi == (ranges[i].begin - 1)) {
228                    ranges[i].begin = lo;
229                    return;
230                }
231                ranges.insert(i, CharacterRange(lo, hi));
232                return;
233            }
234            // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
235            // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
236            // end of the last range they concatenate, which is just as good.
237            if (lo <= (ranges[i].end + 1)) {
238                // found an intersect! we'll replace this entry in the array.
239                ranges[i].begin = std::min(ranges[i].begin, lo);
240                ranges[i].end = std::max(ranges[i].end, hi);
241
242                // now check if the new range can subsume any subsequent ranges.
243                unsigned next = i+1;
244                // each iteration of the loop we will either remove something from the list, or break the loop.
245                while (next < ranges.size()) {
246                    if (ranges[next].begin <= (ranges[i].end + 1)) {
247                        // the next entry now overlaps / concatenates this one.
248                        ranges[i].end = std::max(ranges[i].end, ranges[next].end);
249                        ranges.remove(next);
250                    } else
251                        break;
252                }
253
254                return;
255            }
256        }
257
258        // CharacterRange comes after all existing ranges.
259        ranges.append(CharacterRange(lo, hi));
260    }
261
262    bool m_isCaseInsensitive;
263
264    Vector<UChar> m_matches;
265    Vector<CharacterRange> m_ranges;
266    Vector<UChar> m_matchesUnicode;
267    Vector<CharacterRange> m_rangesUnicode;
268};
269
270class YarrPatternConstructor {
271public:
272    YarrPatternConstructor(YarrPattern& pattern)
273        : m_pattern(pattern)
274        , m_characterClassConstructor(pattern.m_ignoreCase)
275        , m_invertParentheticalAssertion(false)
276    {
277        OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction);
278        m_pattern.m_body = body.get();
279        m_alternative = body->addNewAlternative();
280        m_pattern.m_disjunctions.append(body.release());
281    }
282
283    ~YarrPatternConstructor()
284    {
285    }
286
287    void reset()
288    {
289        m_pattern.reset();
290        m_characterClassConstructor.reset();
291
292        OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction);
293        m_pattern.m_body = body.get();
294        m_alternative = body->addNewAlternative();
295        m_pattern.m_disjunctions.append(body.release());
296    }
297
298    void assertionBOL()
299    {
300        if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) {
301            m_alternative->m_startsWithBOL = true;
302            m_alternative->m_containsBOL = true;
303            m_pattern.m_containsBOL = true;
304        }
305        m_alternative->m_terms.append(PatternTerm::BOL());
306    }
307    void assertionEOL()
308    {
309        m_alternative->m_terms.append(PatternTerm::EOL());
310    }
311    void assertionWordBoundary(bool invert)
312    {
313        m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
314    }
315
316    void atomPatternCharacter(UChar ch)
317    {
318        // We handle case-insensitive checking of unicode characters which do have both
319        // cases by handling them as if they were defined using a CharacterClass.
320        if (!m_pattern.m_ignoreCase || isASCII(ch)) {
321            m_alternative->m_terms.append(PatternTerm(ch));
322            return;
323        }
324
325        const UCS2CanonicalizationRange* info = rangeInfoFor(ch);
326        if (info->type == CanonicalizeUnique) {
327            m_alternative->m_terms.append(PatternTerm(ch));
328            return;
329        }
330
331        m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
332        OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass();
333        m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false));
334        m_pattern.m_userCharacterClasses.append(newCharacterClass.release());
335    }
336
337    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
338    {
339        switch (classID) {
340        case DigitClassID:
341            m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
342            break;
343        case SpaceClassID:
344            m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
345            break;
346        case WordClassID:
347            m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
348            break;
349        case NewlineClassID:
350            m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
351            break;
352        }
353    }
354
355    void atomCharacterClassBegin(bool invert = false)
356    {
357        m_invertCharacterClass = invert;
358    }
359
360    void atomCharacterClassAtom(UChar ch)
361    {
362        m_characterClassConstructor.putChar(ch);
363    }
364
365    void atomCharacterClassRange(UChar begin, UChar end)
366    {
367        m_characterClassConstructor.putRange(begin, end);
368    }
369
370    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
371    {
372        ASSERT(classID != NewlineClassID);
373
374        switch (classID) {
375        case DigitClassID:
376            m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
377            break;
378
379        case SpaceClassID:
380            m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
381            break;
382
383        case WordClassID:
384            m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
385            break;
386
387        default:
388            RELEASE_ASSERT_NOT_REACHED();
389        }
390    }
391
392    void atomCharacterClassEnd()
393    {
394        OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass();
395        m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass));
396        m_pattern.m_userCharacterClasses.append(newCharacterClass.release());
397    }
398
399    void atomParenthesesSubpatternBegin(bool capture = true)
400    {
401        unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
402        if (capture)
403            m_pattern.m_numSubpatterns++;
404
405        OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative));
406        m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false));
407        m_alternative = parenthesesDisjunction->addNewAlternative();
408        m_pattern.m_disjunctions.append(parenthesesDisjunction.release());
409    }
410
411    void atomParentheticalAssertionBegin(bool invert = false)
412    {
413        OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative));
414        m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert));
415        m_alternative = parenthesesDisjunction->addNewAlternative();
416        m_invertParentheticalAssertion = invert;
417        m_pattern.m_disjunctions.append(parenthesesDisjunction.release());
418    }
419
420    void atomParenthesesEnd()
421    {
422        ASSERT(m_alternative->m_parent);
423        ASSERT(m_alternative->m_parent->m_parent);
424
425        PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
426        m_alternative = m_alternative->m_parent->m_parent;
427
428        PatternTerm& lastTerm = m_alternative->lastTerm();
429
430        unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
431        unsigned numBOLAnchoredAlts = 0;
432
433        for (unsigned i = 0; i < numParenAlternatives; i++) {
434            // Bubble up BOL flags
435            if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
436                numBOLAnchoredAlts++;
437        }
438
439        if (numBOLAnchoredAlts) {
440            m_alternative->m_containsBOL = true;
441            // If all the alternatives in parens start with BOL, then so does this one
442            if (numBOLAnchoredAlts == numParenAlternatives)
443                m_alternative->m_startsWithBOL = true;
444        }
445
446        lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
447        m_invertParentheticalAssertion = false;
448    }
449
450    void atomBackReference(unsigned subpatternId)
451    {
452        ASSERT(subpatternId);
453        m_pattern.m_containsBackreferences = true;
454        m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
455
456        if (subpatternId > m_pattern.m_numSubpatterns) {
457            m_alternative->m_terms.append(PatternTerm::ForwardReference());
458            return;
459        }
460
461        PatternAlternative* currentAlternative = m_alternative;
462        ASSERT(currentAlternative);
463
464        // Note to self: if we waited until the AST was baked, we could also remove forwards refs
465        while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
466            PatternTerm& term = currentAlternative->lastTerm();
467            ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
468
469            if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
470                m_alternative->m_terms.append(PatternTerm::ForwardReference());
471                return;
472            }
473        }
474
475        m_alternative->m_terms.append(PatternTerm(subpatternId));
476    }
477
478    // deep copy the argument disjunction.  If filterStartsWithBOL is true,
479    // skip alternatives with m_startsWithBOL set true.
480    PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
481    {
482        OwnPtr<PatternDisjunction> newDisjunction;
483        for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
484            PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
485            if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
486                if (!newDisjunction) {
487                    newDisjunction = adoptPtr(new PatternDisjunction());
488                    newDisjunction->m_parent = disjunction->m_parent;
489                }
490                PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
491                newAlternative->m_terms.reserveInitialCapacity(alternative->m_terms.size());
492                for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
493                    newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
494            }
495        }
496
497        if (!newDisjunction)
498            return 0;
499
500        PatternDisjunction* copiedDisjunction = newDisjunction.get();
501        m_pattern.m_disjunctions.append(newDisjunction.release());
502        return copiedDisjunction;
503    }
504
505    PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
506    {
507        if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
508            return PatternTerm(term);
509
510        PatternTerm termCopy = term;
511        termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
512        return termCopy;
513    }
514
515    void quantifyAtom(unsigned min, unsigned max, bool greedy)
516    {
517        ASSERT(min <= max);
518        ASSERT(m_alternative->m_terms.size());
519
520        if (!max) {
521            m_alternative->removeLastTerm();
522            return;
523        }
524
525        PatternTerm& term = m_alternative->lastTerm();
526        ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
527        ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
528
529        if (term.type == PatternTerm::TypeParentheticalAssertion) {
530            // If an assertion is quantified with a minimum count of zero, it can simply be removed.
531            // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
532            // results in any input being consumed, however the continuation passed to the assertion
533            // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
534            // reject all zero length matches (see step 2.1). A match from the continuation of the
535            // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
536            // this is that matches from the assertion are not required, and won't be accepted anyway,
537            // so no need to ever run it.
538            if (!min)
539                m_alternative->removeLastTerm();
540            // We never need to run an assertion more than once. Subsequent interations will be run
541            // with the same start index (since assertions are non-capturing) and the same captures
542            // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
543            // same result and captures. If the first match succeeds then the subsequent (min - 1)
544            // matches will too. Any additional optional matches will fail (on the same basis as the
545            // minimum zero quantified assertions, above), but this will still result in a match.
546            return;
547        }
548
549        if (min == 0)
550            term.quantify(max, greedy   ? QuantifierGreedy : QuantifierNonGreedy);
551        else if (min == max)
552            term.quantify(min, QuantifierFixedCount);
553        else {
554            term.quantify(min, QuantifierFixedCount);
555            m_alternative->m_terms.append(copyTerm(term));
556            // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
557            m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
558            if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
559                m_alternative->lastTerm().parentheses.isCopy = true;
560        }
561    }
562
563    void disjunction()
564    {
565        m_alternative = m_alternative->m_parent->addNewAlternative();
566    }
567
568    unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
569    {
570        alternative->m_hasFixedSize = true;
571        Checked<unsigned> currentInputPosition = initialInputPosition;
572
573        for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
574            PatternTerm& term = alternative->m_terms[i];
575
576            switch (term.type) {
577            case PatternTerm::TypeAssertionBOL:
578            case PatternTerm::TypeAssertionEOL:
579            case PatternTerm::TypeAssertionWordBoundary:
580                term.inputPosition = currentInputPosition.unsafeGet();
581                break;
582
583            case PatternTerm::TypeBackReference:
584                term.inputPosition = currentInputPosition.unsafeGet();
585                term.frameLocation = currentCallFrameSize;
586                currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
587                alternative->m_hasFixedSize = false;
588                break;
589
590            case PatternTerm::TypeForwardReference:
591                break;
592
593            case PatternTerm::TypePatternCharacter:
594                term.inputPosition = currentInputPosition.unsafeGet();
595                if (term.quantityType != QuantifierFixedCount) {
596                    term.frameLocation = currentCallFrameSize;
597                    currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
598                    alternative->m_hasFixedSize = false;
599                } else
600                    currentInputPosition += term.quantityCount;
601                break;
602
603            case PatternTerm::TypeCharacterClass:
604                term.inputPosition = currentInputPosition.unsafeGet();
605                if (term.quantityType != QuantifierFixedCount) {
606                    term.frameLocation = currentCallFrameSize;
607                    currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
608                    alternative->m_hasFixedSize = false;
609                } else
610                    currentInputPosition += term.quantityCount;
611                break;
612
613            case PatternTerm::TypeParenthesesSubpattern:
614                // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
615                term.frameLocation = currentCallFrameSize;
616                if (term.quantityCount == 1 && !term.parentheses.isCopy) {
617                    if (term.quantityType != QuantifierFixedCount)
618                        currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
619                    currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
620                    // If quantity is fixed, then pre-check its minimum size.
621                    if (term.quantityType == QuantifierFixedCount)
622                        currentInputPosition += term.parentheses.disjunction->m_minimumSize;
623                    term.inputPosition = currentInputPosition.unsafeGet();
624                } else if (term.parentheses.isTerminal) {
625                    currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
626                    currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
627                    term.inputPosition = currentInputPosition.unsafeGet();
628                } else {
629                    term.inputPosition = currentInputPosition.unsafeGet();
630                    setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet());
631                    currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
632                }
633                // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
634                alternative->m_hasFixedSize = false;
635                break;
636
637            case PatternTerm::TypeParentheticalAssertion:
638                term.inputPosition = currentInputPosition.unsafeGet();
639                term.frameLocation = currentCallFrameSize;
640                currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet());
641                break;
642
643            case PatternTerm::TypeDotStarEnclosure:
644                alternative->m_hasFixedSize = false;
645                term.inputPosition = initialInputPosition;
646                break;
647            }
648        }
649
650        alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
651        return currentCallFrameSize;
652    }
653
654    unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
655    {
656        if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
657            initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
658
659        unsigned minimumInputSize = UINT_MAX;
660        unsigned maximumCallFrameSize = 0;
661        bool hasFixedSize = true;
662
663        for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
664            PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
665            unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
666            minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
667            maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
668            hasFixedSize &= alternative->m_hasFixedSize;
669            if (alternative->m_minimumSize > INT_MAX)
670                m_pattern.m_containsUnsignedLengthPattern = true;
671        }
672
673        ASSERT(minimumInputSize != UINT_MAX);
674        ASSERT(maximumCallFrameSize >= initialCallFrameSize);
675
676        disjunction->m_hasFixedSize = hasFixedSize;
677        disjunction->m_minimumSize = minimumInputSize;
678        disjunction->m_callFrameSize = maximumCallFrameSize;
679        return maximumCallFrameSize;
680    }
681
682    void setupOffsets()
683    {
684        setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
685    }
686
687    // This optimization identifies sets of parentheses that we will never need to backtrack.
688    // In these cases we do not need to store state from prior iterations.
689    // We can presently avoid backtracking for:
690    //   * where the parens are at the end of the regular expression (last term in any of the
691    //     alternatives of the main body disjunction).
692    //   * where the parens are non-capturing, and quantified unbounded greedy (*).
693    //   * where the parens do not contain any capturing subpatterns.
694    void checkForTerminalParentheses()
695    {
696        // This check is much too crude; should be just checking whether the candidate
697        // node contains nested capturing subpatterns, not the whole expression!
698        if (m_pattern.m_numSubpatterns)
699            return;
700
701        Vector<OwnPtr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
702        for (size_t i = 0; i < alternatives.size(); ++i) {
703            Vector<PatternTerm>& terms = alternatives[i]->m_terms;
704            if (terms.size()) {
705                PatternTerm& term = terms.last();
706                if (term.type == PatternTerm::TypeParenthesesSubpattern
707                    && term.quantityType == QuantifierGreedy
708                    && term.quantityCount == quantifyInfinite
709                    && !term.capture())
710                    term.parentheses.isTerminal = true;
711            }
712        }
713    }
714
715    void optimizeBOL()
716    {
717        // Look for expressions containing beginning of line (^) anchoring and unroll them.
718        // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
719        // This code relies on the parsing code tagging alternatives with m_containsBOL and
720        // m_startsWithBOL and rolling those up to containing alternatives.
721        // At this point, this is only valid for non-multiline expressions.
722        PatternDisjunction* disjunction = m_pattern.m_body;
723
724        if (!m_pattern.m_containsBOL || m_pattern.m_multiline)
725            return;
726
727        PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
728
729        // Set alternatives in disjunction to "onceThrough"
730        for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
731            disjunction->m_alternatives[alt]->setOnceThrough();
732
733        if (loopDisjunction) {
734            // Move alternatives from loopDisjunction to disjunction
735            for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
736                disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt].release());
737
738            loopDisjunction->m_alternatives.clear();
739        }
740    }
741
742    bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex)
743    {
744        Vector<PatternTerm>& terms = alternative->m_terms;
745
746        for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) {
747            PatternTerm& term = terms[termIndex];
748
749            if (term.m_capture)
750                return true;
751
752            if (term.type == PatternTerm::TypeParenthesesSubpattern) {
753                PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
754                for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
755                    if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1))
756                        return true;
757                }
758            }
759        }
760
761        return false;
762    }
763
764    // This optimization identifies alternatives in the form of
765    // [^].*[?]<expression>.*[$] for expressions that don't have any
766    // capturing terms. The alternative is changed to <expression>
767    // followed by processing of the dot stars to find and adjust the
768    // beginning and the end of the match.
769    void optimizeDotStarWrappedExpressions()
770    {
771        Vector<OwnPtr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
772        if (alternatives.size() != 1)
773            return;
774
775        PatternAlternative* alternative = alternatives[0].get();
776        Vector<PatternTerm>& terms = alternative->m_terms;
777        if (terms.size() >= 3) {
778            bool startsWithBOL = false;
779            bool endsWithEOL = false;
780            size_t termIndex, firstExpressionTerm, lastExpressionTerm;
781
782            termIndex = 0;
783            if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
784                startsWithBOL = true;
785                ++termIndex;
786            }
787
788            PatternTerm& firstNonAnchorTerm = terms[termIndex];
789            if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy)))
790                return;
791
792            firstExpressionTerm = termIndex + 1;
793
794            termIndex = terms.size() - 1;
795            if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) {
796                endsWithEOL = true;
797                --termIndex;
798            }
799
800            PatternTerm& lastNonAnchorTerm = terms[termIndex];
801            if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy))
802                return;
803
804            lastExpressionTerm = termIndex - 1;
805
806            if (firstExpressionTerm > lastExpressionTerm)
807                return;
808
809            if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) {
810                for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex)
811                    terms.remove(termIndex);
812
813                for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex)
814                    terms.remove(termIndex - 1);
815
816                terms.append(PatternTerm(startsWithBOL, endsWithEOL));
817
818                m_pattern.m_containsBOL = false;
819            }
820        }
821    }
822
823private:
824    YarrPattern& m_pattern;
825    PatternAlternative* m_alternative;
826    CharacterClassConstructor m_characterClassConstructor;
827    bool m_invertCharacterClass;
828    bool m_invertParentheticalAssertion;
829};
830
831const char* YarrPattern::compile(const String& patternString)
832{
833    YarrPatternConstructor constructor(*this);
834
835    if (const char* error = parse(constructor, patternString))
836        return error;
837
838    // If the pattern contains illegal backreferences reset & reparse.
839    // Quoting Netscape's "What's new in JavaScript 1.2",
840    //      "Note: if the number of left parentheses is less than the number specified
841    //       in \#, the \# is taken as an octal escape as described in the next row."
842    if (containsIllegalBackReference()) {
843        unsigned numSubpatterns = m_numSubpatterns;
844
845        constructor.reset();
846#if !ASSERT_DISABLED
847        const char* error =
848#endif
849            parse(constructor, patternString, numSubpatterns);
850
851        ASSERT(!error);
852        ASSERT(numSubpatterns == m_numSubpatterns);
853    }
854
855    constructor.checkForTerminalParentheses();
856    constructor.optimizeDotStarWrappedExpressions();
857    constructor.optimizeBOL();
858
859    constructor.setupOffsets();
860
861    return 0;
862}
863
864YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error)
865    : m_ignoreCase(ignoreCase)
866    , m_multiline(multiline)
867    , m_containsBackreferences(false)
868    , m_containsBOL(false)
869    , m_containsUnsignedLengthPattern(false)
870    , m_numSubpatterns(0)
871    , m_maxBackReference(0)
872    , newlineCached(0)
873    , digitsCached(0)
874    , spacesCached(0)
875    , wordcharCached(0)
876    , nondigitsCached(0)
877    , nonspacesCached(0)
878    , nonwordcharCached(0)
879{
880    *error = compile(pattern);
881}
882
883} }
884