1/*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#ifndef YarrParser_h
27#define YarrParser_h
28
29#include "Yarr.h"
30#include <wtf/ASCIICType.h>
31#include <wtf/text/WTFString.h>
32
33namespace JSC { namespace Yarr {
34
35#define REGEXP_ERROR_PREFIX "Invalid regular expression: "
36
// Identifies a built-in character class when one is reported to a delegate.
// The parser uses Digit/Space/Word for the \d \D, \s \S and \w \W escapes,
// and reports '.' as the inverted Newline class.
enum BuiltInCharacterClassID {
    DigitClassID = 0,
    SpaceClassID = 1,
    WordClassID = 2,
    NewlineClassID = 3
};
43
44// The Parser class should not be used directly - only via the Yarr::parse() method.
45template<class Delegate, typename CharType>
46class Parser {
47private:
48    template<class FriendDelegate>
49    friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
50
51    enum ErrorCode {
52        NoError,
53        PatternTooLarge,
54        QuantifierOutOfOrder,
55        QuantifierWithoutAtom,
56        QuantifierTooLarge,
57        MissingParentheses,
58        ParenthesesUnmatched,
59        ParenthesesTypeInvalid,
60        CharacterClassUnmatched,
61        CharacterClassOutOfOrder,
62        EscapeUnterminated,
63        NumberOfErrorCodes
64    };
65
66    /*
67     * CharacterClassParserDelegate:
68     *
69     * The class CharacterClassParserDelegate is used in the parsing of character
70     * classes.  This class handles detection of character ranges.  This class
71     * implements enough of the delegate interface such that it can be passed to
72     * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
73     * to perform the parsing of escape characters in character sets.
74     */
75    class CharacterClassParserDelegate {
76    public:
77        CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
78            : m_delegate(delegate)
79            , m_err(err)
80            , m_state(Empty)
81            , m_character(0)
82        {
83        }
84
85        /*
86         * begin():
87         *
88         * Called at beginning of construction.
89         */
90        void begin(bool invert)
91        {
92            m_delegate.atomCharacterClassBegin(invert);
93        }
94
95        /*
96         * atomPatternCharacter():
97         *
98         * This method is called either from parseCharacterClass() (for an unescaped
99         * character in a character class), or from parseEscape(). In the former case
100         * the value true will be passed for the argument 'hyphenIsRange', and in this
101         * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
102         * is different to /[a\-z]/).
103         */
104        void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
105        {
106            switch (m_state) {
107            case AfterCharacterClass:
108                // Following a builtin character class we need look out for a hyphen.
109                // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
110                // If we see a hyphen following a charater class then unlike usual
111                // we'll report it to the delegate immediately, and put ourself into
112                // a poisoned state. Any following calls to add another character or
113                // character class will result in an error. (A hypen following a
114                // character-class is itself valid, but only  at the end of a regex).
115                if (hyphenIsRange && ch == '-') {
116                    m_delegate.atomCharacterClassAtom('-');
117                    m_state = AfterCharacterClassHyphen;
118                    return;
119                }
120                // Otherwise just fall through - cached character so treat this as Empty.
121                FALLTHROUGH;
122
123            case Empty:
124                m_character = ch;
125                m_state = CachedCharacter;
126                return;
127
128            case CachedCharacter:
129                if (hyphenIsRange && ch == '-')
130                    m_state = CachedCharacterHyphen;
131                else {
132                    m_delegate.atomCharacterClassAtom(m_character);
133                    m_character = ch;
134                }
135                return;
136
137            case CachedCharacterHyphen:
138                if (ch < m_character) {
139                    m_err = CharacterClassOutOfOrder;
140                    return;
141                }
142                m_delegate.atomCharacterClassRange(m_character, ch);
143                m_state = Empty;
144                return;
145
146                // See coment in atomBuiltInCharacterClass below.
147                // This too is technically an error, per ECMA-262, and again we
148                // we chose to allow this.  Note a subtlely here that while we
149                // diverge from the spec's definition of CharacterRange we do
150                // remain in compliance with the grammar.  For example, consider
151                // the expression /[\d-a-z]/.  We comply with the grammar in
152                // this case by not allowing a-z to be matched as a range.
153            case AfterCharacterClassHyphen:
154                m_delegate.atomCharacterClassAtom(ch);
155                m_state = Empty;
156                return;
157            }
158        }
159
160        /*
161         * atomBuiltInCharacterClass():
162         *
163         * Adds a built-in character class, called by parseEscape().
164         */
165        void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
166        {
167            switch (m_state) {
168            case CachedCharacter:
169                // Flush the currently cached character, then fall through.
170                m_delegate.atomCharacterClassAtom(m_character);
171                FALLTHROUGH;
172            case Empty:
173            case AfterCharacterClass:
174                m_state = AfterCharacterClass;
175                m_delegate.atomCharacterClassBuiltIn(classID, invert);
176                return;
177
178                // If we hit either of these cases, we have an invalid range that
179                // looks something like /[x-\d]/ or /[\d-\d]/.
180                // According to ECMA-262 this should be a syntax error, but
181                // empirical testing shows this to break teh webz.  Instead we
182                // comply with to the ECMA-262 grammar, and assume the grammar to
183                // have matched the range correctly, but tweak our interpretation
184                // of CharacterRange.  Effectively we implicitly handle the hyphen
185                // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
186            case CachedCharacterHyphen:
187                m_delegate.atomCharacterClassAtom(m_character);
188                m_delegate.atomCharacterClassAtom('-');
189                FALLTHROUGH;
190            case AfterCharacterClassHyphen:
191                m_delegate.atomCharacterClassBuiltIn(classID, invert);
192                m_state = Empty;
193                return;
194            }
195        }
196
197        /*
198         * end():
199         *
200         * Called at end of construction.
201         */
202        void end()
203        {
204            if (m_state == CachedCharacter)
205                m_delegate.atomCharacterClassAtom(m_character);
206            else if (m_state == CachedCharacterHyphen) {
207                m_delegate.atomCharacterClassAtom(m_character);
208                m_delegate.atomCharacterClassAtom('-');
209            }
210            m_delegate.atomCharacterClassEnd();
211        }
212
213        // parseEscape() should never call these delegate methods when
214        // invoked with inCharacterClass set.
215        NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
216        NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
217
218    private:
219        Delegate& m_delegate;
220        ErrorCode& m_err;
221        enum CharacterClassConstructionState {
222            Empty,
223            CachedCharacter,
224            CachedCharacterHyphen,
225            AfterCharacterClass,
226            AfterCharacterClassHyphen,
227        } m_state;
228        UChar m_character;
229    };
230
    // Prepares to parse 'pattern', reporting tokens to 'delegate'.
    // 'backReferenceLimit' is the largest number parseEscape() will report as
    // a back reference.  Only the pointer returned by
    // pattern.characters<CharType>() and the pattern's length are stored.
    // NOTE(review): presumably 'pattern' must outlive the parser - confirm.
    Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
        : m_delegate(delegate)
        , m_backReferenceLimit(backReferenceLimit)
        , m_err(NoError)
        , m_data(pattern.characters<CharType>())
        , m_size(pattern.length())
        , m_index(0)
        , m_parenthesesNestingDepth(0)
    {
    }
241
    /*
     * parseEscape():
     *
     * Helper for parseTokens() AND parseCharacterClass().
     * Unlike the other parser methods, this function does not report tokens
     * directly to the member delegate (m_delegate), instead tokens are
     * emitted to the delegate provided as an argument.  In the case of atom
     * escapes, parseTokens() will call parseEscape() passing m_delegate as
     * an argument, and as such the escape will be reported to the delegate.
     *
     * However this method may also be used by parseCharacterClass(), in which
     * case a CharacterClassParserDelegate will be passed as the delegate that
     * tokens should be added to.  The template flag 'inCharacterClass'
     * indicates whether an escape in a character class is being parsed (some
     * parsing rules change in this context).
     *
     * On entry the next character must be the backslash.  If the pattern ends
     * immediately after it, m_err is set to EscapeUnterminated and false is
     * returned.
     *
     * The boolean value returned by this method indicates whether the token
     * parsed was an atom (outside of a character class \b and \B will be
     * interpreted as assertions, for which false is returned).
     */
    template<bool inCharacterClass, class EscapeDelegate>
    bool parseEscape(EscapeDelegate& delegate)
    {
        ASSERT(!m_err);
        ASSERT(peek() == '\\');
        consume();

        if (atEndOfPattern()) {
            m_err = EscapeUnterminated;
            return false;
        }

        switch (peek()) {
        // Assertions
        case 'b':
            consume();
            // Inside a character class \b is the backspace character.
            if (inCharacterClass)
                delegate.atomPatternCharacter('\b');
            else {
                delegate.assertionWordBoundary(false);
                return false;
            }
            break;
        case 'B':
            consume();
            // Inside a character class \B is a literal 'B'.
            if (inCharacterClass)
                delegate.atomPatternCharacter('B');
            else {
                delegate.assertionWordBoundary(true);
                return false;
            }
            break;

        // CharacterClassEscape
        case 'd':
            consume();
            delegate.atomBuiltInCharacterClass(DigitClassID, false);
            break;
        case 's':
            consume();
            delegate.atomBuiltInCharacterClass(SpaceClassID, false);
            break;
        case 'w':
            consume();
            delegate.atomBuiltInCharacterClass(WordClassID, false);
            break;
        case 'D':
            consume();
            delegate.atomBuiltInCharacterClass(DigitClassID, true);
            break;
        case 'S':
            consume();
            delegate.atomBuiltInCharacterClass(SpaceClassID, true);
            break;
        case 'W':
            consume();
            delegate.atomBuiltInCharacterClass(WordClassID, true);
            break;

        // DecimalEscape
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9': {
            // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
            // First, try to parse this as backreference.
            if (!inCharacterClass) {
                ParseState state = saveState();

                unsigned backReference = consumeNumber();
                if (backReference <= m_backReferenceLimit) {
                    delegate.atomBackReference(backReference);
                    break;
                }

                // Too large to be a back reference; rewind to the first digit.
                restoreState(state);
            }

            // Not a backreference, and not octal.  Report the backslash as a
            // literal character; the digit is left unconsumed for the caller.
            if (peek() >= '8') {
                delegate.atomPatternCharacter('\\');
                break;
            }

            // Fall-through to handle this as an octal escape.
            FALLTHROUGH;
        }

        // Octal escape
        case '0':
            delegate.atomPatternCharacter(consumeOctal());
            break;

        // ControlEscape
        case 'f':
            consume();
            delegate.atomPatternCharacter('\f');
            break;
        case 'n':
            consume();
            delegate.atomPatternCharacter('\n');
            break;
        case 'r':
            consume();
            delegate.atomPatternCharacter('\r');
            break;
        case 't':
            consume();
            delegate.atomPatternCharacter('\t');
            break;
        case 'v':
            consume();
            delegate.atomPatternCharacter('\v');
            break;

        // ControlLetter
        case 'c': {
            ParseState state = saveState();
            consume();
            if (!atEndOfPattern()) {
                int control = consume();

                // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
                if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
                    delegate.atomPatternCharacter(control & 0x1f);
                    break;
                }
            }
            // Not a valid control escape: rewind to the 'c' and report the
            // backslash as a literal character.
            restoreState(state);
            delegate.atomPatternCharacter('\\');
            break;
        }

        // HexEscape
        case 'x': {
            consume();
            int x = tryConsumeHex(2);
            // Without two hex digits, \x is just a literal 'x'.
            if (x == -1)
                delegate.atomPatternCharacter('x');
            else
                delegate.atomPatternCharacter(x);
            break;
        }

        // UnicodeEscape
        case 'u': {
            consume();
            int u = tryConsumeHex(4);
            // Without four hex digits, \u is just a literal 'u'.
            if (u == -1)
                delegate.atomPatternCharacter('u');
            else
                delegate.atomPatternCharacter(u);
            break;
        }

        // IdentityEscape
        default:
            delegate.atomPatternCharacter(consume());
        }

        return true;
    }
429
    /*
     * parseAtomEscape(), parseCharacterClassEscape():
     *
     * These methods alias to parseEscape().
     *
     * parseAtomEscape() parses an escape that occurs outside a character class,
     * reporting it to m_delegate, and returns parseEscape()'s result (whether
     * the escape was an atom).
     */
    bool parseAtomEscape()
    {
        return parseEscape<false>(m_delegate);
    }
    // Parses an escape that occurs inside a character class, reporting it to
    // 'delegate'.  The result of parseEscape() is discarded.
    void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
    {
        parseEscape<true>(delegate);
    }
443
444    /*
445     * parseCharacterClass():
446     *
447     * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
448     * to an instance of CharacterClassParserDelegate, to describe the character class to the
449     * delegate.
450     */
451    void parseCharacterClass()
452    {
453        ASSERT(!m_err);
454        ASSERT(peek() == '[');
455        consume();
456
457        CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
458
459        characterClassConstructor.begin(tryConsume('^'));
460
461        while (!atEndOfPattern()) {
462            switch (peek()) {
463            case ']':
464                consume();
465                characterClassConstructor.end();
466                return;
467
468            case '\\':
469                parseCharacterClassEscape(characterClassConstructor);
470                break;
471
472            default:
473                characterClassConstructor.atomPatternCharacter(consume(), true);
474            }
475
476            if (m_err)
477                return;
478        }
479
480        m_err = CharacterClassUnmatched;
481    }
482
483    /*
484     * parseParenthesesBegin():
485     *
486     * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
487     */
488    void parseParenthesesBegin()
489    {
490        ASSERT(!m_err);
491        ASSERT(peek() == '(');
492        consume();
493
494        if (tryConsume('?')) {
495            if (atEndOfPattern()) {
496                m_err = ParenthesesTypeInvalid;
497                return;
498            }
499
500            switch (consume()) {
501            case ':':
502                m_delegate.atomParenthesesSubpatternBegin(false);
503                break;
504
505            case '=':
506                m_delegate.atomParentheticalAssertionBegin();
507                break;
508
509            case '!':
510                m_delegate.atomParentheticalAssertionBegin(true);
511                break;
512
513            default:
514                m_err = ParenthesesTypeInvalid;
515            }
516        } else
517            m_delegate.atomParenthesesSubpatternBegin();
518
519        ++m_parenthesesNestingDepth;
520    }
521
522    /*
523     * parseParenthesesEnd():
524     *
525     * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
526     */
527    void parseParenthesesEnd()
528    {
529        ASSERT(!m_err);
530        ASSERT(peek() == ')');
531        consume();
532
533        if (m_parenthesesNestingDepth > 0)
534            m_delegate.atomParenthesesEnd();
535        else
536            m_err = ParenthesesUnmatched;
537
538        --m_parenthesesNestingDepth;
539    }
540
541    /*
542     * parseQuantifier():
543     *
544     * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
545     */
546    void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
547    {
548        ASSERT(!m_err);
549        ASSERT(min <= max);
550
551        if (min == UINT_MAX) {
552            m_err = QuantifierTooLarge;
553            return;
554        }
555
556        if (lastTokenWasAnAtom)
557            m_delegate.quantifyAtom(min, max, !tryConsume('?'));
558        else
559            m_err = QuantifierWithoutAtom;
560    }
561
    /*
     * parseTokens():
     *
     * This method loops over the input pattern reporting tokens to the delegate.
     * The method returns when a parse error is detected, or the end of the pattern
     * is reached.  One piece of state is tracked around the loop, which is whether
     * the last token passed to the delegate was an atom (this is necessary to detect
     * a parse error when a quantifier is provided without an atom to quantify).
     *
     * If the end of the pattern is reached while a group is still open, m_err is
     * set to MissingParentheses.
     */
    void parseTokens()
    {
        bool lastTokenWasAnAtom = false;

        while (!atEndOfPattern()) {
            switch (peek()) {
            case '|':
                consume();
                m_delegate.disjunction();
                lastTokenWasAnAtom = false;
                break;

            case '(':
                parseParenthesesBegin();
                lastTokenWasAnAtom = false;
                break;

            case ')':
                // A closed group counts as an atom, so it may be quantified.
                parseParenthesesEnd();
                lastTokenWasAnAtom = true;
                break;

            case '^':
                consume();
                m_delegate.assertionBOL();
                lastTokenWasAnAtom = false;
                break;

            case '$':
                consume();
                m_delegate.assertionEOL();
                lastTokenWasAnAtom = false;
                break;

            case '.':
                // '.' is reported as the inverted newline class.
                consume();
                m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
                lastTokenWasAnAtom = true;
                break;

            case '[':
                parseCharacterClass();
                lastTokenWasAnAtom = true;
                break;

            case '\\':
                // parseAtomEscape() reports whether the escape was an atom.
                lastTokenWasAnAtom = parseAtomEscape();
                break;

            case '*':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
                lastTokenWasAnAtom = false;
                break;

            case '+':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
                lastTokenWasAnAtom = false;
                break;

            case '?':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 0, 1);
                lastTokenWasAnAtom = false;
                break;

            case '{': {
                // Remember where the '{' is, in case this turns out not to be
                // a well-formed {min}, {min,} or {min,max} quantifier.
                ParseState state = saveState();

                consume();
                if (peekIsDigit()) {
                    unsigned min = consumeNumber();
                    unsigned max = min;

                    if (tryConsume(','))
                        max = peekIsDigit() ? consumeNumber() : quantifyInfinite;

                    if (tryConsume('}')) {
                        if (min <= max)
                            parseQuantifier(lastTokenWasAnAtom, min, max);
                        else
                            m_err = QuantifierOutOfOrder;
                        lastTokenWasAnAtom = false;
                        break;
                    }
                }

                restoreState(state);
            }
            // If we did not find a complete quantifier, fall through to the
            // default case, which reports the '{' as a literal character.
            FALLTHROUGH;

            default:
                m_delegate.atomPatternCharacter(consume());
                lastTokenWasAnAtom = true;
            }

            if (m_err)
                return;
        }

        if (m_parenthesesNestingDepth > 0)
            m_err = MissingParentheses;
    }
676
677    /*
678     * parse():
679     *
680     * This method calls parseTokens() to parse over the input and converts any
681     * error code to a const char* for a result.
682     */
683    const char* parse()
684    {
685        if (m_size > MAX_PATTERN_SIZE)
686            m_err = PatternTooLarge;
687        else
688            parseTokens();
689        ASSERT(atEndOfPattern() || m_err);
690
691        // The order of this array must match the ErrorCode enum.
692        static const char* errorMessages[NumberOfErrorCodes] = {
693            0, // NoError
694            REGEXP_ERROR_PREFIX "regular expression too large",
695            REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
696            REGEXP_ERROR_PREFIX "nothing to repeat",
697            REGEXP_ERROR_PREFIX "number too large in {} quantifier",
698            REGEXP_ERROR_PREFIX "missing )",
699            REGEXP_ERROR_PREFIX "unmatched parentheses",
700            REGEXP_ERROR_PREFIX "unrecognized character after (?",
701            REGEXP_ERROR_PREFIX "missing terminating ] for character class",
702            REGEXP_ERROR_PREFIX "range out of order in character class",
703            REGEXP_ERROR_PREFIX "\\ at end of pattern"
704        };
705
706        return errorMessages[m_err];
707    }
708
    // Misc helper functions:

    // A saved parse position: the value m_index had when saveState() was called.
    typedef unsigned ParseState;

    // Returns the current read position, which can later be handed to
    // restoreState() to rewind the parser.
    ParseState saveState()
    {
        return m_index;
    }
717
    // Moves the read position back (or forward) to one previously obtained
    // from saveState().  Only m_index is affected.
    void restoreState(ParseState state)
    {
        m_index = state;
    }
722
    // Returns true once every character of the pattern has been consumed.
    bool atEndOfPattern()
    {
        ASSERT(m_index <= m_size);
        return m_index == m_size;
    }
728
    // Returns the next character without consuming it.  Must not be called
    // at the end of the pattern (asserted only).
    int peek()
    {
        ASSERT(m_index < m_size);
        return m_data[m_index];
    }
734
735    bool peekIsDigit()
736    {
737        return !atEndOfPattern() && WTF::isASCIIDigit(peek());
738    }
739
    // Returns the numeric value (0-9) of the next character without consuming
    // it.  The next character must be an ASCII digit (asserted only).
    unsigned peekDigit()
    {
        ASSERT(peekIsDigit());
        return peek() - '0';
    }
745
    // Returns the next character and advances past it.  Must not be called
    // at the end of the pattern (asserted only).
    int consume()
    {
        ASSERT(m_index < m_size);
        return m_data[m_index++];
    }
751
    // Consumes the next character and returns its numeric value (0-9).  The
    // next character must be an ASCII digit (asserted only).
    unsigned consumeDigit()
    {
        ASSERT(peekIsDigit());
        return consume() - '0';
    }
757
758    unsigned consumeNumber()
759    {
760        unsigned n = consumeDigit();
761        // check for overflow.
762        for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
763            n = newValue;
764            consume();
765        }
766        return n;
767    }
768
769    unsigned consumeOctal()
770    {
771        ASSERT(WTF::isASCIIOctalDigit(peek()));
772
773        unsigned n = consumeDigit();
774        while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
775            n = n * 8 + consumeDigit();
776        return n;
777    }
778
779    bool tryConsume(UChar ch)
780    {
781        if (atEndOfPattern() || (m_data[m_index] != ch))
782            return false;
783        ++m_index;
784        return true;
785    }
786
787    int tryConsumeHex(int count)
788    {
789        ParseState state = saveState();
790
791        int n = 0;
792        while (count--) {
793            if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
794                restoreState(state);
795                return -1;
796            }
797            n = (n << 4) | WTF::toASCIIHexValue(consume());
798        }
799        return n;
800    }
801
    // Receives the parsed tokens.
    Delegate& m_delegate;
    // Largest number that parseEscape() will report as a back reference.
    unsigned m_backReferenceLimit;
    // NoError while parsing is succeeding; otherwise the error that stopped it.
    ErrorCode m_err;
    // The pattern's characters, as returned by String::characters<CharType>().
    const CharType* m_data;
    // The pattern's length, as returned by String::length().
    unsigned m_size;
    // Offset into m_data of the next character to be read.
    unsigned m_index;
    // Incremented by parseParenthesesBegin() and decremented by
    // parseParenthesesEnd(); parseTokens() checks it at the end of the pattern.
    unsigned m_parenthesesNestingDepth;

    // Patterns longer than this are rejected by parse() with PatternTooLarge.
    // Derived by empirical testing of compile time in PCRE and WREC.
    static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
812};
813
814/*
815 * Yarr::parse():
816 *
817 * The parse method is passed a pattern to be parsed and a delegate upon which
818 * callbacks will be made to record the parsed tokens forming the regex.
819 * Yarr::parse() returns null on success, or a const C string providing an error
820 * message where a parse error occurs.
821 *
822 * The Delegate must implement the following interface:
823 *
824 *    void assertionBOL();
825 *    void assertionEOL();
826 *    void assertionWordBoundary(bool invert);
827 *
828 *    void atomPatternCharacter(UChar ch);
829 *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
830 *    void atomCharacterClassBegin(bool invert)
831 *    void atomCharacterClassAtom(UChar ch)
832 *    void atomCharacterClassRange(UChar begin, UChar end)
833 *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
834 *    void atomCharacterClassEnd()
835 *    void atomParenthesesSubpatternBegin(bool capture = true);
836 *    void atomParentheticalAssertionBegin(bool invert = false);
837 *    void atomParenthesesEnd();
838 *    void atomBackReference(unsigned subpatternId);
839 *
840 *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
841 *
842 *    void disjunction();
843 *
844 * The regular expression is described by a sequence of assertion*() and atom*()
845 * callbacks to the delegate, describing the terms in the regular expression.
846 * Following an atom a quantifyAtom() call may occur to indicate that the previous
847 * atom should be quantified.  In the case of atoms described across multiple
848 * calls (parentheses and character classes) the call to quantifyAtom() will come
849 * after the call to the atom*End() method, never after atom*Begin().
850 *
851 * Character classes may either be described by a single call to
852 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
853 * In the latter case, ...Begin() will be called, followed by a sequence of
854 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
855 *
 * Sequences of atoms and assertions are broken into alternatives via calls to
 * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
 * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
 * capturing subpattern, this will be the subpatternId associated with these
 * parentheses, and will also by definition be the lowest subpatternId of these
 * parentheses and of any nested parentheses.  The atomParenthesesEnd() method
 * is passed the subpatternId of the last capturing subexpression nested within
 * these parentheses.  In the case of a capturing subpattern with no nested
 * capturing subpatterns, the same subpatternId will be passed to the begin and
 * end functions.  In the case of non-capturing subpatterns the subpatternId
 * passed to the begin method is also the first possible subpatternId that might
 * be nested within these parentheses.  If a set of non-capturing parentheses does
 * not contain any capturing subpatterns, then the subpatternId passed to begin
 * will be greater than the subpatternId passed to end.
 *
 * NOTE(review): the interface listed above declares
 * atomParenthesesSubpatternBegin(bool capture) and atomParenthesesEnd() with no
 * subpatternId arguments, so this paragraph appears to describe an earlier
 * form of the interface - confirm before relying on it.
871 */
872
873template<class Delegate>
874const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
875{
876    if (pattern.is8Bit())
877        return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse();
878    return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse();
879}
880
881} } // namespace JSC::Yarr
882
883#endif // YarrParser_h
884