/*
 * Copyright (C) 2009 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef YarrParser_h
#define YarrParser_h

#include "Yarr.h"
#include <wtf/ASCIICType.h>
#include <wtf/text/WTFString.h>

namespace JSC { namespace Yarr {

// Prefix shared by every error message returned from Parser::parse().
#define REGEXP_ERROR_PREFIX "Invalid regular expression: "

// Identifies the built-in character classes the parser can report to a delegate.
enum BuiltInCharacterClassID {
    DigitClassID,   // Reported for \d and (inverted) \D.
    SpaceClassID,   // Reported for \s and (inverted) \S.
    WordClassID,    // Reported for \w and (inverted) \W.
    NewlineClassID, // Reported inverted by parseTokens() for '.'.
};

// The Parser class should not be used directly - only via the Yarr::parse() method.
template<class Delegate, typename CharType>
class Parser {
private:
    // Yarr::parse() is the only intended entry point; it is befriended so that it
    // can reach the private constructor and parse() method.
    template<class FriendDelegate>
    friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);

    // Parse failures. The order must match the errorMessages table in parse().
    // NumberOfErrorCodes is a count used to size that table, not an error.
    enum ErrorCode {
        NoError,
        PatternTooLarge,
        QuantifierOutOfOrder,
        QuantifierWithoutAtom,
        QuantifierTooLarge,
        MissingParentheses,
        ParenthesesUnmatched,
        ParenthesesTypeInvalid,
        CharacterClassUnmatched,
        CharacterClassOutOfOrder,
        EscapeUnterminated,
        NumberOfErrorCodes
    };

    /*
     * CharacterClassParserDelegate:
     *
     * The class CharacterClassParserDelegate is used in the parsing of character
     * classes. This class handles detection of character ranges. This class
     * implements enough of the delegate interface such that it can be passed to
     * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
     * to perform the parsing of escape characters in character sets.
     *
     * Characters are held back by one step (m_character) so that a following
     * hyphen can turn the cached character into the start of a range.
     */
    class CharacterClassParserDelegate {
    public:
        // 'err' is a reference to the owning parser's error code, so range errors
        // detected here are visible to the parser's main loop.
        CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
            : m_delegate(delegate)
            , m_err(err)
            , m_state(Empty)
            , m_character(0)
        {
        }

        /*
         * begin():
         *
         * Called at beginning of construction. 'invert' is forwarded to the
         * delegate's atomCharacterClassBegin().
         */
        void begin(bool invert)
        {
            m_delegate.atomCharacterClassBegin(invert);
        }

        /*
         * atomPatternCharacter():
         *
         * This method is called either from parseCharacterClass() (for an unescaped
         * character in a character class), or from parseEscape(). In the former case
         * the value true will be passed for the argument 'hyphenIsRange', and in this
         * mode we will allow a hyphen to be treated as indicating a range (i.e. /[a-z]/
         * is different to /[a\-z]/).
         */
        void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
        {
            switch (m_state) {
            case AfterCharacterClass:
                // Following a builtin character class we need look out for a hyphen.
                // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
                // If we see a hyphen following a character class then unlike usual
                // we'll report it to the delegate immediately, and move to the
                // AfterCharacterClassHyphen state. Whatever follows is then added
                // as a plain atom or class rather than as the end of a range (see
                // the AfterCharacterClassHyphen cases). A hyphen directly before
                // the closing ']' is simply a literal hyphen.
                if (hyphenIsRange && ch == '-') {
                    m_delegate.atomCharacterClassAtom('-');
                    m_state = AfterCharacterClassHyphen;
                    return;
                }
                // Otherwise just fall through - no cached character, so treat this as Empty.
                FALLTHROUGH;

            case Empty:
                // Hold the character back; a following '-' may make it a range start.
                m_character = ch;
                m_state = CachedCharacter;
                return;

            case CachedCharacter:
                if (hyphenIsRange && ch == '-')
                    m_state = CachedCharacterHyphen;
                else {
                    // Flush the previously cached character and cache the new one.
                    m_delegate.atomCharacterClassAtom(m_character);
                    m_character = ch;
                }
                return;

            case CachedCharacterHyphen:
                // 'ch' closes the range m_character-ch; the range must be ascending.
                if (ch < m_character) {
                    m_err = CharacterClassOutOfOrder;
                    return;
                }
                m_delegate.atomCharacterClassRange(m_character, ch);
                m_state = Empty;
                return;

                // See comment in atomBuiltInCharacterClass below.
                // This too is technically an error, per ECMA-262, and again
                // we chose to allow this. Note a subtlety here that while we
                // diverge from the spec's definition of CharacterRange we do
                // remain in compliance with the grammar. For example, consider
                // the expression /[\d-a-z]/. We comply with the grammar in
                // this case by not allowing a-z to be matched as a range.
            case AfterCharacterClassHyphen:
                m_delegate.atomCharacterClassAtom(ch);
                m_state = Empty;
                return;
            }
        }

        /*
         * atomBuiltInCharacterClass():
         *
         * Adds a built-in character class, called by parseEscape().
         */
        void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
        {
            switch (m_state) {
            case CachedCharacter:
                // Flush the currently cached character, then fall through.
                m_delegate.atomCharacterClassAtom(m_character);
                FALLTHROUGH;
            case Empty:
            case AfterCharacterClass:
                m_state = AfterCharacterClass;
                m_delegate.atomCharacterClassBuiltIn(classID, invert);
                return;

                // If we hit either of these cases, we have an invalid range that
                // looks something like /[x-\d]/ or /[\d-\d]/.
                // According to ECMA-262 this should be a syntax error, but
                // empirical testing shows this to break the web. Instead we
                // comply with the ECMA-262 grammar, and assume the grammar to
                // have matched the range correctly, but tweak our interpretation
                // of CharacterRange. Effectively we implicitly handle the hyphen
                // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
            case CachedCharacterHyphen:
                m_delegate.atomCharacterClassAtom(m_character);
                m_delegate.atomCharacterClassAtom('-');
                FALLTHROUGH;
            case AfterCharacterClassHyphen:
                // In this state the hyphen has already been reported to the delegate.
                m_delegate.atomCharacterClassBuiltIn(classID, invert);
                m_state = Empty;
                return;
            }
        }

        /*
         * end():
         *
         * Called at end of construction. Flushes a cached character (and a
         * trailing hyphen, which is then a literal) before closing the class.
         */
        void end()
        {
            if (m_state == CachedCharacter)
                m_delegate.atomCharacterClassAtom(m_character);
            else if (m_state == CachedCharacterHyphen) {
                m_delegate.atomCharacterClassAtom(m_character);
                m_delegate.atomCharacterClassAtom('-');
            }
            m_delegate.atomCharacterClassEnd();
        }

        // parseEscape() should never call these delegate methods when
        // invoked with inCharacterClass set.
        NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
        NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }

    private:
        Delegate& m_delegate;
        ErrorCode& m_err;
        // Where we are in the "character, optional hyphen, character" sequence.
        enum CharacterClassConstructionState {
            Empty,                     // Nothing cached.
            CachedCharacter,           // m_character holds a not-yet-reported character.
            CachedCharacterHyphen,     // m_character followed by an unescaped '-'.
            AfterCharacterClass,       // A built-in class was just reported.
            AfterCharacterClassHyphen, // A built-in class followed by '-' (already reported).
        } m_state;
        UChar m_character;
    };

    // Binds the parser to 'pattern'; the character data pointer is taken from
    // 'pattern' and not copied here. Back references greater than
    // 'backReferenceLimit' are not reported as back references (see parseEscape()).
    Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
        : m_delegate(delegate)
        , m_backReferenceLimit(backReferenceLimit)
        , m_err(NoError)
        , m_data(pattern.characters<CharType>())
        , m_size(pattern.length())
        , m_index(0)
        , m_parenthesesNestingDepth(0)
    {
    }

    /*
     * parseEscape():
     *
     * Helper for parseTokens() AND parseCharacterClass().
     * Unlike the other parser methods, this function does not report tokens
     * directly to the member delegate (m_delegate), instead tokens are
     * emitted to the delegate provided as an argument. In the case of atom
     * escapes, parseTokens() will call parseEscape() passing m_delegate as
     * an argument, and as such the escape will be reported to the delegate.
     *
     * However this method may also be used by parseCharacterClass(), in which
     * case a CharacterClassParserDelegate will be passed as the delegate that
     * tokens should be added to. A boolean flag is also provided to indicate
     * whether an escape in a CharacterClass is being parsed (some parsing
     * rules change in this context).
     *
     * The boolean value returned by this method indicates whether the token
     * parsed was an atom (outside of a character class \b and \B will be
     * interpreted as assertions).
     *
     * false is also returned, with m_err set to EscapeUnterminated, when the
     * pattern ends immediately after the backslash.
     */
    template<bool inCharacterClass, class EscapeDelegate>
    bool parseEscape(EscapeDelegate& delegate)
    {
        ASSERT(!m_err);
        ASSERT(peek() == '\\');
        consume();

        if (atEndOfPattern()) {
            m_err = EscapeUnterminated;
            return false;
        }

        switch (peek()) {
        // Assertions
        case 'b':
            consume();
            // Inside a character class \b is the backspace character, not an assertion.
            if (inCharacterClass)
                delegate.atomPatternCharacter('\b');
            else {
                delegate.assertionWordBoundary(false);
                return false;
            }
            break;
        case 'B':
            consume();
            // Inside a character class \B is just the letter 'B'.
            if (inCharacterClass)
                delegate.atomPatternCharacter('B');
            else {
                delegate.assertionWordBoundary(true);
                return false;
            }
            break;

        // CharacterClassEscape
        case 'd':
            consume();
            delegate.atomBuiltInCharacterClass(DigitClassID, false);
            break;
        case 's':
            consume();
            delegate.atomBuiltInCharacterClass(SpaceClassID, false);
            break;
        case 'w':
            consume();
            delegate.atomBuiltInCharacterClass(WordClassID, false);
            break;
        case 'D':
            consume();
            delegate.atomBuiltInCharacterClass(DigitClassID, true);
            break;
        case 'S':
            consume();
            delegate.atomBuiltInCharacterClass(SpaceClassID, true);
            break;
        case 'W':
            consume();
            delegate.atomBuiltInCharacterClass(WordClassID, true);
            break;

        // DecimalEscape
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9': {
            // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
            // First, try to parse this as backreference.
            if (!inCharacterClass) {
                ParseState state = saveState();

                unsigned backReference = consumeNumber();
                if (backReference <= m_backReferenceLimit) {
                    delegate.atomBackReference(backReference);
                    break;
                }

                // Too large to be a back reference: rewind to the first digit.
                restoreState(state);
            }

            // Not a backreference, and not octal. Only the backslash is reported
            // here; the digit is left unconsumed for the caller's loop to parse.
            if (peek() >= '8') {
                delegate.atomPatternCharacter('\\');
                break;
            }

            // Fall-through to handle this as an octal escape.
            FALLTHROUGH;
        }

        // Octal escape
        case '0':
            delegate.atomPatternCharacter(consumeOctal());
            break;

        // ControlEscape
        case 'f':
            consume();
            delegate.atomPatternCharacter('\f');
            break;
        case 'n':
            consume();
            delegate.atomPatternCharacter('\n');
            break;
        case 'r':
            consume();
            delegate.atomPatternCharacter('\r');
            break;
        case 't':
            consume();
            delegate.atomPatternCharacter('\t');
            break;
        case 'v':
            consume();
            delegate.atomPatternCharacter('\v');
            break;

        // ControlLetter
        case 'c': {
            ParseState state = saveState();
            consume();
            if (!atEndOfPattern()) {
                int control = consume();

                // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
                if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
                    // The control character is the low five bits of the letter.
                    delegate.atomPatternCharacter(control & 0x1f);
                    break;
                }
            }
            // Not a valid control escape: rewind to the 'c' and report only the
            // backslash, leaving the 'c' to be parsed by the caller's loop.
            restoreState(state);
            delegate.atomPatternCharacter('\\');
            break;
        }

        // HexEscape
        case 'x': {
            consume();
            // Exactly two hex digits are required; otherwise this is a literal 'x'.
            int x = tryConsumeHex(2);
            if (x == -1)
                delegate.atomPatternCharacter('x');
            else
                delegate.atomPatternCharacter(x);
            break;
        }

        // UnicodeEscape
        case 'u': {
            consume();
            // Exactly four hex digits are required; otherwise this is a literal 'u'.
            int u = tryConsumeHex(4);
            if (u == -1)
                delegate.atomPatternCharacter('u');
            else
                delegate.atomPatternCharacter(u);
            break;
        }

        // IdentityEscape
        default:
            delegate.atomPatternCharacter(consume());
        }

        return true;
    }

    /*
     * parseAtomEscape(), parseCharacterClassEscape():
     *
     * These methods alias to parseEscape().
434 */ 435 bool parseAtomEscape() 436 { 437 return parseEscape<false>(m_delegate); 438 } 439 void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) 440 { 441 parseEscape<true>(delegate); 442 } 443 444 /* 445 * parseCharacterClass(): 446 * 447 * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) 448 * to an instance of CharacterClassParserDelegate, to describe the character class to the 449 * delegate. 450 */ 451 void parseCharacterClass() 452 { 453 ASSERT(!m_err); 454 ASSERT(peek() == '['); 455 consume(); 456 457 CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); 458 459 characterClassConstructor.begin(tryConsume('^')); 460 461 while (!atEndOfPattern()) { 462 switch (peek()) { 463 case ']': 464 consume(); 465 characterClassConstructor.end(); 466 return; 467 468 case '\\': 469 parseCharacterClassEscape(characterClassConstructor); 470 break; 471 472 default: 473 characterClassConstructor.atomPatternCharacter(consume(), true); 474 } 475 476 if (m_err) 477 return; 478 } 479 480 m_err = CharacterClassUnmatched; 481 } 482 483 /* 484 * parseParenthesesBegin(): 485 * 486 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. 
487 */ 488 void parseParenthesesBegin() 489 { 490 ASSERT(!m_err); 491 ASSERT(peek() == '('); 492 consume(); 493 494 if (tryConsume('?')) { 495 if (atEndOfPattern()) { 496 m_err = ParenthesesTypeInvalid; 497 return; 498 } 499 500 switch (consume()) { 501 case ':': 502 m_delegate.atomParenthesesSubpatternBegin(false); 503 break; 504 505 case '=': 506 m_delegate.atomParentheticalAssertionBegin(); 507 break; 508 509 case '!': 510 m_delegate.atomParentheticalAssertionBegin(true); 511 break; 512 513 default: 514 m_err = ParenthesesTypeInvalid; 515 } 516 } else 517 m_delegate.atomParenthesesSubpatternBegin(); 518 519 ++m_parenthesesNestingDepth; 520 } 521 522 /* 523 * parseParenthesesEnd(): 524 * 525 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). 526 */ 527 void parseParenthesesEnd() 528 { 529 ASSERT(!m_err); 530 ASSERT(peek() == ')'); 531 consume(); 532 533 if (m_parenthesesNestingDepth > 0) 534 m_delegate.atomParenthesesEnd(); 535 else 536 m_err = ParenthesesUnmatched; 537 538 --m_parenthesesNestingDepth; 539 } 540 541 /* 542 * parseQuantifier(): 543 * 544 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. 545 */ 546 void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) 547 { 548 ASSERT(!m_err); 549 ASSERT(min <= max); 550 551 if (min == UINT_MAX) { 552 m_err = QuantifierTooLarge; 553 return; 554 } 555 556 if (lastTokenWasAnAtom) 557 m_delegate.quantifyAtom(min, max, !tryConsume('?')); 558 else 559 m_err = QuantifierWithoutAtom; 560 } 561 562 /* 563 * parseTokens(): 564 * 565 * This method loops over the input pattern reporting tokens to the delegate. 566 * The method returns when a parse error is detected, or the end of the pattern 567 * is reached. One piece of state is tracked around the loop, which is whether 568 * the last token passed to the delegate was an atom (this is necessary to detect 569 * a parse error when a quantifier provided without an atom to quantify). 
     *
     * If the end of the pattern is reached while parentheses are still open,
     * m_err is set to MissingParentheses.
     */
    void parseTokens()
    {
        bool lastTokenWasAnAtom = false;

        while (!atEndOfPattern()) {
            switch (peek()) {
            case '|':
                consume();
                m_delegate.disjunction();
                lastTokenWasAnAtom = false;
                break;

            case '(':
                parseParenthesesBegin();
                lastTokenWasAnAtom = false;
                break;

            case ')':
                // A closed group counts as an atom, so it may be quantified.
                parseParenthesesEnd();
                lastTokenWasAnAtom = true;
                break;

            case '^':
                consume();
                m_delegate.assertionBOL();
                lastTokenWasAnAtom = false;
                break;

            case '$':
                consume();
                m_delegate.assertionEOL();
                lastTokenWasAnAtom = false;
                break;

            case '.':
                // '.' is reported as the inverted newline class.
                consume();
                m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
                lastTokenWasAnAtom = true;
                break;

            case '[':
                parseCharacterClass();
                lastTokenWasAnAtom = true;
                break;

            case '\\':
                // parseEscape() reports whether the escape was an atom or an assertion.
                lastTokenWasAnAtom = parseAtomEscape();
                break;

            case '*':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
                lastTokenWasAnAtom = false;
                break;

            case '+':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
                lastTokenWasAnAtom = false;
                break;

            case '?':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 0, 1);
                lastTokenWasAnAtom = false;
                break;

            case '{': {
                // Try to parse {min}, {min,} or {min,max}. If the text does not
                // form a complete quantifier, rewind and treat '{' as a literal.
                ParseState state = saveState();

                consume();
                if (peekIsDigit()) {
                    unsigned min = consumeNumber();
                    unsigned max = min;

                    if (tryConsume(','))
                        max = peekIsDigit() ? consumeNumber() : quantifyInfinite;

                    if (tryConsume('}')) {
                        if (min <= max)
                            parseQuantifier(lastTokenWasAnAtom, min, max);
                        else
                            m_err = QuantifierOutOfOrder;
                        lastTokenWasAnAtom = false;
                        break;
                    }
                }

                restoreState(state);
            }
            // if we did not find a complete quantifier, fall through to the default case.
            FALLTHROUGH;

            default:
                m_delegate.atomPatternCharacter(consume());
                lastTokenWasAnAtom = true;
            }

            // Any of the helpers above may have recorded an error.
            if (m_err)
                return;
        }

        if (m_parenthesesNestingDepth > 0)
            m_err = MissingParentheses;
    }

    /*
     * parse():
     *
     * This method calls parseTokens() to parse over the input and converts any
     * error code to a const char* for a result. Patterns longer than
     * MAX_PATTERN_SIZE are rejected as PatternTooLarge without being parsed.
     * The result is null (the NoError entry) on success.
     */
    const char* parse()
    {
        if (m_size > MAX_PATTERN_SIZE)
            m_err = PatternTooLarge;
        else
            parseTokens();
        ASSERT(atEndOfPattern() || m_err);

        // The order of this array must match the ErrorCode enum.
        static const char* errorMessages[NumberOfErrorCodes] = {
            0, // NoError
            REGEXP_ERROR_PREFIX "regular expression too large",
            REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
            REGEXP_ERROR_PREFIX "nothing to repeat",
            REGEXP_ERROR_PREFIX "number too large in {} quantifier",
            REGEXP_ERROR_PREFIX "missing )",
            REGEXP_ERROR_PREFIX "unmatched parentheses",
            REGEXP_ERROR_PREFIX "unrecognized character after (?",
            REGEXP_ERROR_PREFIX "missing terminating ] for character class",
            REGEXP_ERROR_PREFIX "range out of order in character class",
            REGEXP_ERROR_PREFIX "\\ at end of pattern"
        };

        return errorMessages[m_err];
    }

    // Misc helper functions:

    // A saved parse position; it is simply an index into the pattern.
    typedef unsigned ParseState;

    // Returns the current position so that a speculative parse can be undone.
    ParseState saveState()
    {
        return m_index;
    }

    // Rewinds to a position previously returned by saveState().
    void restoreState(ParseState state)
    {
        m_index = state;
    }

    // True once every character of the pattern has been consumed.
    bool atEndOfPattern()
    {
        ASSERT(m_index <= m_size);
        return m_index == m_size;
    }

    // Returns the next character without consuming it. Must not be called at
    // the end of the pattern.
    int peek()
    {
        ASSERT(m_index < m_size);
        return m_data[m_index];
    }

    // True if there is a next character and it is an ASCII digit.
    bool peekIsDigit()
    {
        return !atEndOfPattern() && WTF::isASCIIDigit(peek());
    }

    // Returns the numeric value of the next (digit) character without consuming it.
    unsigned peekDigit()
    {
        ASSERT(peekIsDigit());
        return peek() - '0';
    }

    // Returns the next character and advances past it. Must not be called at
    // the end of the pattern.
    int consume()
    {
        ASSERT(m_index < m_size);
        return m_data[m_index++];
    }
751 752 unsigned consumeDigit() 753 { 754 ASSERT(peekIsDigit()); 755 return consume() - '0'; 756 } 757 758 unsigned consumeNumber() 759 { 760 unsigned n = consumeDigit(); 761 // check for overflow. 762 for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { 763 n = newValue; 764 consume(); 765 } 766 return n; 767 } 768 769 unsigned consumeOctal() 770 { 771 ASSERT(WTF::isASCIIOctalDigit(peek())); 772 773 unsigned n = consumeDigit(); 774 while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) 775 n = n * 8 + consumeDigit(); 776 return n; 777 } 778 779 bool tryConsume(UChar ch) 780 { 781 if (atEndOfPattern() || (m_data[m_index] != ch)) 782 return false; 783 ++m_index; 784 return true; 785 } 786 787 int tryConsumeHex(int count) 788 { 789 ParseState state = saveState(); 790 791 int n = 0; 792 while (count--) { 793 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { 794 restoreState(state); 795 return -1; 796 } 797 n = (n << 4) | WTF::toASCIIHexValue(consume()); 798 } 799 return n; 800 } 801 802 Delegate& m_delegate; 803 unsigned m_backReferenceLimit; 804 ErrorCode m_err; 805 const CharType* m_data; 806 unsigned m_size; 807 unsigned m_index; 808 unsigned m_parenthesesNestingDepth; 809 810 // Derived by empirical testing of compile time in PCRE and WREC. 811 static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; 812}; 813 814/* 815 * Yarr::parse(): 816 * 817 * The parse method is passed a pattern to be parsed and a delegate upon which 818 * callbacks will be made to record the parsed tokens forming the regex. 819 * Yarr::parse() returns null on success, or a const C string providing an error 820 * message where a parse error occurs. 
821 * 822 * The Delegate must implement the following interface: 823 * 824 * void assertionBOL(); 825 * void assertionEOL(); 826 * void assertionWordBoundary(bool invert); 827 * 828 * void atomPatternCharacter(UChar ch); 829 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); 830 * void atomCharacterClassBegin(bool invert) 831 * void atomCharacterClassAtom(UChar ch) 832 * void atomCharacterClassRange(UChar begin, UChar end) 833 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) 834 * void atomCharacterClassEnd() 835 * void atomParenthesesSubpatternBegin(bool capture = true); 836 * void atomParentheticalAssertionBegin(bool invert = false); 837 * void atomParenthesesEnd(); 838 * void atomBackReference(unsigned subpatternId); 839 * 840 * void quantifyAtom(unsigned min, unsigned max, bool greedy); 841 * 842 * void disjunction(); 843 * 844 * The regular expression is described by a sequence of assertion*() and atom*() 845 * callbacks to the delegate, describing the terms in the regular expression. 846 * Following an atom a quantifyAtom() call may occur to indicate that the previous 847 * atom should be quantified. In the case of atoms described across multiple 848 * calls (parentheses and character classes) the call to quantifyAtom() will come 849 * after the call to the atom*End() method, never after atom*Begin(). 850 * 851 * Character classes may either be described by a single call to 852 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. 853 * In the latter case, ...Begin() will be called, followed by a sequence of 854 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). 855 * 856 * Sequences of atoms and assertions are broken into alternatives via calls to 857 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to 858 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. 
859 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular 860 * capturing subpattern, this will be the subpatternId associated with these 861 * parentheses, and will also by definition be the lowest subpatternId of these 862 * parentheses and of any nested paretheses. The atomParenthesesEnd() method 863 * is passed the subpatternId of the last capturing subexpression nested within 864 * these paretheses. In the case of a capturing subpattern with no nested 865 * capturing subpatterns, the same subpatternId will be passed to the begin and 866 * end functions. In the case of non-capturing subpatterns the subpatternId 867 * passed to the begin method is also the first possible subpatternId that might 868 * be nested within these paretheses. If a set of non-capturing parentheses does 869 * not contain any capturing subpatterns, then the subpatternId passed to begin 870 * will be greater than the subpatternId passed to end. 871 */ 872 873template<class Delegate> 874const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) 875{ 876 if (pattern.is8Bit()) 877 return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse(); 878 return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse(); 879} 880 881} } // namespace JSC::Yarr 882 883#endif // YarrParser_h 884