1/* 2 * Copyright (C) 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#include "config.h" 28#include "YarrInterpreter.h" 29 30#include "Yarr.h" 31#include "YarrCanonicalizeUCS2.h" 32#include <wtf/BumpPointerAllocator.h> 33#include <wtf/DataLog.h> 34#include <wtf/text/CString.h> 35#include <wtf/text/WTFString.h> 36 37using namespace WTF; 38 39namespace JSC { namespace Yarr { 40 41template<typename CharType> 42class Interpreter { 43public: 44 struct ParenthesesDisjunctionContext; 45 46 struct BackTrackInfoPatternCharacter { 47 uintptr_t matchAmount; 48 }; 49 struct BackTrackInfoCharacterClass { 50 uintptr_t matchAmount; 51 }; 52 struct BackTrackInfoBackReference { 53 uintptr_t begin; // Not really needed for greedy quantifiers. 54 uintptr_t matchAmount; // Not really needed for fixed quantifiers. 55 }; 56 struct BackTrackInfoAlternative { 57 uintptr_t offset; 58 }; 59 struct BackTrackInfoParentheticalAssertion { 60 uintptr_t begin; 61 }; 62 struct BackTrackInfoParenthesesOnce { 63 uintptr_t begin; 64 }; 65 struct BackTrackInfoParenthesesTerminal { 66 uintptr_t begin; 67 }; 68 struct BackTrackInfoParentheses { 69 uintptr_t matchAmount; 70 ParenthesesDisjunctionContext* lastContext; 71 }; 72 73 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) 74 { 75 context->next = backTrack->lastContext; 76 backTrack->lastContext = context; 77 ++backTrack->matchAmount; 78 } 79 80 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) 81 { 82 RELEASE_ASSERT(backTrack->matchAmount); 83 RELEASE_ASSERT(backTrack->lastContext); 84 backTrack->lastContext = backTrack->lastContext->next; 85 --backTrack->matchAmount; 86 } 87 88 struct DisjunctionContext 89 { 90 DisjunctionContext() 91 : term(0) 92 { 93 } 94 95 void* operator new(size_t, void* where) 96 { 97 return where; 98 } 99 100 int term; 101 unsigned matchBegin; 102 unsigned matchEnd; 103 uintptr_t frame[1]; 104 }; 105 106 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) 107 { 108 size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); 109 allocatorPool = allocatorPool->ensureCapacity(size); 110 RELEASE_ASSERT(allocatorPool); 111 return new (allocatorPool->alloc(size)) DisjunctionContext(); 112 } 113 114 void freeDisjunctionContext(DisjunctionContext* context) 115 { 116 allocatorPool = allocatorPool->dealloc(context); 117 } 118 119 struct ParenthesesDisjunctionContext 120 { 121 ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term) 122 : next(0) 123 { 124 unsigned firstSubpatternId = term.atom.subpatternId; 125 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; 126 127 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { 128 subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; 129 output[(firstSubpatternId << 1) + i] = offsetNoMatch; 130 } 131 132 new (getDisjunctionContext(term)) DisjunctionContext(); 133 } 134 135 void* operator new(size_t, void* where) 136 { 137 return where; 138 } 139 140 void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) 141 { 142 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) 143 output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; 144 } 145 146 DisjunctionContext* getDisjunctionContext(ByteTerm& term) 147 { 148 return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); 149 } 150 151 ParenthesesDisjunctionContext* next; 152 unsigned subpatternBackup[1]; 153 }; 154 155 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term) 156 { 157 size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); 158 allocatorPool = allocatorPool->ensureCapacity(size); 159 RELEASE_ASSERT(allocatorPool); 160 return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); 161 } 162 163 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) 164 { 165 allocatorPool = allocatorPool->dealloc(context); 166 } 167 168 class InputStream { 169 public: 170 InputStream(const CharType* input, unsigned start, unsigned length) 171 : input(input) 172 , pos(start) 173 , length(length) 174 { 175 } 176 177 void next() 178 { 179 ++pos; 180 } 181 182 void rewind(unsigned amount) 183 { 184 ASSERT(pos >= amount); 185 pos -= amount; 186 } 187 188 int read() 189 { 190 ASSERT(pos < length); 191 if (pos < length) 192 return input[pos]; 193 return -1; 194 } 195 196 int readPair() 197 { 198 ASSERT(pos + 1 < length); 199 return input[pos] | input[pos + 1] << 16; 200 } 201 202 int readChecked(unsigned negativePositionOffest) 203 { 204 RELEASE_ASSERT(pos >= negativePositionOffest); 205 unsigned p = pos - negativePositionOffest; 206 ASSERT(p < length); 207 return input[p]; 208 } 209 210 int reread(unsigned from) 211 { 212 ASSERT(from < length); 213 return input[from]; 214 } 215 216 int prev() 217 { 218 ASSERT(!(pos > length)); 219 if (pos && length) 220 return input[pos - 1]; 221 return -1; 222 } 223 224 unsigned getPos() 225 { 226 return pos; 227 } 228 229 void setPos(unsigned p) 230 { 231 pos = p; 232 } 233 234 bool atStart() 235 { 236 return pos == 0; 237 } 238 239 bool atEnd() 240 { 241 return pos == length; 242 } 243 244 unsigned end() 245 { 246 return length; 247 } 248 249 bool checkInput(unsigned count) 250 { 251 if (((pos + count) <= length) && ((pos + count) >= pos)) { 252 pos += count; 253 return true; 254 } 255 return false; 256 } 257 258 void uncheckInput(unsigned count) 259 { 260 RELEASE_ASSERT(pos >= count); 261 pos -= count; 262 } 263 264 bool atStart(unsigned negativePositionOffest) 265 { 266 return pos == negativePositionOffest; 267 } 268 269 bool atEnd(unsigned negativePositionOffest) 270 { 271 RELEASE_ASSERT(pos >= negativePositionOffest); 272 return (pos - negativePositionOffest) == length; 273 } 274 275 bool isAvailableInput(unsigned offset) 276 { 277 return (((pos + offset) <= length) && ((pos + offset) >= pos)); 278 } 279 280 private: 281 const CharType* input; 282 unsigned pos; 283 unsigned length; 284 }; 285 286 bool testCharacterClass(CharacterClass* characterClass, int ch) 287 { 288 if (ch & 0xFF80) { 289 for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) 290 if (ch == characterClass->m_matchesUnicode[i]) 291 return true; 292 for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) 293 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) 294 return true; 295 } else { 296 for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) 297 if (ch == characterClass->m_matches[i]) 298 return true; 299 for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) 300 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) 301 return true; 302 } 303 304 return false; 305 } 306 307 bool checkCharacter(int testChar, unsigned negativeInputOffset) 308 { 309 return testChar == input.readChecked(negativeInputOffset); 310 } 311 312 bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset) 313 { 314 int ch = input.readChecked(negativeInputOffset); 315 return (loChar == ch) || (hiChar == ch); 316 } 317 318 bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset) 319 { 320 bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset)); 321 return invert ? !match : match; 322 } 323 324 bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset) 325 { 326 unsigned matchSize = (unsigned)(matchEnd - matchBegin); 327 328 if (!input.checkInput(matchSize)) 329 return false; 330 331 if (pattern->m_ignoreCase) { 332 for (unsigned i = 0; i < matchSize; ++i) { 333 int oldCh = input.reread(matchBegin + i); 334 int ch = input.readChecked(negativeInputOffset + matchSize - i); 335 336 if (oldCh == ch) 337 continue; 338 339 // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that 340 // unicode values are never allowed to match against ascii ones. 341 if (isASCII(oldCh) || isASCII(ch)) { 342 if (toASCIIUpper(oldCh) == toASCIIUpper(ch)) 343 continue; 344 } else if (areCanonicallyEquivalent(oldCh, ch)) 345 continue; 346 347 input.uncheckInput(matchSize); 348 return false; 349 } 350 } else { 351 for (unsigned i = 0; i < matchSize; ++i) { 352 if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) { 353 input.uncheckInput(matchSize); 354 return false; 355 } 356 } 357 } 358 359 return true; 360 } 361 362 bool matchAssertionBOL(ByteTerm& term) 363 { 364 return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1))); 365 } 366 367 bool matchAssertionEOL(ByteTerm& term) 368 { 369 if (term.inputPosition) 370 return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); 371 372 return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); 373 } 374 375 bool matchAssertionWordBoundary(ByteTerm& term) 376 { 377 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1)); 378 bool readIsWordchar; 379 if (term.inputPosition) 380 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); 381 else 382 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); 383 384 bool wordBoundary = prevIsWordchar != readIsWordchar; 385 return term.invert() ? !wordBoundary : wordBoundary; 386 } 387 388 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) 389 { 390 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); 391 392 switch (term.atom.quantityType) { 393 case QuantifierFixedCount: 394 break; 395 396 case QuantifierGreedy: 397 if (backTrack->matchAmount) { 398 --backTrack->matchAmount; 399 input.uncheckInput(1); 400 return true; 401 } 402 break; 403 404 case QuantifierNonGreedy: 405 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { 406 ++backTrack->matchAmount; 407 if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1)) 408 return true; 409 } 410 input.uncheckInput(backTrack->matchAmount); 411 break; 412 } 413 414 return false; 415 } 416 417 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) 418 { 419 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); 420 421 switch (term.atom.quantityType) { 422 case QuantifierFixedCount: 423 break; 424 425 case QuantifierGreedy: 426 if (backTrack->matchAmount) { 427 --backTrack->matchAmount; 428 input.uncheckInput(1); 429 return true; 430 } 431 break; 432 433 case QuantifierNonGreedy: 434 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { 435 ++backTrack->matchAmount; 436 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1)) 437 return true; 438 } 439 input.uncheckInput(backTrack->matchAmount); 440 break; 441 } 442 443 return false; 444 } 445 446 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) 447 { 448 ASSERT(term.type == ByteTerm::TypeCharacterClass); 449 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); 450 451 switch (term.atom.quantityType) { 452 case QuantifierFixedCount: { 453 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { 454 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount)) 455 return false; 456 } 457 return true; 458 } 459 460 case QuantifierGreedy: { 461 unsigned matchAmount = 0; 462 while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { 463 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) { 464 input.uncheckInput(1); 465 break; 466 } 467 ++matchAmount; 468 } 469 backTrack->matchAmount = matchAmount; 470 471 return true; 472 } 473 474 case QuantifierNonGreedy: 475 backTrack->matchAmount = 0; 476 return true; 477 } 478 479 RELEASE_ASSERT_NOT_REACHED(); 480 return false; 481 } 482 483 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) 484 { 485 ASSERT(term.type == ByteTerm::TypeCharacterClass); 486 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); 487 488 switch (term.atom.quantityType) { 489 case QuantifierFixedCount: 490 break; 491 492 case QuantifierGreedy: 493 if (backTrack->matchAmount) { 494 --backTrack->matchAmount; 495 input.uncheckInput(1); 496 return true; 497 } 498 break; 499 500 case QuantifierNonGreedy: 501 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { 502 ++backTrack->matchAmount; 503 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) 504 return true; 505 } 506 input.uncheckInput(backTrack->matchAmount); 507 break; 508 } 509 510 return false; 511 } 512 513 bool matchBackReference(ByteTerm& term, DisjunctionContext* context) 514 { 515 ASSERT(term.type == ByteTerm::TypeBackReference); 516 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); 517 518 unsigned matchBegin = output[(term.atom.subpatternId << 1)]; 519 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1]; 520 521 // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that. 522 // In this case the result of match is empty string like when it references to a parentheses with zero-width match. 523 // Eg.: /(a\1)/ 524 if (matchEnd == offsetNoMatch) 525 return true; 526 527 if (matchBegin == offsetNoMatch) 528 return true; 529 530 ASSERT(matchBegin <= matchEnd); 531 532 if (matchBegin == matchEnd) 533 return true; 534 535 switch (term.atom.quantityType) { 536 case QuantifierFixedCount: { 537 backTrack->begin = input.getPos(); 538 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { 539 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { 540 input.setPos(backTrack->begin); 541 return false; 542 } 543 } 544 return true; 545 } 546 547 case QuantifierGreedy: { 548 unsigned matchAmount = 0; 549 while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) 550 ++matchAmount; 551 backTrack->matchAmount = matchAmount; 552 return true; 553 } 554 555 case QuantifierNonGreedy: 556 backTrack->begin = input.getPos(); 557 backTrack->matchAmount = 0; 558 return true; 559 } 560 561 RELEASE_ASSERT_NOT_REACHED(); 562 return false; 563 } 564 565 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) 566 { 567 ASSERT(term.type == ByteTerm::TypeBackReference); 568 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); 569 570 unsigned matchBegin = output[(term.atom.subpatternId << 1)]; 571 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1]; 572 573 if (matchBegin == offsetNoMatch) 574 return false; 575 576 ASSERT(matchBegin <= matchEnd); 577 578 if (matchBegin == matchEnd) 579 return false; 580 581 switch (term.atom.quantityType) { 582 case QuantifierFixedCount: 583 // for quantityCount == 1, could rewind. 584 input.setPos(backTrack->begin); 585 break; 586 587 case QuantifierGreedy: 588 if (backTrack->matchAmount) { 589 --backTrack->matchAmount; 590 input.rewind(matchEnd - matchBegin); 591 return true; 592 } 593 break; 594 595 case QuantifierNonGreedy: 596 if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { 597 ++backTrack->matchAmount; 598 return true; 599 } 600 input.setPos(backTrack->begin); 601 break; 602 } 603 604 return false; 605 } 606 607 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) 608 { 609 if (term.capture()) { 610 unsigned subpatternId = term.atom.subpatternId; 611 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; 612 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; 613 } 614 } 615 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) 616 { 617 unsigned firstSubpatternId = term.atom.subpatternId; 618 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; 619 context->restoreOutput(output, firstSubpatternId, count); 620 } 621 JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) 622 { 623 while (backTrack->matchAmount) { 624 ParenthesesDisjunctionContext* context = backTrack->lastContext; 625 626 JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true); 627 if (result == JSRegExpMatch) 628 return JSRegExpMatch; 629 630 resetMatches(term, context); 631 popParenthesesDisjunctionContext(backTrack); 632 freeParenthesesDisjunctionContext(context); 633 634 if (result != JSRegExpNoMatch) 635 return result; 636 } 637 638 return JSRegExpNoMatch; 639 } 640 641 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) 642 { 643 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); 644 ASSERT(term.atom.quantityCount == 1); 645 646 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); 647 648 switch (term.atom.quantityType) { 649 case QuantifierGreedy: { 650 // set this speculatively; if we get to the parens end this will be true. 651 backTrack->begin = input.getPos(); 652 break; 653 } 654 case QuantifierNonGreedy: { 655 backTrack->begin = notFound; 656 context->term += term.atom.parenthesesWidth; 657 return true; 658 } 659 case QuantifierFixedCount: 660 break; 661 } 662 663 if (term.capture()) { 664 unsigned subpatternId = term.atom.subpatternId; 665 output[(subpatternId << 1)] = input.getPos() - term.inputPosition; 666 } 667 668 return true; 669 } 670 671 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) 672 { 673 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); 674 ASSERT(term.atom.quantityCount == 1); 675 676 if (term.capture()) { 677 unsigned subpatternId = term.atom.subpatternId; 678 output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; 679 } 680 681 if (term.atom.quantityType == QuantifierFixedCount) 682 return true; 683 684 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); 685 return backTrack->begin != input.getPos(); 686 } 687 688 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) 689 { 690 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); 691 ASSERT(term.atom.quantityCount == 1); 692 693 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); 694 695 if (term.capture()) { 696 unsigned subpatternId = term.atom.subpatternId; 697 output[(subpatternId << 1)] = offsetNoMatch; 698 output[(subpatternId << 1) + 1] = offsetNoMatch; 699 } 700 701 switch (term.atom.quantityType) { 702 case QuantifierGreedy: 703 // if we backtrack to this point, there is another chance - try matching nothing. 704 ASSERT(backTrack->begin != notFound); 705 backTrack->begin = notFound; 706 context->term += term.atom.parenthesesWidth; 707 return true; 708 case QuantifierNonGreedy: 709 ASSERT(backTrack->begin != notFound); 710 FALLTHROUGH; 711 case QuantifierFixedCount: 712 break; 713 } 714 715 return false; 716 } 717 718 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) 719 { 720 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); 721 ASSERT(term.atom.quantityCount == 1); 722 723 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); 724 725 switch (term.atom.quantityType) { 726 case QuantifierGreedy: 727 if (backTrack->begin == notFound) { 728 context->term -= term.atom.parenthesesWidth; 729 return false; 730 } 731 FALLTHROUGH; 732 case QuantifierNonGreedy: 733 if (backTrack->begin == notFound) { 734 backTrack->begin = input.getPos(); 735 if (term.capture()) { 736 // Technically this access to inputPosition should be accessing the begin term's 737 // inputPosition, but for repeats other than fixed these values should be 738 // the same anyway! (We don't pre-check for greedy or non-greedy matches.) 739 ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin); 740 ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); 741 unsigned subpatternId = term.atom.subpatternId; 742 output[subpatternId << 1] = input.getPos() + term.inputPosition; 743 } 744 context->term -= term.atom.parenthesesWidth; 745 return true; 746 } 747 FALLTHROUGH; 748 case QuantifierFixedCount: 749 break; 750 } 751 752 return false; 753 } 754 755 bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) 756 { 757 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); 758 ASSERT(term.atom.quantityType == QuantifierGreedy); 759 ASSERT(term.atom.quantityCount == quantifyInfinite); 760 ASSERT(!term.capture()); 761 762 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); 763 backTrack->begin = input.getPos(); 764 return true; 765 } 766 767 bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context) 768 { 769 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd); 770 771 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); 772 // Empty match is a failed match. 773 if (backTrack->begin == input.getPos()) 774 return false; 775 776 // Successful match! Okay, what's next? - loop around and try to match moar! 777 context->term -= (term.atom.parenthesesWidth + 1); 778 return true; 779 } 780 781 bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) 782 { 783 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); 784 ASSERT(term.atom.quantityType == QuantifierGreedy); 785 ASSERT(term.atom.quantityCount == quantifyInfinite); 786 ASSERT(!term.capture()); 787 788 // If we backtrack to this point, we have failed to match this iteration of the parens. 789 // Since this is greedy / zero minimum a failed is also accepted as a match! 790 context->term += term.atom.parenthesesWidth; 791 return true; 792 } 793 794 bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*) 795 { 796 // 'Terminal' parentheses are at the end of the regex, and as such a match past end 797 // should always be returned as a successful match - we should never backtrack to here. 798 RELEASE_ASSERT_NOT_REACHED(); 799 return false; 800 } 801 802 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) 803 { 804 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); 805 ASSERT(term.atom.quantityCount == 1); 806 807 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); 808 809 backTrack->begin = input.getPos(); 810 return true; 811 } 812 813 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) 814 { 815 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); 816 ASSERT(term.atom.quantityCount == 1); 817 818 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); 819 820 input.setPos(backTrack->begin); 821 822 // We've reached the end of the parens; if they are inverted, this is failure. 823 if (term.invert()) { 824 context->term -= term.atom.parenthesesWidth; 825 return false; 826 } 827 828 return true; 829 } 830 831 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) 832 { 833 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); 834 ASSERT(term.atom.quantityCount == 1); 835 836 // We've failed to match parens; if they are inverted, this is win! 837 if (term.invert()) { 838 context->term += term.atom.parenthesesWidth; 839 return true; 840 } 841 842 return false; 843 } 844 845 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) 846 { 847 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); 848 ASSERT(term.atom.quantityCount == 1); 849 850 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); 851 852 input.setPos(backTrack->begin); 853 854 context->term -= term.atom.parenthesesWidth; 855 return false; 856 } 857 858 JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context) 859 { 860 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); 861 862 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); 863 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; 864 865 backTrack->matchAmount = 0; 866 backTrack->lastContext = 0; 867 868 switch (term.atom.quantityType) { 869 case QuantifierFixedCount: { 870 // While we haven't yet reached our fixed limit, 871 while (backTrack->matchAmount < term.atom.quantityCount) { 872 // Try to do a match, and it it succeeds, add it to the list. 873 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 874 JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); 875 if (result == JSRegExpMatch) 876 appendParenthesesDisjunctionContext(backTrack, context); 877 else { 878 // The match failed; try to find an alternate point to carry on from. 879 resetMatches(term, context); 880 freeParenthesesDisjunctionContext(context); 881 882 if (result != JSRegExpNoMatch) 883 return result; 884 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); 885 if (backtrackResult != JSRegExpMatch) 886 return backtrackResult; 887 } 888 } 889 890 ASSERT(backTrack->matchAmount == term.atom.quantityCount); 891 ParenthesesDisjunctionContext* context = backTrack->lastContext; 892 recordParenthesesMatch(term, context); 893 return JSRegExpMatch; 894 } 895 896 case QuantifierGreedy: { 897 while (backTrack->matchAmount < term.atom.quantityCount) { 898 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 899 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); 900 if (result == JSRegExpMatch) 901 appendParenthesesDisjunctionContext(backTrack, context); 902 else { 903 resetMatches(term, context); 904 freeParenthesesDisjunctionContext(context); 905 906 if (result != JSRegExpNoMatch) 907 return result; 908 909 break; 910 } 911 } 912 913 if (backTrack->matchAmount) { 914 ParenthesesDisjunctionContext* context = backTrack->lastContext; 915 recordParenthesesMatch(term, context); 916 } 917 return JSRegExpMatch; 918 } 919 920 case QuantifierNonGreedy: 921 return JSRegExpMatch; 922 } 923 924 RELEASE_ASSERT_NOT_REACHED(); 925 return JSRegExpErrorNoMatch; 926 } 927 928 // Rules for backtracking differ depending on whether this is greedy or non-greedy. 929 // 930 // Greedy matches never should try just adding more - you should already have done 931 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where 932 // you backtrack an item off the list needs checking, since we'll never have matched 933 // the one less case. Tracking forwards, still add as much as possible. 934 // 935 // Non-greedy, we've already done the one less case, so don't match on popping. 936 // We haven't done the one more case, so always try to add that. 937 // 938 JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context) 939 { 940 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); 941 942 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); 943 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; 944 945 switch (term.atom.quantityType) { 946 case QuantifierFixedCount: { 947 ASSERT(backTrack->matchAmount == term.atom.quantityCount); 948 949 ParenthesesDisjunctionContext* context = 0; 950 JSRegExpResult result = parenthesesDoBacktrack(term, backTrack); 951 952 if (result != JSRegExpMatch) 953 return result; 954 955 // While we haven't yet reached our fixed limit, 956 while (backTrack->matchAmount < term.atom.quantityCount) { 957 // Try to do a match, and it it succeeds, add it to the list. 958 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 959 result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); 960 961 if (result == JSRegExpMatch) 962 appendParenthesesDisjunctionContext(backTrack, context); 963 else { 964 // The match failed; try to find an alternate point to carry on from. 965 resetMatches(term, context); 966 freeParenthesesDisjunctionContext(context); 967 968 if (result != JSRegExpNoMatch) 969 return result; 970 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); 971 if (backtrackResult != JSRegExpMatch) 972 return backtrackResult; 973 } 974 } 975 976 ASSERT(backTrack->matchAmount == term.atom.quantityCount); 977 context = backTrack->lastContext; 978 recordParenthesesMatch(term, context); 979 return JSRegExpMatch; 980 } 981 982 case QuantifierGreedy: { 983 if (!backTrack->matchAmount) 984 return JSRegExpNoMatch; 985 986 ParenthesesDisjunctionContext* context = backTrack->lastContext; 987 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); 988 if (result == JSRegExpMatch) { 989 while (backTrack->matchAmount < term.atom.quantityCount) { 990 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 991 JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); 992 if (parenthesesResult == JSRegExpMatch) 993 appendParenthesesDisjunctionContext(backTrack, context); 994 else { 995 resetMatches(term, context); 996 freeParenthesesDisjunctionContext(context); 997 998 if (parenthesesResult != JSRegExpNoMatch) 999 return parenthesesResult; 1000 1001 break; 1002 } 1003 } 1004 } else { 1005 resetMatches(term, context); 1006 popParenthesesDisjunctionContext(backTrack); 1007 freeParenthesesDisjunctionContext(context); 1008 1009 if (result != JSRegExpNoMatch) 1010 return result; 1011 } 1012 1013 if (backTrack->matchAmount) { 1014 ParenthesesDisjunctionContext* context = backTrack->lastContext; 1015 recordParenthesesMatch(term, context); 1016 } 1017 return JSRegExpMatch; 1018 } 1019 1020 case QuantifierNonGreedy: { 1021 // If we've not reached the limit, try to add one more match. 1022 if (backTrack->matchAmount < term.atom.quantityCount) { 1023 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 1024 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); 1025 if (result == JSRegExpMatch) { 1026 appendParenthesesDisjunctionContext(backTrack, context); 1027 recordParenthesesMatch(term, context); 1028 return JSRegExpMatch; 1029 } 1030 1031 resetMatches(term, context); 1032 freeParenthesesDisjunctionContext(context); 1033 1034 if (result != JSRegExpNoMatch) 1035 return result; 1036 } 1037 1038 // Nope - okay backtrack looking for an alternative. 1039 while (backTrack->matchAmount) { 1040 ParenthesesDisjunctionContext* context = backTrack->lastContext; 1041 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); 1042 if (result == JSRegExpMatch) { 1043 // successful backtrack! we're back in the game! 1044 if (backTrack->matchAmount) { 1045 context = backTrack->lastContext; 1046 recordParenthesesMatch(term, context); 1047 } 1048 return JSRegExpMatch; 1049 } 1050 1051 // pop a match off the stack 1052 resetMatches(term, context); 1053 popParenthesesDisjunctionContext(backTrack); 1054 freeParenthesesDisjunctionContext(context); 1055 1056 if (result != JSRegExpNoMatch) 1057 return result; 1058 } 1059 1060 return JSRegExpNoMatch; 1061 } 1062 } 1063 1064 RELEASE_ASSERT_NOT_REACHED(); 1065 return JSRegExpErrorNoMatch; 1066 } 1067 1068 bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context) 1069 { 1070 UNUSED_PARAM(term); 1071 unsigned matchBegin = context->matchBegin; 1072 1073 if (matchBegin) { 1074 for (matchBegin--; true; matchBegin--) { 1075 if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) { 1076 ++matchBegin; 1077 break; 1078 } 1079 1080 if (!matchBegin) 1081 break; 1082 } 1083 } 1084 1085 unsigned matchEnd = input.getPos(); 1086 1087 for (; (matchEnd != input.end()) 1088 && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { } 1089 1090 if (((matchBegin && term.anchors.m_bol) 1091 || ((matchEnd != input.end()) && term.anchors.m_eol)) 1092 && !pattern->m_multiline) 1093 return false; 1094 1095 context->matchBegin = matchBegin; 1096 context->matchEnd = matchEnd; 1097 return true; 1098 } 1099 1100#define MATCH_NEXT() { ++context->term; goto matchAgain; } 1101#define BACKTRACK() { --context->term; goto backtrack; } 1102#define currentTerm() (disjunction->terms[context->term]) 1103 JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) 1104 { 1105 if (!--remainingMatchCount) 1106 return JSRegExpErrorHitLimit; 1107 1108 if (btrack) 1109 BACKTRACK(); 1110 1111 context->matchBegin = input.getPos(); 1112 context->term = 0; 1113 1114 matchAgain: 1115 ASSERT(context->term < static_cast<int>(disjunction->terms.size())); 1116 1117 switch (currentTerm().type) { 1118 case ByteTerm::TypeSubpatternBegin: 1119 MATCH_NEXT(); 1120 case ByteTerm::TypeSubpatternEnd: 1121 context->matchEnd = input.getPos(); 1122 return JSRegExpMatch; 1123 1124 case ByteTerm::TypeBodyAlternativeBegin: 1125 MATCH_NEXT(); 1126 case ByteTerm::TypeBodyAlternativeDisjunction: 1127 case ByteTerm::TypeBodyAlternativeEnd: 1128 context->matchEnd = input.getPos(); 1129 return JSRegExpMatch; 1130 1131 case ByteTerm::TypeAlternativeBegin: 1132 MATCH_NEXT(); 1133 case ByteTerm::TypeAlternativeDisjunction: 1134 case ByteTerm::TypeAlternativeEnd: { 1135 int offset = currentTerm().alternative.end; 1136 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); 1137 backTrack->offset = offset; 1138 context->term += offset; 1139 MATCH_NEXT(); 1140 } 1141 1142 case ByteTerm::TypeAssertionBOL: 1143 if (matchAssertionBOL(currentTerm())) 1144 MATCH_NEXT(); 1145 BACKTRACK(); 1146 case ByteTerm::TypeAssertionEOL: 1147 if (matchAssertionEOL(currentTerm())) 1148 MATCH_NEXT(); 1149 BACKTRACK(); 1150 case ByteTerm::TypeAssertionWordBoundary: 1151 if (matchAssertionWordBoundary(currentTerm())) 1152 MATCH_NEXT(); 1153 BACKTRACK(); 1154 1155 case ByteTerm::TypePatternCharacterOnce: 1156 case ByteTerm::TypePatternCharacterFixed: { 1157 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { 1158 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) 1159 BACKTRACK(); 1160 } 1161 MATCH_NEXT(); 1162 } 1163 case ByteTerm::TypePatternCharacterGreedy: { 1164 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1165 unsigned matchAmount = 0; 1166 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { 1167 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) { 1168 input.uncheckInput(1); 1169 break; 1170 } 1171 ++matchAmount; 1172 } 1173 backTrack->matchAmount = matchAmount; 1174 1175 MATCH_NEXT(); 1176 } 1177 case ByteTerm::TypePatternCharacterNonGreedy: { 1178 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1179 backTrack->matchAmount = 0; 1180 MATCH_NEXT(); 1181 } 1182 1183 case ByteTerm::TypePatternCasedCharacterOnce: 1184 case ByteTerm::TypePatternCasedCharacterFixed: { 1185 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { 1186 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) 1187 BACKTRACK(); 1188 } 1189 MATCH_NEXT(); 1190 } 1191 case ByteTerm::TypePatternCasedCharacterGreedy: { 1192 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1193 unsigned matchAmount = 0; 1194 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { 1195 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) { 1196 input.uncheckInput(1); 1197 break; 1198 } 1199 ++matchAmount; 1200 } 1201 backTrack->matchAmount = matchAmount; 1202 1203 MATCH_NEXT(); 1204 } 1205 case ByteTerm::TypePatternCasedCharacterNonGreedy: { 1206 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1207 backTrack->matchAmount = 0; 1208 MATCH_NEXT(); 1209 } 1210 1211 case ByteTerm::TypeCharacterClass: 1212 if (matchCharacterClass(currentTerm(), context)) 1213 MATCH_NEXT(); 1214 BACKTRACK(); 1215 case ByteTerm::TypeBackReference: 1216 if (matchBackReference(currentTerm(), context)) 1217 MATCH_NEXT(); 1218 BACKTRACK(); 1219 case ByteTerm::TypeParenthesesSubpattern: { 1220 JSRegExpResult result = matchParentheses(currentTerm(), context); 1221 1222 if (result == JSRegExpMatch) { 1223 MATCH_NEXT(); 1224 } else if (result != JSRegExpNoMatch) 1225 return result; 1226 1227 BACKTRACK(); 1228 } 1229 case ByteTerm::TypeParenthesesSubpatternOnceBegin: 1230 if (matchParenthesesOnceBegin(currentTerm(), context)) 1231 MATCH_NEXT(); 1232 BACKTRACK(); 1233 case ByteTerm::TypeParenthesesSubpatternOnceEnd: 1234 if (matchParenthesesOnceEnd(currentTerm(), context)) 1235 MATCH_NEXT(); 1236 BACKTRACK(); 1237 case ByteTerm::TypeParenthesesSubpatternTerminalBegin: 1238 if (matchParenthesesTerminalBegin(currentTerm(), context)) 1239 MATCH_NEXT(); 1240 BACKTRACK(); 1241 case ByteTerm::TypeParenthesesSubpatternTerminalEnd: 1242 if (matchParenthesesTerminalEnd(currentTerm(), context)) 1243 MATCH_NEXT(); 1244 BACKTRACK(); 1245 case ByteTerm::TypeParentheticalAssertionBegin: 1246 if (matchParentheticalAssertionBegin(currentTerm(), context)) 1247 MATCH_NEXT(); 1248 BACKTRACK(); 1249 case ByteTerm::TypeParentheticalAssertionEnd: 1250 if (matchParentheticalAssertionEnd(currentTerm(), context)) 1251 MATCH_NEXT(); 1252 BACKTRACK(); 1253 1254 case ByteTerm::TypeCheckInput: 1255 if (input.checkInput(currentTerm().checkInputCount)) 1256 MATCH_NEXT(); 1257 BACKTRACK(); 1258 1259 case ByteTerm::TypeUncheckInput: 1260 input.uncheckInput(currentTerm().checkInputCount); 1261 MATCH_NEXT(); 1262 1263 case ByteTerm::TypeDotStarEnclosure: 1264 if (matchDotStarEnclosure(currentTerm(), context)) 1265 return JSRegExpMatch; 1266 BACKTRACK(); 1267 } 1268 1269 // We should never fall-through to here. 1270 RELEASE_ASSERT_NOT_REACHED(); 1271 1272 backtrack: 1273 ASSERT(context->term < static_cast<int>(disjunction->terms.size())); 1274 1275 switch (currentTerm().type) { 1276 case ByteTerm::TypeSubpatternBegin: 1277 return JSRegExpNoMatch; 1278 case ByteTerm::TypeSubpatternEnd: 1279 RELEASE_ASSERT_NOT_REACHED(); 1280 1281 case ByteTerm::TypeBodyAlternativeBegin: 1282 case ByteTerm::TypeBodyAlternativeDisjunction: { 1283 int offset = currentTerm().alternative.next; 1284 context->term += offset; 1285 if (offset > 0) 1286 MATCH_NEXT(); 1287 1288 if (input.atEnd()) 1289 return JSRegExpNoMatch; 1290 1291 input.next(); 1292 1293 context->matchBegin = input.getPos(); 1294 1295 if (currentTerm().alternative.onceThrough) 1296 context->term += currentTerm().alternative.next; 1297 1298 MATCH_NEXT(); 1299 } 1300 case ByteTerm::TypeBodyAlternativeEnd: 1301 RELEASE_ASSERT_NOT_REACHED(); 1302 1303 case ByteTerm::TypeAlternativeBegin: 1304 case ByteTerm::TypeAlternativeDisjunction: { 1305 int offset = currentTerm().alternative.next; 1306 context->term += offset; 1307 if (offset > 0) 1308 MATCH_NEXT(); 1309 BACKTRACK(); 1310 } 1311 case ByteTerm::TypeAlternativeEnd: { 1312 // We should never backtrack back into an alternative of the main body of the regex. 1313 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); 1314 unsigned offset = backTrack->offset; 1315 context->term -= offset; 1316 BACKTRACK(); 1317 } 1318 1319 case ByteTerm::TypeAssertionBOL: 1320 case ByteTerm::TypeAssertionEOL: 1321 case ByteTerm::TypeAssertionWordBoundary: 1322 BACKTRACK(); 1323 1324 case ByteTerm::TypePatternCharacterOnce: 1325 case ByteTerm::TypePatternCharacterFixed: 1326 case ByteTerm::TypePatternCharacterGreedy: 1327 case ByteTerm::TypePatternCharacterNonGreedy: 1328 if (backtrackPatternCharacter(currentTerm(), context)) 1329 MATCH_NEXT(); 1330 BACKTRACK(); 1331 case ByteTerm::TypePatternCasedCharacterOnce: 1332 case ByteTerm::TypePatternCasedCharacterFixed: 1333 case ByteTerm::TypePatternCasedCharacterGreedy: 1334 case ByteTerm::TypePatternCasedCharacterNonGreedy: 1335 if (backtrackPatternCasedCharacter(currentTerm(), context)) 1336 MATCH_NEXT(); 1337 BACKTRACK(); 1338 case ByteTerm::TypeCharacterClass: 1339 if (backtrackCharacterClass(currentTerm(), context)) 1340 MATCH_NEXT(); 1341 BACKTRACK(); 1342 case ByteTerm::TypeBackReference: 1343 if (backtrackBackReference(currentTerm(), context)) 1344 MATCH_NEXT(); 1345 BACKTRACK(); 1346 case ByteTerm::TypeParenthesesSubpattern: { 1347 JSRegExpResult result = backtrackParentheses(currentTerm(), context); 1348 1349 if (result == JSRegExpMatch) { 1350 MATCH_NEXT(); 1351 } else if (result != JSRegExpNoMatch) 1352 return result; 1353 1354 BACKTRACK(); 1355 } 1356 case ByteTerm::TypeParenthesesSubpatternOnceBegin: 1357 if (backtrackParenthesesOnceBegin(currentTerm(), context)) 1358 MATCH_NEXT(); 1359 BACKTRACK(); 1360 case ByteTerm::TypeParenthesesSubpatternOnceEnd: 1361 if (backtrackParenthesesOnceEnd(currentTerm(), context)) 1362 MATCH_NEXT(); 1363 BACKTRACK(); 1364 case ByteTerm::TypeParenthesesSubpatternTerminalBegin: 1365 if (backtrackParenthesesTerminalBegin(currentTerm(), context)) 1366 MATCH_NEXT(); 1367 BACKTRACK(); 1368 case ByteTerm::TypeParenthesesSubpatternTerminalEnd: 1369 if (backtrackParenthesesTerminalEnd(currentTerm(), context)) 1370 MATCH_NEXT(); 1371 BACKTRACK(); 1372 case ByteTerm::TypeParentheticalAssertionBegin: 1373 if (backtrackParentheticalAssertionBegin(currentTerm(), context)) 1374 MATCH_NEXT(); 1375 BACKTRACK(); 1376 case ByteTerm::TypeParentheticalAssertionEnd: 1377 if (backtrackParentheticalAssertionEnd(currentTerm(), context)) 1378 MATCH_NEXT(); 1379 BACKTRACK(); 1380 1381 case ByteTerm::TypeCheckInput: 1382 input.uncheckInput(currentTerm().checkInputCount); 1383 BACKTRACK(); 1384 1385 case ByteTerm::TypeUncheckInput: 1386 input.checkInput(currentTerm().checkInputCount); 1387 BACKTRACK(); 1388 1389 case ByteTerm::TypeDotStarEnclosure: 1390 RELEASE_ASSERT_NOT_REACHED(); 1391 } 1392 1393 RELEASE_ASSERT_NOT_REACHED(); 1394 return JSRegExpErrorNoMatch; 1395 } 1396 1397 JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) 1398 { 1399 JSRegExpResult result = matchDisjunction(disjunction, context, btrack); 1400 1401 if (result == JSRegExpMatch) { 1402 while (context->matchBegin == context->matchEnd) { 1403 result = matchDisjunction(disjunction, context, true); 1404 if (result != JSRegExpMatch) 1405 return result; 1406 } 1407 return JSRegExpMatch; 1408 } 1409 1410 return result; 1411 } 1412 1413 unsigned interpret() 1414 { 1415 if (!input.isAvailableInput(0)) 1416 return offsetNoMatch; 1417 1418 for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i) 1419 output[i << 1] = offsetNoMatch; 1420 1421 allocatorPool = pattern->m_allocator->startAllocator(); 1422 RELEASE_ASSERT(allocatorPool); 1423 1424 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); 1425 1426 JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false); 1427 if (result == JSRegExpMatch) { 1428 output[0] = context->matchBegin; 1429 output[1] = context->matchEnd; 1430 } 1431 1432 freeDisjunctionContext(context); 1433 1434 pattern->m_allocator->stopAllocator(); 1435 1436 ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch)); 1437 return output[0]; 1438 } 1439 1440 Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start) 1441 : pattern(pattern) 1442 , output(output) 1443 , input(input, start, length) 1444 , allocatorPool(0) 1445 , remainingMatchCount(matchLimit) 1446 { 1447 } 1448 1449private: 1450 BytecodePattern* pattern; 1451 unsigned* output; 1452 InputStream input; 1453 BumpPointerPool* allocatorPool; 1454 unsigned remainingMatchCount; 1455}; 1456 1457class ByteCompiler { 1458 struct ParenthesesStackEntry { 1459 unsigned beginTerm; 1460 unsigned savedAlternativeIndex; 1461 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) 1462 : beginTerm(beginTerm) 1463 , savedAlternativeIndex(savedAlternativeIndex) 1464 { 1465 } 1466 }; 1467 1468public: 1469 ByteCompiler(YarrPattern& pattern) 1470 : m_pattern(pattern) 1471 { 1472 m_currentAlternativeIndex = 0; 1473 } 1474 1475 PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator) 1476 { 1477 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); 1478 emitDisjunction(m_pattern.m_body); 1479 regexEnd(); 1480 1481 return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator)); 1482 } 1483 1484 void checkInput(unsigned count) 1485 { 1486 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); 1487 } 1488 1489 void uncheckInput(unsigned count) 1490 { 1491 m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count)); 1492 } 1493 1494 void assertionBOL(unsigned inputPosition) 1495 { 1496 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); 1497 } 1498 1499 void assertionEOL(unsigned inputPosition) 1500 { 1501 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); 1502 } 1503 1504 void assertionWordBoundary(bool invert, unsigned inputPosition) 1505 { 1506 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); 1507 } 1508 1509 void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) 1510 { 1511 if (m_pattern.m_ignoreCase) { 1512 ASSERT(u_tolower(ch) <= 0xFFFF); 1513 ASSERT(u_toupper(ch) <= 0xFFFF); 1514 1515 UChar lo = u_tolower(ch); 1516 UChar hi = u_toupper(ch); 1517 1518 if (lo != hi) { 1519 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); 1520 return; 1521 } 1522 } 1523 1524 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); 1525 } 1526 1527 void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) 1528 { 1529 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); 1530 1531 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet(); 1532 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; 1533 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1534 } 1535 1536 void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) 1537 { 1538 ASSERT(subpatternId); 1539 1540 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); 1541 1542 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet(); 1543 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; 1544 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1545 } 1546 1547 void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) 1548 { 1549 int beginTerm = m_bodyDisjunction->terms.size(); 1550 1551 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); 1552 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1553 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); 1554 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; 1555 1556 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); 1557 m_currentAlternativeIndex = beginTerm + 1; 1558 } 1559 1560 void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) 1561 { 1562 int beginTerm = m_bodyDisjunction->terms.size(); 1563 1564 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition)); 1565 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1566 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); 1567 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; 1568 1569 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); 1570 m_currentAlternativeIndex = beginTerm + 1; 1571 } 1572 1573 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) 1574 { 1575 // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin, 1576 // then fix this up at the end! - simplifying this should make it much clearer. 1577 // https://bugs.webkit.org/show_bug.cgi?id=50136 1578 1579 int beginTerm = m_bodyDisjunction->terms.size(); 1580 1581 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); 1582 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1583 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); 1584 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; 1585 1586 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); 1587 m_currentAlternativeIndex = beginTerm + 1; 1588 } 1589 1590 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) 1591 { 1592 int beginTerm = m_bodyDisjunction->terms.size(); 1593 1594 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0)); 1595 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1596 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); 1597 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; 1598 1599 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); 1600 m_currentAlternativeIndex = beginTerm + 1; 1601 } 1602 1603 void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) 1604 { 1605 unsigned beginTerm = popParenthesesStack(); 1606 closeAlternative(beginTerm + 1); 1607 unsigned endTerm = m_bodyDisjunction->terms.size(); 1608 1609 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin); 1610 1611 bool invert = m_bodyDisjunction->terms[beginTerm].invert(); 1612 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; 1613 1614 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition)); 1615 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; 1616 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; 1617 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; 1618 1619 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); 1620 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; 1621 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); 1622 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; 1623 } 1624 1625 void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored) 1626 { 1627 m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored)); 1628 } 1629 1630 unsigned popParenthesesStack() 1631 { 1632 ASSERT(m_parenthesesStack.size()); 1633 int stackEnd = m_parenthesesStack.size() - 1; 1634 unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; 1635 m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; 1636 m_parenthesesStack.shrink(stackEnd); 1637 1638 ASSERT(beginTerm < m_bodyDisjunction->terms.size()); 1639 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); 1640 1641 return beginTerm; 1642 } 1643 1644#ifndef NDEBUG 1645 void dumpDisjunction(ByteDisjunction* disjunction) 1646 { 1647 dataLogF("ByteDisjunction(%p):\n\t", disjunction); 1648 for (unsigned i = 0; i < disjunction->terms.size(); ++i) 1649 dataLogF("{ %d } ", disjunction->terms[i].type); 1650 dataLogF("\n"); 1651 } 1652#endif 1653 1654 void closeAlternative(int beginTerm) 1655 { 1656 int origBeginTerm = beginTerm; 1657 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); 1658 int endIndex = m_bodyDisjunction->terms.size(); 1659 1660 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; 1661 1662 if (!m_bodyDisjunction->terms[beginTerm].alternative.next) 1663 m_bodyDisjunction->terms.remove(beginTerm); 1664 else { 1665 while (m_bodyDisjunction->terms[beginTerm].alternative.next) { 1666 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; 1667 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); 1668 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; 1669 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; 1670 } 1671 1672 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; 1673 1674 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); 1675 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; 1676 } 1677 } 1678 1679 void closeBodyAlternative() 1680 { 1681 int beginTerm = 0; 1682 int origBeginTerm = 0; 1683 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); 1684 int endIndex = m_bodyDisjunction->terms.size(); 1685 1686 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; 1687 1688 while (m_bodyDisjunction->terms[beginTerm].alternative.next) { 1689 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; 1690 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); 1691 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; 1692 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; 1693 } 1694 1695 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; 1696 1697 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); 1698 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; 1699 } 1700 1701 void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) 1702 { 1703 unsigned beginTerm = popParenthesesStack(); 1704 closeAlternative(beginTerm + 1); 1705 unsigned endTerm = m_bodyDisjunction->terms.size(); 1706 1707 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); 1708 1709 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; 1710 1711 bool capture = parenthesesBegin.capture(); 1712 unsigned subpatternId = parenthesesBegin.atom.subpatternId; 1713 1714 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; 1715 OwnPtr<ByteDisjunction> parenthesesDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); 1716 1717 unsigned firstTermInParentheses = beginTerm + 1; 1718 parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2); 1719 1720 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); 1721 for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses) 1722 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); 1723 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); 1724 1725 m_bodyDisjunction->terms.shrink(beginTerm); 1726 1727 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition)); 1728 m_allParenthesesInfo.append(parenthesesDisjunction.release()); 1729 1730 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); 1731 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; 1732 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; 1733 } 1734 1735 void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) 1736 { 1737 unsigned beginTerm = popParenthesesStack(); 1738 closeAlternative(beginTerm + 1); 1739 unsigned endTerm = m_bodyDisjunction->terms.size(); 1740 1741 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); 1742 1743 bool capture = m_bodyDisjunction->terms[beginTerm].capture(); 1744 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; 1745 1746 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition)); 1747 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; 1748 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; 1749 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; 1750 1751 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); 1752 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; 1753 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); 1754 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; 1755 } 1756 1757 void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) 1758 { 1759 unsigned beginTerm = popParenthesesStack(); 1760 closeAlternative(beginTerm + 1); 1761 unsigned endTerm = m_bodyDisjunction->terms.size(); 1762 1763 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); 1764 1765 bool capture = m_bodyDisjunction->terms[beginTerm].capture(); 1766 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; 1767 1768 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition)); 1769 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; 1770 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; 1771 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; 1772 1773 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); 1774 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; 1775 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); 1776 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; 1777 } 1778 1779 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) 1780 { 1781 m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); 1782 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough)); 1783 m_bodyDisjunction->terms[0].frameLocation = 0; 1784 m_currentAlternativeIndex = 0; 1785 } 1786 1787 void regexEnd() 1788 { 1789 closeBodyAlternative(); 1790 } 1791 1792 void alternativeBodyDisjunction(bool onceThrough) 1793 { 1794 int newAlternativeIndex = m_bodyDisjunction->terms.size(); 1795 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; 1796 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough)); 1797 1798 m_currentAlternativeIndex = newAlternativeIndex; 1799 } 1800 1801 void alternativeDisjunction() 1802 { 1803 int newAlternativeIndex = m_bodyDisjunction->terms.size(); 1804 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; 1805 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); 1806 1807 m_currentAlternativeIndex = newAlternativeIndex; 1808 } 1809 1810 void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0) 1811 { 1812 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { 1813 unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; 1814 1815 PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); 1816 1817 if (alt) { 1818 if (disjunction == m_pattern.m_body) 1819 alternativeBodyDisjunction(alternative->onceThrough()); 1820 else 1821 alternativeDisjunction(); 1822 } 1823 1824 unsigned minimumSize = alternative->m_minimumSize; 1825 ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked); 1826 unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; 1827 1828 if (countToCheck) { 1829 checkInput(countToCheck); 1830 currentCountAlreadyChecked += countToCheck; 1831 } 1832 1833 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { 1834 PatternTerm& term = alternative->m_terms[i]; 1835 1836 switch (term.type) { 1837 case PatternTerm::TypeAssertionBOL: 1838 assertionBOL(currentCountAlreadyChecked - term.inputPosition); 1839 break; 1840 1841 case PatternTerm::TypeAssertionEOL: 1842 assertionEOL(currentCountAlreadyChecked - term.inputPosition); 1843 break; 1844 1845 case PatternTerm::TypeAssertionWordBoundary: 1846 assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition); 1847 break; 1848 1849 case PatternTerm::TypePatternCharacter: 1850 atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); 1851 break; 1852 1853 case PatternTerm::TypeCharacterClass: 1854 atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); 1855 break; 1856 1857 case PatternTerm::TypeBackReference: 1858 atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); 1859 break; 1860 1861 case PatternTerm::TypeForwardReference: 1862 break; 1863 1864 case PatternTerm::TypeParenthesesSubpattern: { 1865 unsigned disjunctionAlreadyCheckedCount = 0; 1866 if (term.quantityCount == 1 && !term.parentheses.isCopy) { 1867 unsigned alternativeFrameLocation = term.frameLocation; 1868 // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame. 1869 if (term.quantityType == QuantifierFixedCount) 1870 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; 1871 else 1872 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; 1873 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; 1874 atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, alternativeFrameLocation); 1875 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); 1876 atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); 1877 } else if (term.parentheses.isTerminal) { 1878 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; 1879 atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce); 1880 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); 1881 atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); 1882 } else { 1883 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; 1884 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, 0); 1885 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); 1886 atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); 1887 } 1888 break; 1889 } 1890 1891 case PatternTerm::TypeParentheticalAssertion: { 1892 unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion; 1893 1894 ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition)); 1895 unsigned positiveInputOffset = currentCountAlreadyChecked - static_cast<unsigned>(term.inputPosition); 1896 unsigned uncheckAmount = 0; 1897 if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) { 1898 uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize; 1899 uncheckInput(uncheckAmount); 1900 currentCountAlreadyChecked -= uncheckAmount; 1901 } 1902 1903 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation); 1904 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount); 1905 atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType); 1906 if (uncheckAmount) { 1907 checkInput(uncheckAmount); 1908 currentCountAlreadyChecked += uncheckAmount; 1909 } 1910 break; 1911 } 1912 1913 case PatternTerm::TypeDotStarEnclosure: 1914 assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor); 1915 break; 1916 } 1917 } 1918 } 1919 } 1920 1921private: 1922 YarrPattern& m_pattern; 1923 OwnPtr<ByteDisjunction> m_bodyDisjunction; 1924 unsigned m_currentAlternativeIndex; 1925 Vector<ParenthesesStackEntry> m_parenthesesStack; 1926 Vector<OwnPtr<ByteDisjunction>> m_allParenthesesInfo; 1927}; 1928 1929PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator) 1930{ 1931 return ByteCompiler(pattern).compile(allocator); 1932} 1933 1934unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output) 1935{ 1936 if (input.is8Bit()) 1937 return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret(); 1938 return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret(); 1939} 1940 1941unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output) 1942{ 1943 return Interpreter<LChar>(bytecode, output, input, length, start).interpret(); 1944} 1945 1946unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output) 1947{ 1948 return Interpreter<UChar>(bytecode, output, input, length, start).interpret(); 1949} 1950 1951// These should be the same for both UChar & LChar. 1952COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter); 1953COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass); 1954COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference); 1955COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative); 1956COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion); 1957COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce); 1958COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses); 1959 1960 1961} } 1962