1/* 2 * Copyright (C) 2009, 2013 Apple Inc. All rights reserved. 3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#include "config.h" 28#include "YarrPattern.h" 29 30#include "Yarr.h" 31#include "YarrCanonicalizeUCS2.h" 32#include "YarrParser.h" 33#include <wtf/Vector.h> 34 35using namespace WTF; 36 37namespace JSC { namespace Yarr { 38 39#include "RegExpJitTables.h" 40 41class CharacterClassConstructor { 42public: 43 CharacterClassConstructor(bool isCaseInsensitive = false) 44 : m_isCaseInsensitive(isCaseInsensitive) 45 { 46 } 47 48 void reset() 49 { 50 m_matches.clear(); 51 m_ranges.clear(); 52 m_matchesUnicode.clear(); 53 m_rangesUnicode.clear(); 54 } 55 56 void append(const CharacterClass* other) 57 { 58 for (size_t i = 0; i < other->m_matches.size(); ++i) 59 addSorted(m_matches, other->m_matches[i]); 60 for (size_t i = 0; i < other->m_ranges.size(); ++i) 61 addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); 62 for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) 63 addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); 64 for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) 65 addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); 66 } 67 68 void putChar(UChar ch) 69 { 70 // Handle ascii cases. 71 if (ch <= 0x7f) { 72 if (m_isCaseInsensitive && isASCIIAlpha(ch)) { 73 addSorted(m_matches, toASCIIUpper(ch)); 74 addSorted(m_matches, toASCIILower(ch)); 75 } else 76 addSorted(m_matches, ch); 77 return; 78 } 79 80 // Simple case, not a case-insensitive match. 81 if (!m_isCaseInsensitive) { 82 addSorted(m_matchesUnicode, ch); 83 return; 84 } 85 86 // Add multiple matches, if necessary. 87 const UCS2CanonicalizationRange* info = rangeInfoFor(ch); 88 if (info->type == CanonicalizeUnique) 89 addSorted(m_matchesUnicode, ch); 90 else 91 putUnicodeIgnoreCase(ch, info); 92 } 93 94 void putUnicodeIgnoreCase(UChar ch, const UCS2CanonicalizationRange* info) 95 { 96 ASSERT(m_isCaseInsensitive); 97 ASSERT(ch > 0x7f); 98 ASSERT(ch >= info->begin && ch <= info->end); 99 ASSERT(info->type != CanonicalizeUnique); 100 if (info->type == CanonicalizeSet) { 101 for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) 102 addSorted(m_matchesUnicode, ch); 103 } else { 104 addSorted(m_matchesUnicode, ch); 105 addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); 106 } 107 } 108 109 void putRange(UChar lo, UChar hi) 110 { 111 if (lo <= 0x7f) { 112 char asciiLo = lo; 113 char asciiHi = std::min(hi, (UChar)0x7f); 114 addSortedRange(m_ranges, lo, asciiHi); 115 116 if (m_isCaseInsensitive) { 117 if ((asciiLo <= 'Z') && (asciiHi >= 'A')) 118 addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); 119 if ((asciiLo <= 'z') && (asciiHi >= 'a')) 120 addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); 121 } 122 } 123 if (hi <= 0x7f) 124 return; 125 126 lo = std::max(lo, (UChar)0x80); 127 addSortedRange(m_rangesUnicode, lo, hi); 128 129 if (!m_isCaseInsensitive) 130 return; 131 132 const UCS2CanonicalizationRange* info = rangeInfoFor(lo); 133 while (true) { 134 // Handle the range [lo .. end] 135 UChar end = std::min<UChar>(info->end, hi); 136 137 switch (info->type) { 138 case CanonicalizeUnique: 139 // Nothing to do - no canonical equivalents. 140 break; 141 case CanonicalizeSet: { 142 UChar ch; 143 for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) 144 addSorted(m_matchesUnicode, ch); 145 break; 146 } 147 case CanonicalizeRangeLo: 148 addSortedRange(m_rangesUnicode, lo + info->value, end + info->value); 149 break; 150 case CanonicalizeRangeHi: 151 addSortedRange(m_rangesUnicode, lo - info->value, end - info->value); 152 break; 153 case CanonicalizeAlternatingAligned: 154 // Use addSortedRange since there is likely an abutting range to combine with. 155 if (lo & 1) 156 addSortedRange(m_rangesUnicode, lo - 1, lo - 1); 157 if (!(end & 1)) 158 addSortedRange(m_rangesUnicode, end + 1, end + 1); 159 break; 160 case CanonicalizeAlternatingUnaligned: 161 // Use addSortedRange since there is likely an abutting range to combine with. 162 if (!(lo & 1)) 163 addSortedRange(m_rangesUnicode, lo - 1, lo - 1); 164 if (end & 1) 165 addSortedRange(m_rangesUnicode, end + 1, end + 1); 166 break; 167 } 168 169 if (hi == end) 170 return; 171 172 ++info; 173 lo = info->begin; 174 }; 175 176 } 177 178 PassOwnPtr<CharacterClass> charClass() 179 { 180 OwnPtr<CharacterClass> characterClass = adoptPtr(new CharacterClass); 181 182 characterClass->m_matches.swap(m_matches); 183 characterClass->m_ranges.swap(m_ranges); 184 characterClass->m_matchesUnicode.swap(m_matchesUnicode); 185 characterClass->m_rangesUnicode.swap(m_rangesUnicode); 186 187 return characterClass.release(); 188 } 189 190private: 191 void addSorted(Vector<UChar>& matches, UChar ch) 192 { 193 unsigned pos = 0; 194 unsigned range = matches.size(); 195 196 // binary chop, find position to insert char. 197 while (range) { 198 unsigned index = range >> 1; 199 200 int val = matches[pos+index] - ch; 201 if (!val) 202 return; 203 else if (val > 0) 204 range = index; 205 else { 206 pos += (index+1); 207 range -= (index+1); 208 } 209 } 210 211 if (pos == matches.size()) 212 matches.append(ch); 213 else 214 matches.insert(pos, ch); 215 } 216 217 void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) 218 { 219 unsigned end = ranges.size(); 220 221 // Simple linear scan - I doubt there are that many ranges anyway... 222 // feel free to fix this with something faster (eg binary chop). 223 for (unsigned i = 0; i < end; ++i) { 224 // does the new range fall before the current position in the array 225 if (hi < ranges[i].begin) { 226 // optional optimization: concatenate appending ranges? - may not be worthwhile. 227 if (hi == (ranges[i].begin - 1)) { 228 ranges[i].begin = lo; 229 return; 230 } 231 ranges.insert(i, CharacterRange(lo, hi)); 232 return; 233 } 234 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining 235 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the 236 // end of the last range they concatenate, which is just as good. 237 if (lo <= (ranges[i].end + 1)) { 238 // found an intersect! we'll replace this entry in the array. 239 ranges[i].begin = std::min(ranges[i].begin, lo); 240 ranges[i].end = std::max(ranges[i].end, hi); 241 242 // now check if the new range can subsume any subsequent ranges. 243 unsigned next = i+1; 244 // each iteration of the loop we will either remove something from the list, or break the loop. 245 while (next < ranges.size()) { 246 if (ranges[next].begin <= (ranges[i].end + 1)) { 247 // the next entry now overlaps / concatenates this one. 248 ranges[i].end = std::max(ranges[i].end, ranges[next].end); 249 ranges.remove(next); 250 } else 251 break; 252 } 253 254 return; 255 } 256 } 257 258 // CharacterRange comes after all existing ranges. 259 ranges.append(CharacterRange(lo, hi)); 260 } 261 262 bool m_isCaseInsensitive; 263 264 Vector<UChar> m_matches; 265 Vector<CharacterRange> m_ranges; 266 Vector<UChar> m_matchesUnicode; 267 Vector<CharacterRange> m_rangesUnicode; 268}; 269 270class YarrPatternConstructor { 271public: 272 YarrPatternConstructor(YarrPattern& pattern) 273 : m_pattern(pattern) 274 , m_characterClassConstructor(pattern.m_ignoreCase) 275 , m_invertParentheticalAssertion(false) 276 { 277 OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction); 278 m_pattern.m_body = body.get(); 279 m_alternative = body->addNewAlternative(); 280 m_pattern.m_disjunctions.append(body.release()); 281 } 282 283 ~YarrPatternConstructor() 284 { 285 } 286 287 void reset() 288 { 289 m_pattern.reset(); 290 m_characterClassConstructor.reset(); 291 292 OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction); 293 m_pattern.m_body = body.get(); 294 m_alternative = body->addNewAlternative(); 295 m_pattern.m_disjunctions.append(body.release()); 296 } 297 298 void assertionBOL() 299 { 300 if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { 301 m_alternative->m_startsWithBOL = true; 302 m_alternative->m_containsBOL = true; 303 m_pattern.m_containsBOL = true; 304 } 305 m_alternative->m_terms.append(PatternTerm::BOL()); 306 } 307 void assertionEOL() 308 { 309 m_alternative->m_terms.append(PatternTerm::EOL()); 310 } 311 void assertionWordBoundary(bool invert) 312 { 313 m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); 314 } 315 316 void atomPatternCharacter(UChar ch) 317 { 318 // We handle case-insensitive checking of unicode characters which do have both 319 // cases by handling them as if they were defined using a CharacterClass. 320 if (!m_pattern.m_ignoreCase || isASCII(ch)) { 321 m_alternative->m_terms.append(PatternTerm(ch)); 322 return; 323 } 324 325 const UCS2CanonicalizationRange* info = rangeInfoFor(ch); 326 if (info->type == CanonicalizeUnique) { 327 m_alternative->m_terms.append(PatternTerm(ch)); 328 return; 329 } 330 331 m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); 332 OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass(); 333 m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false)); 334 m_pattern.m_userCharacterClasses.append(newCharacterClass.release()); 335 } 336 337 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) 338 { 339 switch (classID) { 340 case DigitClassID: 341 m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); 342 break; 343 case SpaceClassID: 344 m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); 345 break; 346 case WordClassID: 347 m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); 348 break; 349 case NewlineClassID: 350 m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); 351 break; 352 } 353 } 354 355 void atomCharacterClassBegin(bool invert = false) 356 { 357 m_invertCharacterClass = invert; 358 } 359 360 void atomCharacterClassAtom(UChar ch) 361 { 362 m_characterClassConstructor.putChar(ch); 363 } 364 365 void atomCharacterClassRange(UChar begin, UChar end) 366 { 367 m_characterClassConstructor.putRange(begin, end); 368 } 369 370 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) 371 { 372 ASSERT(classID != NewlineClassID); 373 374 switch (classID) { 375 case DigitClassID: 376 m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); 377 break; 378 379 case SpaceClassID: 380 m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); 381 break; 382 383 case WordClassID: 384 m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); 385 break; 386 387 default: 388 RELEASE_ASSERT_NOT_REACHED(); 389 } 390 } 391 392 void atomCharacterClassEnd() 393 { 394 OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass(); 395 m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass)); 396 m_pattern.m_userCharacterClasses.append(newCharacterClass.release()); 397 } 398 399 void atomParenthesesSubpatternBegin(bool capture = true) 400 { 401 unsigned subpatternId = m_pattern.m_numSubpatterns + 1; 402 if (capture) 403 m_pattern.m_numSubpatterns++; 404 405 OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative)); 406 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false)); 407 m_alternative = parenthesesDisjunction->addNewAlternative(); 408 m_pattern.m_disjunctions.append(parenthesesDisjunction.release()); 409 } 410 411 void atomParentheticalAssertionBegin(bool invert = false) 412 { 413 OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative)); 414 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert)); 415 m_alternative = parenthesesDisjunction->addNewAlternative(); 416 m_invertParentheticalAssertion = invert; 417 m_pattern.m_disjunctions.append(parenthesesDisjunction.release()); 418 } 419 420 void atomParenthesesEnd() 421 { 422 ASSERT(m_alternative->m_parent); 423 ASSERT(m_alternative->m_parent->m_parent); 424 425 PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent; 426 m_alternative = m_alternative->m_parent->m_parent; 427 428 PatternTerm& lastTerm = m_alternative->lastTerm(); 429 430 unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size(); 431 unsigned numBOLAnchoredAlts = 0; 432 433 for (unsigned i = 0; i < numParenAlternatives; i++) { 434 // Bubble up BOL flags 435 if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL) 436 numBOLAnchoredAlts++; 437 } 438 439 if (numBOLAnchoredAlts) { 440 m_alternative->m_containsBOL = true; 441 // If all the alternatives in parens start with BOL, then so does this one 442 if (numBOLAnchoredAlts == numParenAlternatives) 443 m_alternative->m_startsWithBOL = true; 444 } 445 446 lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; 447 m_invertParentheticalAssertion = false; 448 } 449 450 void atomBackReference(unsigned subpatternId) 451 { 452 ASSERT(subpatternId); 453 m_pattern.m_containsBackreferences = true; 454 m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); 455 456 if (subpatternId > m_pattern.m_numSubpatterns) { 457 m_alternative->m_terms.append(PatternTerm::ForwardReference()); 458 return; 459 } 460 461 PatternAlternative* currentAlternative = m_alternative; 462 ASSERT(currentAlternative); 463 464 // Note to self: if we waited until the AST was baked, we could also remove forwards refs 465 while ((currentAlternative = currentAlternative->m_parent->m_parent)) { 466 PatternTerm& term = currentAlternative->lastTerm(); 467 ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); 468 469 if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) { 470 m_alternative->m_terms.append(PatternTerm::ForwardReference()); 471 return; 472 } 473 } 474 475 m_alternative->m_terms.append(PatternTerm(subpatternId)); 476 } 477 478 // deep copy the argument disjunction. If filterStartsWithBOL is true, 479 // skip alternatives with m_startsWithBOL set true. 480 PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) 481 { 482 OwnPtr<PatternDisjunction> newDisjunction; 483 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { 484 PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); 485 if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { 486 if (!newDisjunction) { 487 newDisjunction = adoptPtr(new PatternDisjunction()); 488 newDisjunction->m_parent = disjunction->m_parent; 489 } 490 PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); 491 newAlternative->m_terms.reserveInitialCapacity(alternative->m_terms.size()); 492 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) 493 newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL)); 494 } 495 } 496 497 if (!newDisjunction) 498 return 0; 499 500 PatternDisjunction* copiedDisjunction = newDisjunction.get(); 501 m_pattern.m_disjunctions.append(newDisjunction.release()); 502 return copiedDisjunction; 503 } 504 505 PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) 506 { 507 if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) 508 return PatternTerm(term); 509 510 PatternTerm termCopy = term; 511 termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); 512 return termCopy; 513 } 514 515 void quantifyAtom(unsigned min, unsigned max, bool greedy) 516 { 517 ASSERT(min <= max); 518 ASSERT(m_alternative->m_terms.size()); 519 520 if (!max) { 521 m_alternative->removeLastTerm(); 522 return; 523 } 524 525 PatternTerm& term = m_alternative->lastTerm(); 526 ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); 527 ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); 528 529 if (term.type == PatternTerm::TypeParentheticalAssertion) { 530 // If an assertion is quantified with a minimum count of zero, it can simply be removed. 531 // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never 532 // results in any input being consumed, however the continuation passed to the assertion 533 // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will 534 // reject all zero length matches (see step 2.1). A match from the continuation of the 535 // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all 536 // this is that matches from the assertion are not required, and won't be accepted anyway, 537 // so no need to ever run it. 538 if (!min) 539 m_alternative->removeLastTerm(); 540 // We never need to run an assertion more than once. Subsequent interations will be run 541 // with the same start index (since assertions are non-capturing) and the same captures 542 // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the 543 // same result and captures. If the first match succeeds then the subsequent (min - 1) 544 // matches will too. Any additional optional matches will fail (on the same basis as the 545 // minimum zero quantified assertions, above), but this will still result in a match. 546 return; 547 } 548 549 if (min == 0) 550 term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); 551 else if (min == max) 552 term.quantify(min, QuantifierFixedCount); 553 else { 554 term.quantify(min, QuantifierFixedCount); 555 m_alternative->m_terms.append(copyTerm(term)); 556 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... 557 m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); 558 if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) 559 m_alternative->lastTerm().parentheses.isCopy = true; 560 } 561 } 562 563 void disjunction() 564 { 565 m_alternative = m_alternative->m_parent->addNewAlternative(); 566 } 567 568 unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) 569 { 570 alternative->m_hasFixedSize = true; 571 Checked<unsigned> currentInputPosition = initialInputPosition; 572 573 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { 574 PatternTerm& term = alternative->m_terms[i]; 575 576 switch (term.type) { 577 case PatternTerm::TypeAssertionBOL: 578 case PatternTerm::TypeAssertionEOL: 579 case PatternTerm::TypeAssertionWordBoundary: 580 term.inputPosition = currentInputPosition.unsafeGet(); 581 break; 582 583 case PatternTerm::TypeBackReference: 584 term.inputPosition = currentInputPosition.unsafeGet(); 585 term.frameLocation = currentCallFrameSize; 586 currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference; 587 alternative->m_hasFixedSize = false; 588 break; 589 590 case PatternTerm::TypeForwardReference: 591 break; 592 593 case PatternTerm::TypePatternCharacter: 594 term.inputPosition = currentInputPosition.unsafeGet(); 595 if (term.quantityType != QuantifierFixedCount) { 596 term.frameLocation = currentCallFrameSize; 597 currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter; 598 alternative->m_hasFixedSize = false; 599 } else 600 currentInputPosition += term.quantityCount; 601 break; 602 603 case PatternTerm::TypeCharacterClass: 604 term.inputPosition = currentInputPosition.unsafeGet(); 605 if (term.quantityType != QuantifierFixedCount) { 606 term.frameLocation = currentCallFrameSize; 607 currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; 608 alternative->m_hasFixedSize = false; 609 } else 610 currentInputPosition += term.quantityCount; 611 break; 612 613 case PatternTerm::TypeParenthesesSubpattern: 614 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. 615 term.frameLocation = currentCallFrameSize; 616 if (term.quantityCount == 1 && !term.parentheses.isCopy) { 617 if (term.quantityType != QuantifierFixedCount) 618 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; 619 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet()); 620 // If quantity is fixed, then pre-check its minimum size. 621 if (term.quantityType == QuantifierFixedCount) 622 currentInputPosition += term.parentheses.disjunction->m_minimumSize; 623 term.inputPosition = currentInputPosition.unsafeGet(); 624 } else if (term.parentheses.isTerminal) { 625 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal; 626 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet()); 627 term.inputPosition = currentInputPosition.unsafeGet(); 628 } else { 629 term.inputPosition = currentInputPosition.unsafeGet(); 630 setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet()); 631 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses; 632 } 633 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. 634 alternative->m_hasFixedSize = false; 635 break; 636 637 case PatternTerm::TypeParentheticalAssertion: 638 term.inputPosition = currentInputPosition.unsafeGet(); 639 term.frameLocation = currentCallFrameSize; 640 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet()); 641 break; 642 643 case PatternTerm::TypeDotStarEnclosure: 644 alternative->m_hasFixedSize = false; 645 term.inputPosition = initialInputPosition; 646 break; 647 } 648 } 649 650 alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet(); 651 return currentCallFrameSize; 652 } 653 654 unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) 655 { 656 if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) 657 initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative; 658 659 unsigned minimumInputSize = UINT_MAX; 660 unsigned maximumCallFrameSize = 0; 661 bool hasFixedSize = true; 662 663 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { 664 PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); 665 unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); 666 minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize); 667 maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize); 668 hasFixedSize &= alternative->m_hasFixedSize; 669 if (alternative->m_minimumSize > INT_MAX) 670 m_pattern.m_containsUnsignedLengthPattern = true; 671 } 672 673 ASSERT(minimumInputSize != UINT_MAX); 674 ASSERT(maximumCallFrameSize >= initialCallFrameSize); 675 676 disjunction->m_hasFixedSize = hasFixedSize; 677 disjunction->m_minimumSize = minimumInputSize; 678 disjunction->m_callFrameSize = maximumCallFrameSize; 679 return maximumCallFrameSize; 680 } 681 682 void setupOffsets() 683 { 684 setupDisjunctionOffsets(m_pattern.m_body, 0, 0); 685 } 686 687 // This optimization identifies sets of parentheses that we will never need to backtrack. 688 // In these cases we do not need to store state from prior iterations. 689 // We can presently avoid backtracking for: 690 // * where the parens are at the end of the regular expression (last term in any of the 691 // alternatives of the main body disjunction). 692 // * where the parens are non-capturing, and quantified unbounded greedy (*). 693 // * where the parens do not contain any capturing subpatterns. 694 void checkForTerminalParentheses() 695 { 696 // This check is much too crude; should be just checking whether the candidate 697 // node contains nested capturing subpatterns, not the whole expression! 698 if (m_pattern.m_numSubpatterns) 699 return; 700 701 Vector<OwnPtr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives; 702 for (size_t i = 0; i < alternatives.size(); ++i) { 703 Vector<PatternTerm>& terms = alternatives[i]->m_terms; 704 if (terms.size()) { 705 PatternTerm& term = terms.last(); 706 if (term.type == PatternTerm::TypeParenthesesSubpattern 707 && term.quantityType == QuantifierGreedy 708 && term.quantityCount == quantifyInfinite 709 && !term.capture()) 710 term.parentheses.isTerminal = true; 711 } 712 } 713 } 714 715 void optimizeBOL() 716 { 717 // Look for expressions containing beginning of line (^) anchoring and unroll them. 718 // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops 719 // This code relies on the parsing code tagging alternatives with m_containsBOL and 720 // m_startsWithBOL and rolling those up to containing alternatives. 721 // At this point, this is only valid for non-multiline expressions. 722 PatternDisjunction* disjunction = m_pattern.m_body; 723 724 if (!m_pattern.m_containsBOL || m_pattern.m_multiline) 725 return; 726 727 PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); 728 729 // Set alternatives in disjunction to "onceThrough" 730 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) 731 disjunction->m_alternatives[alt]->setOnceThrough(); 732 733 if (loopDisjunction) { 734 // Move alternatives from loopDisjunction to disjunction 735 for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) 736 disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt].release()); 737 738 loopDisjunction->m_alternatives.clear(); 739 } 740 } 741 742 bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex) 743 { 744 Vector<PatternTerm>& terms = alternative->m_terms; 745 746 for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) { 747 PatternTerm& term = terms[termIndex]; 748 749 if (term.m_capture) 750 return true; 751 752 if (term.type == PatternTerm::TypeParenthesesSubpattern) { 753 PatternDisjunction* nestedDisjunction = term.parentheses.disjunction; 754 for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) { 755 if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1)) 756 return true; 757 } 758 } 759 } 760 761 return false; 762 } 763 764 // This optimization identifies alternatives in the form of 765 // [^].*[?]<expression>.*[$] for expressions that don't have any 766 // capturing terms. The alternative is changed to <expression> 767 // followed by processing of the dot stars to find and adjust the 768 // beginning and the end of the match. 769 void optimizeDotStarWrappedExpressions() 770 { 771 Vector<OwnPtr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives; 772 if (alternatives.size() != 1) 773 return; 774 775 PatternAlternative* alternative = alternatives[0].get(); 776 Vector<PatternTerm>& terms = alternative->m_terms; 777 if (terms.size() >= 3) { 778 bool startsWithBOL = false; 779 bool endsWithEOL = false; 780 size_t termIndex, firstExpressionTerm, lastExpressionTerm; 781 782 termIndex = 0; 783 if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) { 784 startsWithBOL = true; 785 ++termIndex; 786 } 787 788 PatternTerm& firstNonAnchorTerm = terms[termIndex]; 789 if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy))) 790 return; 791 792 firstExpressionTerm = termIndex + 1; 793 794 termIndex = terms.size() - 1; 795 if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) { 796 endsWithEOL = true; 797 --termIndex; 798 } 799 800 PatternTerm& lastNonAnchorTerm = terms[termIndex]; 801 if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy)) 802 return; 803 804 lastExpressionTerm = termIndex - 1; 805 806 if (firstExpressionTerm > lastExpressionTerm) 807 return; 808 809 if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) { 810 for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex) 811 terms.remove(termIndex); 812 813 for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex) 814 terms.remove(termIndex - 1); 815 816 terms.append(PatternTerm(startsWithBOL, endsWithEOL)); 817 818 m_pattern.m_containsBOL = false; 819 } 820 } 821 } 822 823private: 824 YarrPattern& m_pattern; 825 PatternAlternative* m_alternative; 826 CharacterClassConstructor m_characterClassConstructor; 827 bool m_invertCharacterClass; 828 bool m_invertParentheticalAssertion; 829}; 830 831const char* YarrPattern::compile(const String& patternString) 832{ 833 YarrPatternConstructor constructor(*this); 834 835 if (const char* error = parse(constructor, patternString)) 836 return error; 837 838 // If the pattern contains illegal backreferences reset & reparse. 839 // Quoting Netscape's "What's new in JavaScript 1.2", 840 // "Note: if the number of left parentheses is less than the number specified 841 // in \#, the \# is taken as an octal escape as described in the next row." 842 if (containsIllegalBackReference()) { 843 unsigned numSubpatterns = m_numSubpatterns; 844 845 constructor.reset(); 846#if !ASSERT_DISABLED 847 const char* error = 848#endif 849 parse(constructor, patternString, numSubpatterns); 850 851 ASSERT(!error); 852 ASSERT(numSubpatterns == m_numSubpatterns); 853 } 854 855 constructor.checkForTerminalParentheses(); 856 constructor.optimizeDotStarWrappedExpressions(); 857 constructor.optimizeBOL(); 858 859 constructor.setupOffsets(); 860 861 return 0; 862} 863 864YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error) 865 : m_ignoreCase(ignoreCase) 866 , m_multiline(multiline) 867 , m_containsBackreferences(false) 868 , m_containsBOL(false) 869 , m_containsUnsignedLengthPattern(false) 870 , m_numSubpatterns(0) 871 , m_maxBackReference(0) 872 , newlineCached(0) 873 , digitsCached(0) 874 , spacesCached(0) 875 , wordcharCached(0) 876 , nondigitsCached(0) 877 , nonspacesCached(0) 878 , nonwordcharCached(0) 879{ 880 *error = compile(pattern); 881} 882 883} } 884