1/* 2 * Copyright (C) 2009, 2010 Apple Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26#ifndef YarrInterpreter_h 27#define YarrInterpreter_h 28 29#include "YarrPattern.h" 30#include <wtf/PassOwnPtr.h> 31 32namespace WTF { 33class BumpPointerAllocator; 34} 35using WTF::BumpPointerAllocator; 36 37namespace JSC { namespace Yarr { 38 39class ByteDisjunction; 40 41struct ByteTerm { 42 enum Type { 43 TypeBodyAlternativeBegin, 44 TypeBodyAlternativeDisjunction, 45 TypeBodyAlternativeEnd, 46 TypeAlternativeBegin, 47 TypeAlternativeDisjunction, 48 TypeAlternativeEnd, 49 TypeSubpatternBegin, 50 TypeSubpatternEnd, 51 TypeAssertionBOL, 52 TypeAssertionEOL, 53 TypeAssertionWordBoundary, 54 TypePatternCharacterOnce, 55 TypePatternCharacterFixed, 56 TypePatternCharacterGreedy, 57 TypePatternCharacterNonGreedy, 58 TypePatternCasedCharacterOnce, 59 TypePatternCasedCharacterFixed, 60 TypePatternCasedCharacterGreedy, 61 TypePatternCasedCharacterNonGreedy, 62 TypeCharacterClass, 63 TypeBackReference, 64 TypeParenthesesSubpattern, 65 TypeParenthesesSubpatternOnceBegin, 66 TypeParenthesesSubpatternOnceEnd, 67 TypeParenthesesSubpatternTerminalBegin, 68 TypeParenthesesSubpatternTerminalEnd, 69 TypeParentheticalAssertionBegin, 70 TypeParentheticalAssertionEnd, 71 TypeCheckInput, 72 TypeUncheckInput, 73 TypeDotStarEnclosure, 74 } type; 75 union { 76 struct { 77 union { 78 UChar patternCharacter; 79 struct { 80 UChar lo; 81 UChar hi; 82 } casedCharacter; 83 CharacterClass* characterClass; 84 unsigned subpatternId; 85 }; 86 union { 87 ByteDisjunction* parenthesesDisjunction; 88 unsigned parenthesesWidth; 89 }; 90 QuantifierType quantityType; 91 unsigned quantityCount; 92 } atom; 93 struct { 94 int next; 95 int end; 96 bool onceThrough; 97 } alternative; 98 struct { 99 bool m_bol : 1; 100 bool m_eol : 1; 101 } anchors; 102 unsigned checkInputCount; 103 }; 104 unsigned frameLocation; 105 bool m_capture : 1; 106 bool m_invert : 1; 107 unsigned inputPosition; 108 109 ByteTerm(UChar ch, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) 110 : frameLocation(frameLocation) 111 , m_capture(false) 112 , m_invert(false) 113 { 114 switch (quantityType) { 115 case QuantifierFixedCount: 116 type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; 117 break; 118 case QuantifierGreedy: 119 type = ByteTerm::TypePatternCharacterGreedy; 120 break; 121 case QuantifierNonGreedy: 122 type = ByteTerm::TypePatternCharacterNonGreedy; 123 break; 124 } 125 126 atom.patternCharacter = ch; 127 atom.quantityType = quantityType; 128 atom.quantityCount = quantityCount.unsafeGet(); 129 inputPosition = inputPos; 130 } 131 132 ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) 133 : frameLocation(frameLocation) 134 , m_capture(false) 135 , m_invert(false) 136 { 137 switch (quantityType) { 138 case QuantifierFixedCount: 139 type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; 140 break; 141 case QuantifierGreedy: 142 type = ByteTerm::TypePatternCasedCharacterGreedy; 143 break; 144 case QuantifierNonGreedy: 145 type = ByteTerm::TypePatternCasedCharacterNonGreedy; 146 break; 147 } 148 149 atom.casedCharacter.lo = lo; 150 atom.casedCharacter.hi = hi; 151 atom.quantityType = quantityType; 152 atom.quantityCount = quantityCount.unsafeGet(); 153 inputPosition = inputPos; 154 } 155 156 ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) 157 : type(ByteTerm::TypeCharacterClass) 158 , m_capture(false) 159 , m_invert(invert) 160 { 161 atom.characterClass = characterClass; 162 atom.quantityType = QuantifierFixedCount; 163 atom.quantityCount = 1; 164 inputPosition = inputPos; 165 } 166 167 ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos) 168 : type(type) 169 , m_capture(capture) 170 , m_invert(false) 171 { 172 atom.subpatternId = subpatternId; 173 atom.parenthesesDisjunction = parenthesesInfo; 174 atom.quantityType = QuantifierFixedCount; 175 atom.quantityCount = 1; 176 inputPosition = inputPos; 177 } 178 179 ByteTerm(Type type, bool invert = false) 180 : type(type) 181 , m_capture(false) 182 , m_invert(invert) 183 { 184 atom.quantityType = QuantifierFixedCount; 185 atom.quantityCount = 1; 186 } 187 188 ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos) 189 : type(type) 190 , m_capture(capture) 191 , m_invert(invert) 192 { 193 atom.subpatternId = subpatternId; 194 atom.quantityType = QuantifierFixedCount; 195 atom.quantityCount = 1; 196 inputPosition = inputPos; 197 } 198 199 static ByteTerm BOL(int inputPos) 200 { 201 ByteTerm term(TypeAssertionBOL); 202 term.inputPosition = inputPos; 203 return term; 204 } 205 206 static ByteTerm CheckInput(Checked<unsigned> count) 207 { 208 ByteTerm term(TypeCheckInput); 209 term.checkInputCount = count.unsafeGet(); 210 return term; 211 } 212 213 static ByteTerm UncheckInput(Checked<unsigned> count) 214 { 215 ByteTerm term(TypeUncheckInput); 216 term.checkInputCount = count.unsafeGet(); 217 return term; 218 } 219 220 static ByteTerm EOL(int inputPos) 221 { 222 ByteTerm term(TypeAssertionEOL); 223 term.inputPosition = inputPos; 224 return term; 225 } 226 227 static ByteTerm WordBoundary(bool invert, int inputPos) 228 { 229 ByteTerm term(TypeAssertionWordBoundary, invert); 230 term.inputPosition = inputPos; 231 return term; 232 } 233 234 static ByteTerm BackReference(unsigned subpatternId, int inputPos) 235 { 236 return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); 237 } 238 239 static ByteTerm BodyAlternativeBegin(bool onceThrough) 240 { 241 ByteTerm term(TypeBodyAlternativeBegin); 242 term.alternative.next = 0; 243 term.alternative.end = 0; 244 term.alternative.onceThrough = onceThrough; 245 return term; 246 } 247 248 static ByteTerm BodyAlternativeDisjunction(bool onceThrough) 249 { 250 ByteTerm term(TypeBodyAlternativeDisjunction); 251 term.alternative.next = 0; 252 term.alternative.end = 0; 253 term.alternative.onceThrough = onceThrough; 254 return term; 255 } 256 257 static ByteTerm BodyAlternativeEnd() 258 { 259 ByteTerm term(TypeBodyAlternativeEnd); 260 term.alternative.next = 0; 261 term.alternative.end = 0; 262 term.alternative.onceThrough = false; 263 return term; 264 } 265 266 static ByteTerm AlternativeBegin() 267 { 268 ByteTerm term(TypeAlternativeBegin); 269 term.alternative.next = 0; 270 term.alternative.end = 0; 271 term.alternative.onceThrough = false; 272 return term; 273 } 274 275 static ByteTerm AlternativeDisjunction() 276 { 277 ByteTerm term(TypeAlternativeDisjunction); 278 term.alternative.next = 0; 279 term.alternative.end = 0; 280 term.alternative.onceThrough = false; 281 return term; 282 } 283 284 static ByteTerm AlternativeEnd() 285 { 286 ByteTerm term(TypeAlternativeEnd); 287 term.alternative.next = 0; 288 term.alternative.end = 0; 289 term.alternative.onceThrough = false; 290 return term; 291 } 292 293 static ByteTerm SubpatternBegin() 294 { 295 return ByteTerm(TypeSubpatternBegin); 296 } 297 298 static ByteTerm SubpatternEnd() 299 { 300 return ByteTerm(TypeSubpatternEnd); 301 } 302 303 static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor) 304 { 305 ByteTerm term(TypeDotStarEnclosure); 306 term.anchors.m_bol = bolAnchor; 307 term.anchors.m_eol = eolAnchor; 308 return term; 309 } 310 311 bool invert() 312 { 313 return m_invert; 314 } 315 316 bool capture() 317 { 318 return m_capture; 319 } 320}; 321 322class ByteDisjunction { 323 WTF_MAKE_FAST_ALLOCATED; 324public: 325 ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) 326 : m_numSubpatterns(numSubpatterns) 327 , m_frameSize(frameSize) 328 { 329 } 330 331 Vector<ByteTerm> terms; 332 unsigned m_numSubpatterns; 333 unsigned m_frameSize; 334}; 335 336struct BytecodePattern { 337 WTF_MAKE_FAST_ALLOCATED; 338public: 339 BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<OwnPtr<ByteDisjunction>>& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator) 340 : m_body(body) 341 , m_ignoreCase(pattern.m_ignoreCase) 342 , m_multiline(pattern.m_multiline) 343 , m_allocator(allocator) 344 { 345 m_body->terms.shrinkToFit(); 346 347 newlineCharacterClass = pattern.newlineCharacterClass(); 348 wordcharCharacterClass = pattern.wordcharCharacterClass(); 349 350 m_allParenthesesInfo.swap(parenthesesInfoToAdopt); 351 m_allParenthesesInfo.shrinkToFit(); 352 353 m_userCharacterClasses.swap(pattern.m_userCharacterClasses); 354 m_userCharacterClasses.shrinkToFit(); 355 } 356 357 OwnPtr<ByteDisjunction> m_body; 358 bool m_ignoreCase; 359 bool m_multiline; 360 // Each BytecodePattern is associated with a RegExp, each RegExp is associated 361 // with a VM. Cache a pointer to out VM's m_regExpAllocator. 362 BumpPointerAllocator* m_allocator; 363 364 CharacterClass* newlineCharacterClass; 365 CharacterClass* wordcharCharacterClass; 366 367private: 368 Vector<OwnPtr<ByteDisjunction>> m_allParenthesesInfo; 369 Vector<OwnPtr<CharacterClass>> m_userCharacterClasses; 370}; 371 372JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*); 373JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output); 374unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output); 375unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output); 376 377} } // namespace JSC::Yarr 378 379#endif // YarrInterpreter_h 380