1/* 2 * Copyright (C) 2009, 2013 Apple Inc. All rights reserved. 3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#ifndef YarrPattern_h 28#define YarrPattern_h 29 30#include <wtf/CheckedArithmetic.h> 31#include <wtf/OwnPtr.h> 32#include <wtf/PassOwnPtr.h> 33#include <wtf/RefCounted.h> 34#include <wtf/Vector.h> 35#include <wtf/text/WTFString.h> 36 37namespace JSC { namespace Yarr { 38 39struct PatternDisjunction; 40 41struct CharacterRange { 42 UChar begin; 43 UChar end; 44 45 CharacterRange(UChar begin, UChar end) 46 : begin(begin) 47 , end(end) 48 { 49 } 50}; 51 52struct CharacterClass { 53 WTF_MAKE_FAST_ALLOCATED; 54public: 55 // All CharacterClass instances have to have the full set of matches and ranges, 56 // they may have an optional m_table for faster lookups (which must match the 57 // specified matches and ranges) 58 CharacterClass() 59 : m_table(0) 60 { 61 } 62 CharacterClass(const char* table, bool inverted) 63 : m_table(table) 64 , m_tableInverted(inverted) 65 { 66 } 67 Vector<UChar> m_matches; 68 Vector<CharacterRange> m_ranges; 69 Vector<UChar> m_matchesUnicode; 70 Vector<CharacterRange> m_rangesUnicode; 71 72 const char* m_table; 73 bool m_tableInverted; 74}; 75 76enum QuantifierType { 77 QuantifierFixedCount, 78 QuantifierGreedy, 79 QuantifierNonGreedy, 80}; 81 82struct PatternTerm { 83 enum Type { 84 TypeAssertionBOL, 85 TypeAssertionEOL, 86 TypeAssertionWordBoundary, 87 TypePatternCharacter, 88 TypeCharacterClass, 89 TypeBackReference, 90 TypeForwardReference, 91 TypeParenthesesSubpattern, 92 TypeParentheticalAssertion, 93 TypeDotStarEnclosure, 94 } type; 95 bool m_capture :1; 96 bool m_invert :1; 97 union { 98 UChar patternCharacter; 99 CharacterClass* characterClass; 100 unsigned backReferenceSubpatternId; 101 struct { 102 PatternDisjunction* disjunction; 103 unsigned subpatternId; 104 unsigned lastSubpatternId; 105 bool isCopy; 106 bool isTerminal; 107 } parentheses; 108 struct { 109 bool bolAnchor : 1; 110 bool eolAnchor : 1; 111 } anchors; 112 }; 113 QuantifierType quantityType; 114 Checked<unsigned> quantityCount; 115 int inputPosition; 116 unsigned frameLocation; 117 118 PatternTerm(UChar ch) 119 : type(PatternTerm::TypePatternCharacter) 120 , m_capture(false) 121 , m_invert(false) 122 { 123 patternCharacter = ch; 124 quantityType = QuantifierFixedCount; 125 quantityCount = 1; 126 } 127 128 PatternTerm(CharacterClass* charClass, bool invert) 129 : type(PatternTerm::TypeCharacterClass) 130 , m_capture(false) 131 , m_invert(invert) 132 { 133 characterClass = charClass; 134 quantityType = QuantifierFixedCount; 135 quantityCount = 1; 136 } 137 138 PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) 139 : type(type) 140 , m_capture(capture) 141 , m_invert(invert) 142 { 143 parentheses.disjunction = disjunction; 144 parentheses.subpatternId = subpatternId; 145 parentheses.isCopy = false; 146 parentheses.isTerminal = false; 147 quantityType = QuantifierFixedCount; 148 quantityCount = 1; 149 } 150 151 PatternTerm(Type type, bool invert = false) 152 : type(type) 153 , m_capture(false) 154 , m_invert(invert) 155 { 156 quantityType = QuantifierFixedCount; 157 quantityCount = 1; 158 } 159 160 PatternTerm(unsigned spatternId) 161 : type(TypeBackReference) 162 , m_capture(false) 163 , m_invert(false) 164 { 165 backReferenceSubpatternId = spatternId; 166 quantityType = QuantifierFixedCount; 167 quantityCount = 1; 168 } 169 170 PatternTerm(bool bolAnchor, bool eolAnchor) 171 : type(TypeDotStarEnclosure) 172 , m_capture(false) 173 , m_invert(false) 174 { 175 anchors.bolAnchor = bolAnchor; 176 anchors.eolAnchor = eolAnchor; 177 quantityType = QuantifierFixedCount; 178 quantityCount = 1; 179 } 180 181 static PatternTerm ForwardReference() 182 { 183 return PatternTerm(TypeForwardReference); 184 } 185 186 static PatternTerm BOL() 187 { 188 return PatternTerm(TypeAssertionBOL); 189 } 190 191 static PatternTerm EOL() 192 { 193 return PatternTerm(TypeAssertionEOL); 194 } 195 196 static PatternTerm WordBoundary(bool invert) 197 { 198 return PatternTerm(TypeAssertionWordBoundary, invert); 199 } 200 201 bool invert() 202 { 203 return m_invert; 204 } 205 206 bool capture() 207 { 208 return m_capture; 209 } 210 211 void quantify(unsigned count, QuantifierType type) 212 { 213 quantityCount = count; 214 quantityType = type; 215 } 216}; 217 218struct PatternAlternative { 219 WTF_MAKE_FAST_ALLOCATED; 220public: 221 PatternAlternative(PatternDisjunction* disjunction) 222 : m_parent(disjunction) 223 , m_onceThrough(false) 224 , m_hasFixedSize(false) 225 , m_startsWithBOL(false) 226 , m_containsBOL(false) 227 { 228 } 229 230 PatternTerm& lastTerm() 231 { 232 ASSERT(m_terms.size()); 233 return m_terms[m_terms.size() - 1]; 234 } 235 236 void removeLastTerm() 237 { 238 ASSERT(m_terms.size()); 239 m_terms.shrink(m_terms.size() - 1); 240 } 241 242 void setOnceThrough() 243 { 244 m_onceThrough = true; 245 } 246 247 bool onceThrough() 248 { 249 return m_onceThrough; 250 } 251 252 Vector<PatternTerm> m_terms; 253 PatternDisjunction* m_parent; 254 unsigned m_minimumSize; 255 bool m_onceThrough : 1; 256 bool m_hasFixedSize : 1; 257 bool m_startsWithBOL : 1; 258 bool m_containsBOL : 1; 259}; 260 261struct PatternDisjunction { 262 WTF_MAKE_FAST_ALLOCATED; 263public: 264 PatternDisjunction(PatternAlternative* parent = 0) 265 : m_parent(parent) 266 , m_hasFixedSize(false) 267 { 268 } 269 270 PatternAlternative* addNewAlternative() 271 { 272 PatternAlternative* alternative = new PatternAlternative(this); 273 m_alternatives.append(adoptPtr(alternative)); 274 return alternative; 275 } 276 277 Vector<OwnPtr<PatternAlternative>> m_alternatives; 278 PatternAlternative* m_parent; 279 unsigned m_minimumSize; 280 unsigned m_callFrameSize; 281 bool m_hasFixedSize; 282}; 283 284// You probably don't want to be calling these functions directly 285// (please to be calling newlineCharacterClass() et al on your 286// friendly neighborhood YarrPattern instance to get nicely 287// cached copies). 288CharacterClass* newlineCreate(); 289CharacterClass* digitsCreate(); 290CharacterClass* spacesCreate(); 291CharacterClass* wordcharCreate(); 292CharacterClass* nondigitsCreate(); 293CharacterClass* nonspacesCreate(); 294CharacterClass* nonwordcharCreate(); 295 296struct TermChain { 297 TermChain(PatternTerm term) 298 : term(term) 299 {} 300 301 PatternTerm term; 302 Vector<TermChain> hotTerms; 303}; 304 305struct YarrPattern { 306 JS_EXPORT_PRIVATE YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error); 307 308 void reset() 309 { 310 m_numSubpatterns = 0; 311 m_maxBackReference = 0; 312 313 m_containsBackreferences = false; 314 m_containsBOL = false; 315 m_containsUnsignedLengthPattern = false; 316 317 newlineCached = 0; 318 digitsCached = 0; 319 spacesCached = 0; 320 wordcharCached = 0; 321 nondigitsCached = 0; 322 nonspacesCached = 0; 323 nonwordcharCached = 0; 324 325 m_disjunctions.clear(); 326 m_userCharacterClasses.clear(); 327 } 328 329 bool containsIllegalBackReference() 330 { 331 return m_maxBackReference > m_numSubpatterns; 332 } 333 334 bool containsUnsignedLengthPattern() 335 { 336 return m_containsUnsignedLengthPattern; 337 } 338 339 CharacterClass* newlineCharacterClass() 340 { 341 if (!newlineCached) 342 m_userCharacterClasses.append(adoptPtr(newlineCached = newlineCreate())); 343 return newlineCached; 344 } 345 CharacterClass* digitsCharacterClass() 346 { 347 if (!digitsCached) 348 m_userCharacterClasses.append(adoptPtr(digitsCached = digitsCreate())); 349 return digitsCached; 350 } 351 CharacterClass* spacesCharacterClass() 352 { 353 if (!spacesCached) 354 m_userCharacterClasses.append(adoptPtr(spacesCached = spacesCreate())); 355 return spacesCached; 356 } 357 CharacterClass* wordcharCharacterClass() 358 { 359 if (!wordcharCached) 360 m_userCharacterClasses.append(adoptPtr(wordcharCached = wordcharCreate())); 361 return wordcharCached; 362 } 363 CharacterClass* nondigitsCharacterClass() 364 { 365 if (!nondigitsCached) 366 m_userCharacterClasses.append(adoptPtr(nondigitsCached = nondigitsCreate())); 367 return nondigitsCached; 368 } 369 CharacterClass* nonspacesCharacterClass() 370 { 371 if (!nonspacesCached) 372 m_userCharacterClasses.append(adoptPtr(nonspacesCached = nonspacesCreate())); 373 return nonspacesCached; 374 } 375 CharacterClass* nonwordcharCharacterClass() 376 { 377 if (!nonwordcharCached) 378 m_userCharacterClasses.append(adoptPtr(nonwordcharCached = nonwordcharCreate())); 379 return nonwordcharCached; 380 } 381 382 bool m_ignoreCase : 1; 383 bool m_multiline : 1; 384 bool m_containsBackreferences : 1; 385 bool m_containsBOL : 1; 386 bool m_containsUnsignedLengthPattern : 1; 387 unsigned m_numSubpatterns; 388 unsigned m_maxBackReference; 389 PatternDisjunction* m_body; 390 Vector<OwnPtr<PatternDisjunction>, 4> m_disjunctions; 391 Vector<OwnPtr<CharacterClass>> m_userCharacterClasses; 392 393private: 394 const char* compile(const String& patternString); 395 396 CharacterClass* newlineCached; 397 CharacterClass* digitsCached; 398 CharacterClass* spacesCached; 399 CharacterClass* wordcharCached; 400 CharacterClass* nondigitsCached; 401 CharacterClass* nonspacesCached; 402 CharacterClass* nonwordcharCached; 403}; 404 405} } // namespace JSC::Yarr 406 407#endif // YarrPattern_h 408