1/* 2 * Copyright (C) 2009, 2013 Apple Inc. All rights reserved. 3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#ifndef YarrPattern_h 28#define YarrPattern_h 29 30#include <wtf/CheckedArithmetic.h> 31#include <wtf/OwnPtr.h> 32#include <wtf/PassOwnPtr.h> 33#include <wtf/RefCounted.h> 34#include <wtf/Vector.h> 35#include <wtf/text/WTFString.h> 36#include <wtf/unicode/Unicode.h> 37 38namespace JSC { namespace Yarr { 39 40struct PatternDisjunction; 41 42struct CharacterRange { 43 UChar begin; 44 UChar end; 45 46 CharacterRange(UChar begin, UChar end) 47 : begin(begin) 48 , end(end) 49 { 50 } 51}; 52 53struct CharacterClass { 54 WTF_MAKE_FAST_ALLOCATED; 55public: 56 // All CharacterClass instances have to have the full set of matches and ranges, 57 // they may have an optional m_table for faster lookups (which must match the 58 // specified matches and ranges) 59 CharacterClass() 60 : m_table(0) 61 { 62 } 63 CharacterClass(const char* table, bool inverted) 64 : m_table(table) 65 , m_tableInverted(inverted) 66 { 67 } 68 Vector<UChar> m_matches; 69 Vector<CharacterRange> m_ranges; 70 Vector<UChar> m_matchesUnicode; 71 Vector<CharacterRange> m_rangesUnicode; 72 73 const char* m_table; 74 bool m_tableInverted; 75}; 76 77enum QuantifierType { 78 QuantifierFixedCount, 79 QuantifierGreedy, 80 QuantifierNonGreedy, 81}; 82 83struct PatternTerm { 84 enum Type { 85 TypeAssertionBOL, 86 TypeAssertionEOL, 87 TypeAssertionWordBoundary, 88 TypePatternCharacter, 89 TypeCharacterClass, 90 TypeBackReference, 91 TypeForwardReference, 92 TypeParenthesesSubpattern, 93 TypeParentheticalAssertion, 94 TypeDotStarEnclosure, 95 } type; 96 bool m_capture :1; 97 bool m_invert :1; 98 union { 99 UChar patternCharacter; 100 CharacterClass* characterClass; 101 unsigned backReferenceSubpatternId; 102 struct { 103 PatternDisjunction* disjunction; 104 unsigned subpatternId; 105 unsigned lastSubpatternId; 106 bool isCopy; 107 bool isTerminal; 108 } parentheses; 109 struct { 110 bool bolAnchor : 1; 111 bool eolAnchor : 1; 112 } anchors; 113 }; 114 QuantifierType quantityType; 115 Checked<unsigned> quantityCount; 116 int inputPosition; 117 unsigned frameLocation; 118 119 PatternTerm(UChar ch) 120 : type(PatternTerm::TypePatternCharacter) 121 , m_capture(false) 122 , m_invert(false) 123 { 124 patternCharacter = ch; 125 quantityType = QuantifierFixedCount; 126 quantityCount = 1; 127 } 128 129 PatternTerm(CharacterClass* charClass, bool invert) 130 : type(PatternTerm::TypeCharacterClass) 131 , m_capture(false) 132 , m_invert(invert) 133 { 134 characterClass = charClass; 135 quantityType = QuantifierFixedCount; 136 quantityCount = 1; 137 } 138 139 PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) 140 : type(type) 141 , m_capture(capture) 142 , m_invert(invert) 143 { 144 parentheses.disjunction = disjunction; 145 parentheses.subpatternId = subpatternId; 146 parentheses.isCopy = false; 147 parentheses.isTerminal = false; 148 quantityType = QuantifierFixedCount; 149 quantityCount = 1; 150 } 151 152 PatternTerm(Type type, bool invert = false) 153 : type(type) 154 , m_capture(false) 155 , m_invert(invert) 156 { 157 quantityType = QuantifierFixedCount; 158 quantityCount = 1; 159 } 160 161 PatternTerm(unsigned spatternId) 162 : type(TypeBackReference) 163 , m_capture(false) 164 , m_invert(false) 165 { 166 backReferenceSubpatternId = spatternId; 167 quantityType = QuantifierFixedCount; 168 quantityCount = 1; 169 } 170 171 PatternTerm(bool bolAnchor, bool eolAnchor) 172 : type(TypeDotStarEnclosure) 173 , m_capture(false) 174 , m_invert(false) 175 { 176 anchors.bolAnchor = bolAnchor; 177 anchors.eolAnchor = eolAnchor; 178 quantityType = QuantifierFixedCount; 179 quantityCount = 1; 180 } 181 182 static PatternTerm ForwardReference() 183 { 184 return PatternTerm(TypeForwardReference); 185 } 186 187 static PatternTerm BOL() 188 { 189 return PatternTerm(TypeAssertionBOL); 190 } 191 192 static PatternTerm EOL() 193 { 194 return PatternTerm(TypeAssertionEOL); 195 } 196 197 static PatternTerm WordBoundary(bool invert) 198 { 199 return PatternTerm(TypeAssertionWordBoundary, invert); 200 } 201 202 bool invert() 203 { 204 return m_invert; 205 } 206 207 bool capture() 208 { 209 return m_capture; 210 } 211 212 void quantify(unsigned count, QuantifierType type) 213 { 214 quantityCount = count; 215 quantityType = type; 216 } 217}; 218 219struct PatternAlternative { 220 WTF_MAKE_FAST_ALLOCATED; 221public: 222 PatternAlternative(PatternDisjunction* disjunction) 223 : m_parent(disjunction) 224 , m_onceThrough(false) 225 , m_hasFixedSize(false) 226 , m_startsWithBOL(false) 227 , m_containsBOL(false) 228 { 229 } 230 231 PatternTerm& lastTerm() 232 { 233 ASSERT(m_terms.size()); 234 return m_terms[m_terms.size() - 1]; 235 } 236 237 void removeLastTerm() 238 { 239 ASSERT(m_terms.size()); 240 m_terms.shrink(m_terms.size() - 1); 241 } 242 243 void setOnceThrough() 244 { 245 m_onceThrough = true; 246 } 247 248 bool onceThrough() 249 { 250 return m_onceThrough; 251 } 252 253 Vector<PatternTerm> m_terms; 254 PatternDisjunction* m_parent; 255 unsigned m_minimumSize; 256 bool m_onceThrough : 1; 257 bool m_hasFixedSize : 1; 258 bool m_startsWithBOL : 1; 259 bool m_containsBOL : 1; 260}; 261 262struct PatternDisjunction { 263 WTF_MAKE_FAST_ALLOCATED; 264public: 265 PatternDisjunction(PatternAlternative* parent = 0) 266 : m_parent(parent) 267 , m_hasFixedSize(false) 268 { 269 } 270 271 PatternAlternative* addNewAlternative() 272 { 273 PatternAlternative* alternative = new PatternAlternative(this); 274 m_alternatives.append(adoptPtr(alternative)); 275 return alternative; 276 } 277 278 Vector<OwnPtr<PatternAlternative> > m_alternatives; 279 PatternAlternative* m_parent; 280 unsigned m_minimumSize; 281 unsigned m_callFrameSize; 282 bool m_hasFixedSize; 283}; 284 285// You probably don't want to be calling these functions directly 286// (please to be calling newlineCharacterClass() et al on your 287// friendly neighborhood YarrPattern instance to get nicely 288// cached copies). 289CharacterClass* newlineCreate(); 290CharacterClass* digitsCreate(); 291CharacterClass* spacesCreate(); 292CharacterClass* wordcharCreate(); 293CharacterClass* nondigitsCreate(); 294CharacterClass* nonspacesCreate(); 295CharacterClass* nonwordcharCreate(); 296 297struct TermChain { 298 TermChain(PatternTerm term) 299 : term(term) 300 {} 301 302 PatternTerm term; 303 Vector<TermChain> hotTerms; 304}; 305 306struct YarrPattern { 307 JS_EXPORT_PRIVATE YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error); 308 309 void reset() 310 { 311 m_numSubpatterns = 0; 312 m_maxBackReference = 0; 313 314 m_containsBackreferences = false; 315 m_containsBOL = false; 316 317 newlineCached = 0; 318 digitsCached = 0; 319 spacesCached = 0; 320 wordcharCached = 0; 321 nondigitsCached = 0; 322 nonspacesCached = 0; 323 nonwordcharCached = 0; 324 325 m_disjunctions.clear(); 326 m_userCharacterClasses.clear(); 327 } 328 329 bool containsIllegalBackReference() 330 { 331 return m_maxBackReference > m_numSubpatterns; 332 } 333 334 CharacterClass* newlineCharacterClass() 335 { 336 if (!newlineCached) 337 m_userCharacterClasses.append(adoptPtr(newlineCached = newlineCreate())); 338 return newlineCached; 339 } 340 CharacterClass* digitsCharacterClass() 341 { 342 if (!digitsCached) 343 m_userCharacterClasses.append(adoptPtr(digitsCached = digitsCreate())); 344 return digitsCached; 345 } 346 CharacterClass* spacesCharacterClass() 347 { 348 if (!spacesCached) 349 m_userCharacterClasses.append(adoptPtr(spacesCached = spacesCreate())); 350 return spacesCached; 351 } 352 CharacterClass* wordcharCharacterClass() 353 { 354 if (!wordcharCached) 355 m_userCharacterClasses.append(adoptPtr(wordcharCached = wordcharCreate())); 356 return wordcharCached; 357 } 358 CharacterClass* nondigitsCharacterClass() 359 { 360 if (!nondigitsCached) 361 m_userCharacterClasses.append(adoptPtr(nondigitsCached = nondigitsCreate())); 362 return nondigitsCached; 363 } 364 CharacterClass* nonspacesCharacterClass() 365 { 366 if (!nonspacesCached) 367 m_userCharacterClasses.append(adoptPtr(nonspacesCached = nonspacesCreate())); 368 return nonspacesCached; 369 } 370 CharacterClass* nonwordcharCharacterClass() 371 { 372 if (!nonwordcharCached) 373 m_userCharacterClasses.append(adoptPtr(nonwordcharCached = nonwordcharCreate())); 374 return nonwordcharCached; 375 } 376 377 bool m_ignoreCase : 1; 378 bool m_multiline : 1; 379 bool m_containsBackreferences : 1; 380 bool m_containsBOL : 1; 381 unsigned m_numSubpatterns; 382 unsigned m_maxBackReference; 383 PatternDisjunction* m_body; 384 Vector<OwnPtr<PatternDisjunction>, 4> m_disjunctions; 385 Vector<OwnPtr<CharacterClass> > m_userCharacterClasses; 386 387private: 388 const char* compile(const String& patternString); 389 390 CharacterClass* newlineCached; 391 CharacterClass* digitsCached; 392 CharacterClass* spacesCached; 393 CharacterClass* wordcharCached; 394 CharacterClass* nondigitsCached; 395 CharacterClass* nonspacesCached; 396 CharacterClass* nonwordcharCached; 397}; 398 399} } // namespace JSC::Yarr 400 401#endif // YarrPattern_h 402