1/*
2 * Copyright (C) 2009, 2013 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#ifndef YarrPattern_h
28#define YarrPattern_h
29
30#include <wtf/CheckedArithmetic.h>
31#include <wtf/OwnPtr.h>
32#include <wtf/PassOwnPtr.h>
33#include <wtf/RefCounted.h>
34#include <wtf/Vector.h>
35#include <wtf/text/WTFString.h>
36
37namespace JSC { namespace Yarr {
38
39struct PatternDisjunction;
40
41struct CharacterRange {
42    UChar begin;
43    UChar end;
44
45    CharacterRange(UChar begin, UChar end)
46        : begin(begin)
47        , end(end)
48    {
49    }
50};
51
52struct CharacterClass {
53    WTF_MAKE_FAST_ALLOCATED;
54public:
55    // All CharacterClass instances have to have the full set of matches and ranges,
56    // they may have an optional m_table for faster lookups (which must match the
57    // specified matches and ranges)
58    CharacterClass()
59        : m_table(0)
60    {
61    }
62    CharacterClass(const char* table, bool inverted)
63        : m_table(table)
64        , m_tableInverted(inverted)
65    {
66    }
67    Vector<UChar> m_matches;
68    Vector<CharacterRange> m_ranges;
69    Vector<UChar> m_matchesUnicode;
70    Vector<CharacterRange> m_rangesUnicode;
71
72    const char* m_table;
73    bool m_tableInverted;
74};
75
76enum QuantifierType {
77    QuantifierFixedCount,
78    QuantifierGreedy,
79    QuantifierNonGreedy,
80};
81
82struct PatternTerm {
83    enum Type {
84        TypeAssertionBOL,
85        TypeAssertionEOL,
86        TypeAssertionWordBoundary,
87        TypePatternCharacter,
88        TypeCharacterClass,
89        TypeBackReference,
90        TypeForwardReference,
91        TypeParenthesesSubpattern,
92        TypeParentheticalAssertion,
93        TypeDotStarEnclosure,
94    } type;
95    bool m_capture :1;
96    bool m_invert :1;
97    union {
98        UChar patternCharacter;
99        CharacterClass* characterClass;
100        unsigned backReferenceSubpatternId;
101        struct {
102            PatternDisjunction* disjunction;
103            unsigned subpatternId;
104            unsigned lastSubpatternId;
105            bool isCopy;
106            bool isTerminal;
107        } parentheses;
108        struct {
109            bool bolAnchor : 1;
110            bool eolAnchor : 1;
111        } anchors;
112    };
113    QuantifierType quantityType;
114    Checked<unsigned> quantityCount;
115    int inputPosition;
116    unsigned frameLocation;
117
118    PatternTerm(UChar ch)
119        : type(PatternTerm::TypePatternCharacter)
120        , m_capture(false)
121        , m_invert(false)
122    {
123        patternCharacter = ch;
124        quantityType = QuantifierFixedCount;
125        quantityCount = 1;
126    }
127
128    PatternTerm(CharacterClass* charClass, bool invert)
129        : type(PatternTerm::TypeCharacterClass)
130        , m_capture(false)
131        , m_invert(invert)
132    {
133        characterClass = charClass;
134        quantityType = QuantifierFixedCount;
135        quantityCount = 1;
136    }
137
138    PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false)
139        : type(type)
140        , m_capture(capture)
141        , m_invert(invert)
142    {
143        parentheses.disjunction = disjunction;
144        parentheses.subpatternId = subpatternId;
145        parentheses.isCopy = false;
146        parentheses.isTerminal = false;
147        quantityType = QuantifierFixedCount;
148        quantityCount = 1;
149    }
150
151    PatternTerm(Type type, bool invert = false)
152        : type(type)
153        , m_capture(false)
154        , m_invert(invert)
155    {
156        quantityType = QuantifierFixedCount;
157        quantityCount = 1;
158    }
159
160    PatternTerm(unsigned spatternId)
161        : type(TypeBackReference)
162        , m_capture(false)
163        , m_invert(false)
164    {
165        backReferenceSubpatternId = spatternId;
166        quantityType = QuantifierFixedCount;
167        quantityCount = 1;
168    }
169
170    PatternTerm(bool bolAnchor, bool eolAnchor)
171        : type(TypeDotStarEnclosure)
172        , m_capture(false)
173        , m_invert(false)
174    {
175        anchors.bolAnchor = bolAnchor;
176        anchors.eolAnchor = eolAnchor;
177        quantityType = QuantifierFixedCount;
178        quantityCount = 1;
179    }
180
181    static PatternTerm ForwardReference()
182    {
183        return PatternTerm(TypeForwardReference);
184    }
185
186    static PatternTerm BOL()
187    {
188        return PatternTerm(TypeAssertionBOL);
189    }
190
191    static PatternTerm EOL()
192    {
193        return PatternTerm(TypeAssertionEOL);
194    }
195
196    static PatternTerm WordBoundary(bool invert)
197    {
198        return PatternTerm(TypeAssertionWordBoundary, invert);
199    }
200
201    bool invert()
202    {
203        return m_invert;
204    }
205
206    bool capture()
207    {
208        return m_capture;
209    }
210
211    void quantify(unsigned count, QuantifierType type)
212    {
213        quantityCount = count;
214        quantityType = type;
215    }
216};
217
218struct PatternAlternative {
219    WTF_MAKE_FAST_ALLOCATED;
220public:
221    PatternAlternative(PatternDisjunction* disjunction)
222        : m_parent(disjunction)
223        , m_onceThrough(false)
224        , m_hasFixedSize(false)
225        , m_startsWithBOL(false)
226        , m_containsBOL(false)
227    {
228    }
229
230    PatternTerm& lastTerm()
231    {
232        ASSERT(m_terms.size());
233        return m_terms[m_terms.size() - 1];
234    }
235
236    void removeLastTerm()
237    {
238        ASSERT(m_terms.size());
239        m_terms.shrink(m_terms.size() - 1);
240    }
241
242    void setOnceThrough()
243    {
244        m_onceThrough = true;
245    }
246
247    bool onceThrough()
248    {
249        return m_onceThrough;
250    }
251
252    Vector<PatternTerm> m_terms;
253    PatternDisjunction* m_parent;
254    unsigned m_minimumSize;
255    bool m_onceThrough : 1;
256    bool m_hasFixedSize : 1;
257    bool m_startsWithBOL : 1;
258    bool m_containsBOL : 1;
259};
260
261struct PatternDisjunction {
262    WTF_MAKE_FAST_ALLOCATED;
263public:
264    PatternDisjunction(PatternAlternative* parent = 0)
265        : m_parent(parent)
266        , m_hasFixedSize(false)
267    {
268    }
269
270    PatternAlternative* addNewAlternative()
271    {
272        PatternAlternative* alternative = new PatternAlternative(this);
273        m_alternatives.append(adoptPtr(alternative));
274        return alternative;
275    }
276
277    Vector<OwnPtr<PatternAlternative>> m_alternatives;
278    PatternAlternative* m_parent;
279    unsigned m_minimumSize;
280    unsigned m_callFrameSize;
281    bool m_hasFixedSize;
282};
283
284// You probably don't want to be calling these functions directly
285// (please to be calling newlineCharacterClass() et al on your
286// friendly neighborhood YarrPattern instance to get nicely
287// cached copies).
288CharacterClass* newlineCreate();
289CharacterClass* digitsCreate();
290CharacterClass* spacesCreate();
291CharacterClass* wordcharCreate();
292CharacterClass* nondigitsCreate();
293CharacterClass* nonspacesCreate();
294CharacterClass* nonwordcharCreate();
295
296struct TermChain {
297    TermChain(PatternTerm term)
298        : term(term)
299    {}
300
301    PatternTerm term;
302    Vector<TermChain> hotTerms;
303};
304
305struct YarrPattern {
306    JS_EXPORT_PRIVATE YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error);
307
308    void reset()
309    {
310        m_numSubpatterns = 0;
311        m_maxBackReference = 0;
312
313        m_containsBackreferences = false;
314        m_containsBOL = false;
315        m_containsUnsignedLengthPattern = false;
316
317        newlineCached = 0;
318        digitsCached = 0;
319        spacesCached = 0;
320        wordcharCached = 0;
321        nondigitsCached = 0;
322        nonspacesCached = 0;
323        nonwordcharCached = 0;
324
325        m_disjunctions.clear();
326        m_userCharacterClasses.clear();
327    }
328
329    bool containsIllegalBackReference()
330    {
331        return m_maxBackReference > m_numSubpatterns;
332    }
333
334    bool containsUnsignedLengthPattern()
335    {
336        return m_containsUnsignedLengthPattern;
337    }
338
339    CharacterClass* newlineCharacterClass()
340    {
341        if (!newlineCached)
342            m_userCharacterClasses.append(adoptPtr(newlineCached = newlineCreate()));
343        return newlineCached;
344    }
345    CharacterClass* digitsCharacterClass()
346    {
347        if (!digitsCached)
348            m_userCharacterClasses.append(adoptPtr(digitsCached = digitsCreate()));
349        return digitsCached;
350    }
351    CharacterClass* spacesCharacterClass()
352    {
353        if (!spacesCached)
354            m_userCharacterClasses.append(adoptPtr(spacesCached = spacesCreate()));
355        return spacesCached;
356    }
357    CharacterClass* wordcharCharacterClass()
358    {
359        if (!wordcharCached)
360            m_userCharacterClasses.append(adoptPtr(wordcharCached = wordcharCreate()));
361        return wordcharCached;
362    }
363    CharacterClass* nondigitsCharacterClass()
364    {
365        if (!nondigitsCached)
366            m_userCharacterClasses.append(adoptPtr(nondigitsCached = nondigitsCreate()));
367        return nondigitsCached;
368    }
369    CharacterClass* nonspacesCharacterClass()
370    {
371        if (!nonspacesCached)
372            m_userCharacterClasses.append(adoptPtr(nonspacesCached = nonspacesCreate()));
373        return nonspacesCached;
374    }
375    CharacterClass* nonwordcharCharacterClass()
376    {
377        if (!nonwordcharCached)
378            m_userCharacterClasses.append(adoptPtr(nonwordcharCached = nonwordcharCreate()));
379        return nonwordcharCached;
380    }
381
382    bool m_ignoreCase : 1;
383    bool m_multiline : 1;
384    bool m_containsBackreferences : 1;
385    bool m_containsBOL : 1;
386    bool m_containsUnsignedLengthPattern : 1;
387    unsigned m_numSubpatterns;
388    unsigned m_maxBackReference;
389    PatternDisjunction* m_body;
390    Vector<OwnPtr<PatternDisjunction>, 4> m_disjunctions;
391    Vector<OwnPtr<CharacterClass>> m_userCharacterClasses;
392
393private:
394    const char* compile(const String& patternString);
395
396    CharacterClass* newlineCached;
397    CharacterClass* digitsCached;
398    CharacterClass* spacesCached;
399    CharacterClass* wordcharCached;
400    CharacterClass* nondigitsCached;
401    CharacterClass* nonspacesCached;
402    CharacterClass* nonwordcharCached;
403};
404
405} } // namespace JSC::Yarr
406
407#endif // YarrPattern_h
408