1/*
2 * Copyright (C) 2009, 2013 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#ifndef YarrPattern_h
28#define YarrPattern_h
29
30#include <wtf/CheckedArithmetic.h>
31#include <wtf/OwnPtr.h>
32#include <wtf/PassOwnPtr.h>
33#include <wtf/RefCounted.h>
34#include <wtf/Vector.h>
35#include <wtf/text/WTFString.h>
36#include <wtf/unicode/Unicode.h>
37
38namespace JSC { namespace Yarr {
39
40struct PatternDisjunction;
41
42struct CharacterRange {
43    UChar begin;
44    UChar end;
45
46    CharacterRange(UChar begin, UChar end)
47        : begin(begin)
48        , end(end)
49    {
50    }
51};
52
53struct CharacterClass {
54    WTF_MAKE_FAST_ALLOCATED;
55public:
56    // All CharacterClass instances have to have the full set of matches and ranges,
57    // they may have an optional m_table for faster lookups (which must match the
58    // specified matches and ranges)
59    CharacterClass()
60        : m_table(0)
61    {
62    }
63    CharacterClass(const char* table, bool inverted)
64        : m_table(table)
65        , m_tableInverted(inverted)
66    {
67    }
68    Vector<UChar> m_matches;
69    Vector<CharacterRange> m_ranges;
70    Vector<UChar> m_matchesUnicode;
71    Vector<CharacterRange> m_rangesUnicode;
72
73    const char* m_table;
74    bool m_tableInverted;
75};
76
77enum QuantifierType {
78    QuantifierFixedCount,
79    QuantifierGreedy,
80    QuantifierNonGreedy,
81};
82
83struct PatternTerm {
84    enum Type {
85        TypeAssertionBOL,
86        TypeAssertionEOL,
87        TypeAssertionWordBoundary,
88        TypePatternCharacter,
89        TypeCharacterClass,
90        TypeBackReference,
91        TypeForwardReference,
92        TypeParenthesesSubpattern,
93        TypeParentheticalAssertion,
94        TypeDotStarEnclosure,
95    } type;
96    bool m_capture :1;
97    bool m_invert :1;
98    union {
99        UChar patternCharacter;
100        CharacterClass* characterClass;
101        unsigned backReferenceSubpatternId;
102        struct {
103            PatternDisjunction* disjunction;
104            unsigned subpatternId;
105            unsigned lastSubpatternId;
106            bool isCopy;
107            bool isTerminal;
108        } parentheses;
109        struct {
110            bool bolAnchor : 1;
111            bool eolAnchor : 1;
112        } anchors;
113    };
114    QuantifierType quantityType;
115    Checked<unsigned> quantityCount;
116    int inputPosition;
117    unsigned frameLocation;
118
119    PatternTerm(UChar ch)
120        : type(PatternTerm::TypePatternCharacter)
121        , m_capture(false)
122        , m_invert(false)
123    {
124        patternCharacter = ch;
125        quantityType = QuantifierFixedCount;
126        quantityCount = 1;
127    }
128
129    PatternTerm(CharacterClass* charClass, bool invert)
130        : type(PatternTerm::TypeCharacterClass)
131        , m_capture(false)
132        , m_invert(invert)
133    {
134        characterClass = charClass;
135        quantityType = QuantifierFixedCount;
136        quantityCount = 1;
137    }
138
139    PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false)
140        : type(type)
141        , m_capture(capture)
142        , m_invert(invert)
143    {
144        parentheses.disjunction = disjunction;
145        parentheses.subpatternId = subpatternId;
146        parentheses.isCopy = false;
147        parentheses.isTerminal = false;
148        quantityType = QuantifierFixedCount;
149        quantityCount = 1;
150    }
151
152    PatternTerm(Type type, bool invert = false)
153        : type(type)
154        , m_capture(false)
155        , m_invert(invert)
156    {
157        quantityType = QuantifierFixedCount;
158        quantityCount = 1;
159    }
160
161    PatternTerm(unsigned spatternId)
162        : type(TypeBackReference)
163        , m_capture(false)
164        , m_invert(false)
165    {
166        backReferenceSubpatternId = spatternId;
167        quantityType = QuantifierFixedCount;
168        quantityCount = 1;
169    }
170
171    PatternTerm(bool bolAnchor, bool eolAnchor)
172        : type(TypeDotStarEnclosure)
173        , m_capture(false)
174        , m_invert(false)
175    {
176        anchors.bolAnchor = bolAnchor;
177        anchors.eolAnchor = eolAnchor;
178        quantityType = QuantifierFixedCount;
179        quantityCount = 1;
180    }
181
182    static PatternTerm ForwardReference()
183    {
184        return PatternTerm(TypeForwardReference);
185    }
186
187    static PatternTerm BOL()
188    {
189        return PatternTerm(TypeAssertionBOL);
190    }
191
192    static PatternTerm EOL()
193    {
194        return PatternTerm(TypeAssertionEOL);
195    }
196
197    static PatternTerm WordBoundary(bool invert)
198    {
199        return PatternTerm(TypeAssertionWordBoundary, invert);
200    }
201
202    bool invert()
203    {
204        return m_invert;
205    }
206
207    bool capture()
208    {
209        return m_capture;
210    }
211
212    void quantify(unsigned count, QuantifierType type)
213    {
214        quantityCount = count;
215        quantityType = type;
216    }
217};
218
219struct PatternAlternative {
220    WTF_MAKE_FAST_ALLOCATED;
221public:
222    PatternAlternative(PatternDisjunction* disjunction)
223        : m_parent(disjunction)
224        , m_onceThrough(false)
225        , m_hasFixedSize(false)
226        , m_startsWithBOL(false)
227        , m_containsBOL(false)
228    {
229    }
230
231    PatternTerm& lastTerm()
232    {
233        ASSERT(m_terms.size());
234        return m_terms[m_terms.size() - 1];
235    }
236
237    void removeLastTerm()
238    {
239        ASSERT(m_terms.size());
240        m_terms.shrink(m_terms.size() - 1);
241    }
242
243    void setOnceThrough()
244    {
245        m_onceThrough = true;
246    }
247
248    bool onceThrough()
249    {
250        return m_onceThrough;
251    }
252
253    Vector<PatternTerm> m_terms;
254    PatternDisjunction* m_parent;
255    unsigned m_minimumSize;
256    bool m_onceThrough : 1;
257    bool m_hasFixedSize : 1;
258    bool m_startsWithBOL : 1;
259    bool m_containsBOL : 1;
260};
261
262struct PatternDisjunction {
263    WTF_MAKE_FAST_ALLOCATED;
264public:
265    PatternDisjunction(PatternAlternative* parent = 0)
266        : m_parent(parent)
267        , m_hasFixedSize(false)
268    {
269    }
270
271    PatternAlternative* addNewAlternative()
272    {
273        PatternAlternative* alternative = new PatternAlternative(this);
274        m_alternatives.append(adoptPtr(alternative));
275        return alternative;
276    }
277
278    Vector<OwnPtr<PatternAlternative> > m_alternatives;
279    PatternAlternative* m_parent;
280    unsigned m_minimumSize;
281    unsigned m_callFrameSize;
282    bool m_hasFixedSize;
283};
284
285// You probably don't want to be calling these functions directly
286// (please to be calling newlineCharacterClass() et al on your
287// friendly neighborhood YarrPattern instance to get nicely
288// cached copies).
289CharacterClass* newlineCreate();
290CharacterClass* digitsCreate();
291CharacterClass* spacesCreate();
292CharacterClass* wordcharCreate();
293CharacterClass* nondigitsCreate();
294CharacterClass* nonspacesCreate();
295CharacterClass* nonwordcharCreate();
296
297struct TermChain {
298    TermChain(PatternTerm term)
299        : term(term)
300    {}
301
302    PatternTerm term;
303    Vector<TermChain> hotTerms;
304};
305
306struct YarrPattern {
307    JS_EXPORT_PRIVATE YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error);
308
309    void reset()
310    {
311        m_numSubpatterns = 0;
312        m_maxBackReference = 0;
313
314        m_containsBackreferences = false;
315        m_containsBOL = false;
316
317        newlineCached = 0;
318        digitsCached = 0;
319        spacesCached = 0;
320        wordcharCached = 0;
321        nondigitsCached = 0;
322        nonspacesCached = 0;
323        nonwordcharCached = 0;
324
325        m_disjunctions.clear();
326        m_userCharacterClasses.clear();
327    }
328
329    bool containsIllegalBackReference()
330    {
331        return m_maxBackReference > m_numSubpatterns;
332    }
333
334    CharacterClass* newlineCharacterClass()
335    {
336        if (!newlineCached)
337            m_userCharacterClasses.append(adoptPtr(newlineCached = newlineCreate()));
338        return newlineCached;
339    }
340    CharacterClass* digitsCharacterClass()
341    {
342        if (!digitsCached)
343            m_userCharacterClasses.append(adoptPtr(digitsCached = digitsCreate()));
344        return digitsCached;
345    }
346    CharacterClass* spacesCharacterClass()
347    {
348        if (!spacesCached)
349            m_userCharacterClasses.append(adoptPtr(spacesCached = spacesCreate()));
350        return spacesCached;
351    }
352    CharacterClass* wordcharCharacterClass()
353    {
354        if (!wordcharCached)
355            m_userCharacterClasses.append(adoptPtr(wordcharCached = wordcharCreate()));
356        return wordcharCached;
357    }
358    CharacterClass* nondigitsCharacterClass()
359    {
360        if (!nondigitsCached)
361            m_userCharacterClasses.append(adoptPtr(nondigitsCached = nondigitsCreate()));
362        return nondigitsCached;
363    }
364    CharacterClass* nonspacesCharacterClass()
365    {
366        if (!nonspacesCached)
367            m_userCharacterClasses.append(adoptPtr(nonspacesCached = nonspacesCreate()));
368        return nonspacesCached;
369    }
370    CharacterClass* nonwordcharCharacterClass()
371    {
372        if (!nonwordcharCached)
373            m_userCharacterClasses.append(adoptPtr(nonwordcharCached = nonwordcharCreate()));
374        return nonwordcharCached;
375    }
376
377    bool m_ignoreCase : 1;
378    bool m_multiline : 1;
379    bool m_containsBackreferences : 1;
380    bool m_containsBOL : 1;
381    unsigned m_numSubpatterns;
382    unsigned m_maxBackReference;
383    PatternDisjunction* m_body;
384    Vector<OwnPtr<PatternDisjunction>, 4> m_disjunctions;
385    Vector<OwnPtr<CharacterClass> > m_userCharacterClasses;
386
387private:
388    const char* compile(const String& patternString);
389
390    CharacterClass* newlineCached;
391    CharacterClass* digitsCached;
392    CharacterClass* spacesCached;
393    CharacterClass* wordcharCached;
394    CharacterClass* nondigitsCached;
395    CharacterClass* nonspacesCached;
396    CharacterClass* nonwordcharCached;
397};
398
399} } // namespace JSC::Yarr
400
401#endif // YarrPattern_h
402