1/*
2 * Copyright (C) 2009, 2010 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#ifndef YarrInterpreter_h
27#define YarrInterpreter_h
28
29#include "YarrPattern.h"
30#include <wtf/PassOwnPtr.h>
31
32namespace WTF {
33class BumpPointerAllocator;
34}
35using WTF::BumpPointerAllocator;
36
37namespace JSC { namespace Yarr {
38
39class ByteDisjunction;
40
41struct ByteTerm {
42    enum Type {
43        TypeBodyAlternativeBegin,
44        TypeBodyAlternativeDisjunction,
45        TypeBodyAlternativeEnd,
46        TypeAlternativeBegin,
47        TypeAlternativeDisjunction,
48        TypeAlternativeEnd,
49        TypeSubpatternBegin,
50        TypeSubpatternEnd,
51        TypeAssertionBOL,
52        TypeAssertionEOL,
53        TypeAssertionWordBoundary,
54        TypePatternCharacterOnce,
55        TypePatternCharacterFixed,
56        TypePatternCharacterGreedy,
57        TypePatternCharacterNonGreedy,
58        TypePatternCasedCharacterOnce,
59        TypePatternCasedCharacterFixed,
60        TypePatternCasedCharacterGreedy,
61        TypePatternCasedCharacterNonGreedy,
62        TypeCharacterClass,
63        TypeBackReference,
64        TypeParenthesesSubpattern,
65        TypeParenthesesSubpatternOnceBegin,
66        TypeParenthesesSubpatternOnceEnd,
67        TypeParenthesesSubpatternTerminalBegin,
68        TypeParenthesesSubpatternTerminalEnd,
69        TypeParentheticalAssertionBegin,
70        TypeParentheticalAssertionEnd,
71        TypeCheckInput,
72        TypeUncheckInput,
73        TypeDotStarEnclosure,
74    } type;
75    union {
76        struct {
77            union {
78                UChar patternCharacter;
79                struct {
80                    UChar lo;
81                    UChar hi;
82                } casedCharacter;
83                CharacterClass* characterClass;
84                unsigned subpatternId;
85            };
86            union {
87                ByteDisjunction* parenthesesDisjunction;
88                unsigned parenthesesWidth;
89            };
90            QuantifierType quantityType;
91            unsigned quantityCount;
92        } atom;
93        struct {
94            int next;
95            int end;
96            bool onceThrough;
97        } alternative;
98        struct {
99            bool m_bol : 1;
100            bool m_eol : 1;
101        } anchors;
102        unsigned checkInputCount;
103    };
104    unsigned frameLocation;
105    bool m_capture : 1;
106    bool m_invert : 1;
107    unsigned inputPosition;
108
109    ByteTerm(UChar ch, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
110        : frameLocation(frameLocation)
111        , m_capture(false)
112        , m_invert(false)
113    {
114        switch (quantityType) {
115        case QuantifierFixedCount:
116            type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed;
117            break;
118        case QuantifierGreedy:
119            type = ByteTerm::TypePatternCharacterGreedy;
120            break;
121        case QuantifierNonGreedy:
122            type = ByteTerm::TypePatternCharacterNonGreedy;
123            break;
124        }
125
126        atom.patternCharacter = ch;
127        atom.quantityType = quantityType;
128        atom.quantityCount = quantityCount.unsafeGet();
129        inputPosition = inputPos;
130    }
131
132    ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
133        : frameLocation(frameLocation)
134        , m_capture(false)
135        , m_invert(false)
136    {
137        switch (quantityType) {
138        case QuantifierFixedCount:
139            type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed;
140            break;
141        case QuantifierGreedy:
142            type = ByteTerm::TypePatternCasedCharacterGreedy;
143            break;
144        case QuantifierNonGreedy:
145            type = ByteTerm::TypePatternCasedCharacterNonGreedy;
146            break;
147        }
148
149        atom.casedCharacter.lo = lo;
150        atom.casedCharacter.hi = hi;
151        atom.quantityType = quantityType;
152        atom.quantityCount = quantityCount.unsafeGet();
153        inputPosition = inputPos;
154    }
155
156    ByteTerm(CharacterClass* characterClass, bool invert, int inputPos)
157        : type(ByteTerm::TypeCharacterClass)
158        , m_capture(false)
159        , m_invert(invert)
160    {
161        atom.characterClass = characterClass;
162        atom.quantityType = QuantifierFixedCount;
163        atom.quantityCount = 1;
164        inputPosition = inputPos;
165    }
166
167    ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos)
168        : type(type)
169        , m_capture(capture)
170        , m_invert(false)
171    {
172        atom.subpatternId = subpatternId;
173        atom.parenthesesDisjunction = parenthesesInfo;
174        atom.quantityType = QuantifierFixedCount;
175        atom.quantityCount = 1;
176        inputPosition = inputPos;
177    }
178
179    ByteTerm(Type type, bool invert = false)
180        : type(type)
181        , m_capture(false)
182        , m_invert(invert)
183    {
184        atom.quantityType = QuantifierFixedCount;
185        atom.quantityCount = 1;
186    }
187
188    ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos)
189        : type(type)
190        , m_capture(capture)
191        , m_invert(invert)
192    {
193        atom.subpatternId = subpatternId;
194        atom.quantityType = QuantifierFixedCount;
195        atom.quantityCount = 1;
196        inputPosition = inputPos;
197    }
198
199    static ByteTerm BOL(int inputPos)
200    {
201        ByteTerm term(TypeAssertionBOL);
202        term.inputPosition = inputPos;
203        return term;
204    }
205
206    static ByteTerm CheckInput(Checked<unsigned> count)
207    {
208        ByteTerm term(TypeCheckInput);
209        term.checkInputCount = count.unsafeGet();
210        return term;
211    }
212
213    static ByteTerm UncheckInput(Checked<unsigned> count)
214    {
215        ByteTerm term(TypeUncheckInput);
216        term.checkInputCount = count.unsafeGet();
217        return term;
218    }
219
220    static ByteTerm EOL(int inputPos)
221    {
222        ByteTerm term(TypeAssertionEOL);
223        term.inputPosition = inputPos;
224        return term;
225    }
226
227    static ByteTerm WordBoundary(bool invert, int inputPos)
228    {
229        ByteTerm term(TypeAssertionWordBoundary, invert);
230        term.inputPosition = inputPos;
231        return term;
232    }
233
234    static ByteTerm BackReference(unsigned subpatternId, int inputPos)
235    {
236        return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos);
237    }
238
239    static ByteTerm BodyAlternativeBegin(bool onceThrough)
240    {
241        ByteTerm term(TypeBodyAlternativeBegin);
242        term.alternative.next = 0;
243        term.alternative.end = 0;
244        term.alternative.onceThrough = onceThrough;
245        return term;
246    }
247
248    static ByteTerm BodyAlternativeDisjunction(bool onceThrough)
249    {
250        ByteTerm term(TypeBodyAlternativeDisjunction);
251        term.alternative.next = 0;
252        term.alternative.end = 0;
253        term.alternative.onceThrough = onceThrough;
254        return term;
255    }
256
257    static ByteTerm BodyAlternativeEnd()
258    {
259        ByteTerm term(TypeBodyAlternativeEnd);
260        term.alternative.next = 0;
261        term.alternative.end = 0;
262        term.alternative.onceThrough = false;
263        return term;
264    }
265
266    static ByteTerm AlternativeBegin()
267    {
268        ByteTerm term(TypeAlternativeBegin);
269        term.alternative.next = 0;
270        term.alternative.end = 0;
271        term.alternative.onceThrough = false;
272        return term;
273    }
274
275    static ByteTerm AlternativeDisjunction()
276    {
277        ByteTerm term(TypeAlternativeDisjunction);
278        term.alternative.next = 0;
279        term.alternative.end = 0;
280        term.alternative.onceThrough = false;
281        return term;
282    }
283
284    static ByteTerm AlternativeEnd()
285    {
286        ByteTerm term(TypeAlternativeEnd);
287        term.alternative.next = 0;
288        term.alternative.end = 0;
289        term.alternative.onceThrough = false;
290        return term;
291    }
292
293    static ByteTerm SubpatternBegin()
294    {
295        return ByteTerm(TypeSubpatternBegin);
296    }
297
298    static ByteTerm SubpatternEnd()
299    {
300        return ByteTerm(TypeSubpatternEnd);
301    }
302
303    static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor)
304    {
305        ByteTerm term(TypeDotStarEnclosure);
306        term.anchors.m_bol = bolAnchor;
307        term.anchors.m_eol = eolAnchor;
308        return term;
309    }
310
311    bool invert()
312    {
313        return m_invert;
314    }
315
316    bool capture()
317    {
318        return m_capture;
319    }
320};
321
322class ByteDisjunction {
323    WTF_MAKE_FAST_ALLOCATED;
324public:
325    ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
326        : m_numSubpatterns(numSubpatterns)
327        , m_frameSize(frameSize)
328    {
329    }
330
331    Vector<ByteTerm> terms;
332    unsigned m_numSubpatterns;
333    unsigned m_frameSize;
334};
335
336struct BytecodePattern {
337    WTF_MAKE_FAST_ALLOCATED;
338public:
339    BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<OwnPtr<ByteDisjunction>>& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator)
340        : m_body(body)
341        , m_ignoreCase(pattern.m_ignoreCase)
342        , m_multiline(pattern.m_multiline)
343        , m_allocator(allocator)
344    {
345        m_body->terms.shrinkToFit();
346
347        newlineCharacterClass = pattern.newlineCharacterClass();
348        wordcharCharacterClass = pattern.wordcharCharacterClass();
349
350        m_allParenthesesInfo.swap(parenthesesInfoToAdopt);
351        m_allParenthesesInfo.shrinkToFit();
352
353        m_userCharacterClasses.swap(pattern.m_userCharacterClasses);
354        m_userCharacterClasses.shrinkToFit();
355    }
356
357    OwnPtr<ByteDisjunction> m_body;
358    bool m_ignoreCase;
359    bool m_multiline;
360    // Each BytecodePattern is associated with a RegExp, each RegExp is associated
361    // with a VM.  Cache a pointer to out VM's m_regExpAllocator.
362    BumpPointerAllocator* m_allocator;
363
364    CharacterClass* newlineCharacterClass;
365    CharacterClass* wordcharCharacterClass;
366
367private:
368    Vector<OwnPtr<ByteDisjunction>> m_allParenthesesInfo;
369    Vector<OwnPtr<CharacterClass>> m_userCharacterClasses;
370};
371
372JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*);
373JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output);
374unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output);
375unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output);
376
377} } // namespace JSC::Yarr
378
379#endif // YarrInterpreter_h
380