1/*
2 *  Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org)
3 *  Copyright (c) 2007, 2008 Apple Inc. All rights reserved.
4 *  Copyright (C) 2009 Torch Mobile, Inc.
5 *  Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
6 *
7 *  This library is free software; you can redistribute it and/or
8 *  modify it under the terms of the GNU Lesser General Public
9 *  License as published by the Free Software Foundation; either
10 *  version 2 of the License, or (at your option) any later version.
11 *
12 *  This library is distributed in the hope that it will be useful,
13 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 *  Lesser General Public License for more details.
16 *
17 *  You should have received a copy of the GNU Lesser General Public
18 *  License along with this library; if not, write to the Free Software
19 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
20 *
21 */
22
23#include "config.h"
24#include "RegExp.h"
25
26#include "Lexer.h"
27#include "Operations.h"
28#include "RegExpCache.h"
29#include "Yarr.h"
30#include "YarrJIT.h"
31#include <stdio.h>
32#include <stdlib.h>
33#include <string.h>
34#include <wtf/Assertions.h>
35#include <wtf/OwnArrayPtr.h>
36
37#define REGEXP_FUNC_TEST_DATA_GEN 0
38
39namespace JSC {
40
41const ClassInfo RegExp::s_info = { "RegExp", 0, 0, 0, CREATE_METHOD_TABLE(RegExp) };
42
43RegExpFlags regExpFlags(const String& string)
44{
45    RegExpFlags flags = NoFlags;
46
47    for (unsigned i = 0; i < string.length(); ++i) {
48        switch (string[i]) {
49        case 'g':
50            if (flags & FlagGlobal)
51                return InvalidFlags;
52            flags = static_cast<RegExpFlags>(flags | FlagGlobal);
53            break;
54
55        case 'i':
56            if (flags & FlagIgnoreCase)
57                return InvalidFlags;
58            flags = static_cast<RegExpFlags>(flags | FlagIgnoreCase);
59            break;
60
61        case 'm':
62            if (flags & FlagMultiline)
63                return InvalidFlags;
64            flags = static_cast<RegExpFlags>(flags | FlagMultiline);
65            break;
66
67        default:
68            return InvalidFlags;
69        }
70    }
71
72    return flags;
73}
74
75#if REGEXP_FUNC_TEST_DATA_GEN
76class RegExpFunctionalTestCollector {
77    // This class is not thread safe.
78protected:
79    static const char* const s_fileName;
80
81public:
82    static RegExpFunctionalTestCollector* get();
83
84    ~RegExpFunctionalTestCollector();
85
86    void outputOneTest(RegExp*, String, int, int*, int);
87    void clearRegExp(RegExp* regExp)
88    {
89        if (regExp == m_lastRegExp)
90            m_lastRegExp = 0;
91    }
92
93private:
94    RegExpFunctionalTestCollector();
95
96    void outputEscapedString(const String&, bool escapeSlash = false);
97
98    static RegExpFunctionalTestCollector* s_instance;
99    FILE* m_file;
100    RegExp* m_lastRegExp;
101};
102
103const char* const RegExpFunctionalTestCollector::s_fileName = "/tmp/RegExpTestsData";
104RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::s_instance = 0;
105
106RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::get()
107{
108    if (!s_instance)
109        s_instance = new RegExpFunctionalTestCollector();
110
111    return s_instance;
112}
113
114void RegExpFunctionalTestCollector::outputOneTest(RegExp* regExp, String s, int startOffset, int* ovector, int result)
115{
116    if ((!m_lastRegExp) || (m_lastRegExp != regExp)) {
117        m_lastRegExp = regExp;
118        fputc('/', m_file);
119        outputEscapedString(regExp->pattern(), true);
120        fputc('/', m_file);
121        if (regExp->global())
122            fputc('g', m_file);
123        if (regExp->ignoreCase())
124            fputc('i', m_file);
125        if (regExp->multiline())
126            fputc('m', m_file);
127        fprintf(m_file, "\n");
128    }
129
130    fprintf(m_file, " \"");
131    outputEscapedString(s);
132    fprintf(m_file, "\", %d, %d, (", startOffset, result);
133    for (unsigned i = 0; i <= regExp->numSubpatterns(); i++) {
134        int subpatternBegin = ovector[i * 2];
135        int subpatternEnd = ovector[i * 2 + 1];
136        if (subpatternBegin == -1)
137            subpatternEnd = -1;
138        fprintf(m_file, "%d, %d", subpatternBegin, subpatternEnd);
139        if (i < regExp->numSubpatterns())
140            fputs(", ", m_file);
141    }
142
143    fprintf(m_file, ")\n");
144    fflush(m_file);
145}
146
147RegExpFunctionalTestCollector::RegExpFunctionalTestCollector()
148{
149    m_file = fopen(s_fileName, "r+");
150    if  (!m_file)
151        m_file = fopen(s_fileName, "w+");
152
153    fseek(m_file, 0L, SEEK_END);
154}
155
156RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector()
157{
158    fclose(m_file);
159    s_instance = 0;
160}
161
162void RegExpFunctionalTestCollector::outputEscapedString(const String& s, bool escapeSlash)
163{
164    int len = s.length();
165
166    for (int i = 0; i < len; ++i) {
167        UChar c = s[i];
168
169        switch (c) {
170        case '\0':
171            fputs("\\0", m_file);
172            break;
173        case '\a':
174            fputs("\\a", m_file);
175            break;
176        case '\b':
177            fputs("\\b", m_file);
178            break;
179        case '\f':
180            fputs("\\f", m_file);
181            break;
182        case '\n':
183            fputs("\\n", m_file);
184            break;
185        case '\r':
186            fputs("\\r", m_file);
187            break;
188        case '\t':
189            fputs("\\t", m_file);
190            break;
191        case '\v':
192            fputs("\\v", m_file);
193            break;
194        case '/':
195            if (escapeSlash)
196                fputs("\\/", m_file);
197            else
198                fputs("/", m_file);
199            break;
200        case '\"':
201            fputs("\\\"", m_file);
202            break;
203        case '\\':
204            fputs("\\\\", m_file);
205            break;
206        case '\?':
207            fputs("\?", m_file);
208            break;
209        default:
210            if (c > 0x7f)
211                fprintf(m_file, "\\u%04x", c);
212            else
213                fputc(c, m_file);
214            break;
215        }
216    }
217}
218#endif
219
220RegExp::RegExp(VM& vm, const String& patternString, RegExpFlags flags)
221    : JSCell(vm, vm.regExpStructure.get())
222    , m_state(NotCompiled)
223    , m_patternString(patternString)
224    , m_flags(flags)
225    , m_constructionError(0)
226    , m_numSubpatterns(0)
227#if ENABLE(REGEXP_TRACING)
228    , m_rtMatchCallCount(0)
229    , m_rtMatchFoundCount(0)
230#endif
231{
232}
233
234void RegExp::finishCreation(VM& vm)
235{
236    Base::finishCreation(vm);
237    Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
238    if (m_constructionError)
239        m_state = ParseError;
240    else
241        m_numSubpatterns = pattern.m_numSubpatterns;
242}
243
244void RegExp::destroy(JSCell* cell)
245{
246    RegExp* thisObject = static_cast<RegExp*>(cell);
247#if REGEXP_FUNC_TEST_DATA_GEN
248    RegExpFunctionalTestCollector::get()->clearRegExp(this);
249#endif
250    thisObject->RegExp::~RegExp();
251}
252
253RegExp* RegExp::createWithoutCaching(VM& vm, const String& patternString, RegExpFlags flags)
254{
255    RegExp* regExp = new (NotNull, allocateCell<RegExp>(vm.heap)) RegExp(vm, patternString, flags);
256    regExp->finishCreation(vm);
257    return regExp;
258}
259
260RegExp* RegExp::create(VM& vm, const String& patternString, RegExpFlags flags)
261{
262    return vm.regExpCache()->lookupOrCreate(patternString, flags);
263}
264
265void RegExp::compile(VM* vm, Yarr::YarrCharSize charSize)
266{
267    Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
268    if (m_constructionError) {
269        RELEASE_ASSERT_NOT_REACHED();
270        m_state = ParseError;
271        return;
272    }
273    ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
274
275    if (!hasCode()) {
276        ASSERT(m_state == NotCompiled);
277        vm->regExpCache()->addToStrongCache(this);
278        m_state = ByteCode;
279    }
280
281#if ENABLE(YARR_JIT)
282    if (!pattern.m_containsBackreferences && vm->canUseRegExpJIT()) {
283        Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode);
284#if ENABLE(YARR_JIT_DEBUG)
285        if (!m_regExpJITCode.isFallBack())
286            m_state = JITCode;
287        else
288            m_state = ByteCode;
289#else
290        if (!m_regExpJITCode.isFallBack()) {
291            m_state = JITCode;
292            return;
293        }
294#endif
295    }
296#else
297    UNUSED_PARAM(charSize);
298#endif
299
300    m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator);
301}
302
303void RegExp::compileIfNecessary(VM& vm, Yarr::YarrCharSize charSize)
304{
305    if (hasCode()) {
306#if ENABLE(YARR_JIT)
307        if (m_state != JITCode)
308            return;
309        if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCode()))
310            return;
311        if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCode()))
312            return;
313#else
314        return;
315#endif
316    }
317
318    compile(&vm, charSize);
319}
320
321int RegExp::match(VM& vm, const String& s, unsigned startOffset, Vector<int, 32>& ovector)
322{
323#if ENABLE(REGEXP_TRACING)
324    m_rtMatchCallCount++;
325#endif
326
327    ASSERT(m_state != ParseError);
328    compileIfNecessary(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16);
329
330    int offsetVectorSize = (m_numSubpatterns + 1) * 2;
331    ovector.resize(offsetVectorSize);
332    int* offsetVector = ovector.data();
333
334    int result;
335#if ENABLE(YARR_JIT)
336    if (m_state == JITCode) {
337        if (s.is8Bit())
338            result = m_regExpJITCode.execute(s.characters8(), startOffset, s.length(), offsetVector).start;
339        else
340            result = m_regExpJITCode.execute(s.characters16(), startOffset, s.length(), offsetVector).start;
341#if ENABLE(YARR_JIT_DEBUG)
342        matchCompareWithInterpreter(s, startOffset, offsetVector, result);
343#endif
344    } else
345#endif
346        result = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector));
347
348    // FIXME: The YARR engine should handle unsigned or size_t length matches.
349    // The YARR Interpreter is "unsigned" clean, while the YARR JIT hasn't been addressed.
350    // The offset vector handling needs to change as well.
351    // Right now we convert a match where the offsets overflowed into match failure.
352    // There are two places in WebCore that call the interpreter directly that need to
353    // have their offsets changed to int as well. They are platform/text/RegularExpression.cpp
354    // and inspector/ContentSearchUtils.cpp.
355    if (s.length() > INT_MAX) {
356        bool overflowed = false;
357
358        if (result < -1)
359            overflowed = true;
360
361        for (unsigned i = 0; i <= m_numSubpatterns; i++) {
362            if ((offsetVector[i*2] < -1) || ((offsetVector[i*2] >= 0) && (offsetVector[i*2+1] < -1))) {
363                overflowed = true;
364                offsetVector[i*2] = -1;
365                offsetVector[i*2+1] = -1;
366            }
367        }
368
369        if (overflowed)
370            result = -1;
371    }
372
373    ASSERT(result >= -1);
374
375#if REGEXP_FUNC_TEST_DATA_GEN
376    RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result);
377#endif
378
379#if ENABLE(REGEXP_TRACING)
380    if (result != -1)
381        m_rtMatchFoundCount++;
382#endif
383
384    return result;
385}
386
387void RegExp::compileMatchOnly(VM* vm, Yarr::YarrCharSize charSize)
388{
389    Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
390    if (m_constructionError) {
391        RELEASE_ASSERT_NOT_REACHED();
392        m_state = ParseError;
393        return;
394    }
395    ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
396
397    if (!hasCode()) {
398        ASSERT(m_state == NotCompiled);
399        vm->regExpCache()->addToStrongCache(this);
400        m_state = ByteCode;
401    }
402
403#if ENABLE(YARR_JIT)
404    if (!pattern.m_containsBackreferences && vm->canUseRegExpJIT()) {
405        Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode, Yarr::MatchOnly);
406#if ENABLE(YARR_JIT_DEBUG)
407        if (!m_regExpJITCode.isFallBack())
408            m_state = JITCode;
409        else
410            m_state = ByteCode;
411#else
412        if (!m_regExpJITCode.isFallBack()) {
413            m_state = JITCode;
414            return;
415        }
416#endif
417    }
418#else
419    UNUSED_PARAM(charSize);
420#endif
421
422    m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator);
423}
424
425void RegExp::compileIfNecessaryMatchOnly(VM& vm, Yarr::YarrCharSize charSize)
426{
427    if (hasCode()) {
428#if ENABLE(YARR_JIT)
429        if (m_state != JITCode)
430            return;
431        if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCodeMatchOnly()))
432            return;
433        if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCodeMatchOnly()))
434            return;
435#else
436        return;
437#endif
438    }
439
440    compileMatchOnly(&vm, charSize);
441}
442
443MatchResult RegExp::match(VM& vm, const String& s, unsigned startOffset)
444{
445#if ENABLE(REGEXP_TRACING)
446    m_rtMatchCallCount++;
447#endif
448
449    ASSERT(m_state != ParseError);
450    compileIfNecessaryMatchOnly(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16);
451
452#if ENABLE(YARR_JIT)
453    if (m_state == JITCode) {
454        MatchResult result = s.is8Bit() ?
455            m_regExpJITCode.execute(s.characters8(), startOffset, s.length()) :
456            m_regExpJITCode.execute(s.characters16(), startOffset, s.length());
457#if ENABLE(REGEXP_TRACING)
458        if (!result)
459            m_rtMatchFoundCount++;
460#endif
461        return result;
462    }
463#endif
464
465    int offsetVectorSize = (m_numSubpatterns + 1) * 2;
466    int* offsetVector;
467    Vector<int, 32> nonReturnedOvector;
468    nonReturnedOvector.resize(offsetVectorSize);
469    offsetVector = nonReturnedOvector.data();
470    int r = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector));
471#if REGEXP_FUNC_TEST_DATA_GEN
472    RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result);
473#endif
474
475    if (r >= 0) {
476#if ENABLE(REGEXP_TRACING)
477        m_rtMatchFoundCount++;
478#endif
479        return MatchResult(r, reinterpret_cast<unsigned*>(offsetVector)[1]);
480    }
481
482    return MatchResult::failed();
483}
484
485void RegExp::invalidateCode()
486{
487    if (!hasCode())
488        return;
489    m_state = NotCompiled;
490#if ENABLE(YARR_JIT)
491    m_regExpJITCode.clear();
492#endif
493    m_regExpBytecode.clear();
494}
495
496#if ENABLE(YARR_JIT_DEBUG)
497void RegExp::matchCompareWithInterpreter(const String& s, int startOffset, int* offsetVector, int jitResult)
498{
499    int offsetVectorSize = (m_numSubpatterns + 1) * 2;
500    Vector<int, 32> interpreterOvector;
501    interpreterOvector.resize(offsetVectorSize);
502    int* interpreterOffsetVector = interpreterOvector.data();
503    int interpreterResult = 0;
504    int differences = 0;
505
506    // Initialize interpreterOffsetVector with the return value (index 0) and the
507    // first subpattern start indicies (even index values) set to -1.
508    // No need to init the subpattern end indicies.
509    for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++)
510        interpreterOffsetVector[j] = -1;
511
512    interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, interpreterOffsetVector);
513
514    if (jitResult != interpreterResult)
515        differences++;
516
517    for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++)
518        if ((offsetVector[j] != interpreterOffsetVector[j])
519            || ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1])))
520            differences++;
521
522    if (differences) {
523        dataLogF("RegExp Discrepency for /%s/\n    string input ", pattern().utf8().data());
524        unsigned segmentLen = s.length() - static_cast<unsigned>(startOffset);
525
526        dataLogF((segmentLen < 150) ? "\"%s\"\n" : "\"%148s...\"\n", s.utf8().data() + startOffset);
527
528        if (jitResult != interpreterResult) {
529            dataLogF("    JIT result = %d, blah interpreted result = %d\n", jitResult, interpreterResult);
530            differences--;
531        } else {
532            dataLogF("    Correct result = %d\n", jitResult);
533        }
534
535        if (differences) {
536            for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) {
537                if (offsetVector[j] != interpreterOffsetVector[j])
538                    dataLogF("    JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j, offsetVector[j], j, interpreterOffsetVector[j]);
539                if ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1]))
540                    dataLogF("    JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j+1, offsetVector[j+1], j+1, interpreterOffsetVector[j+1]);
541            }
542        }
543    }
544}
545#endif
546
547#if ENABLE(REGEXP_TRACING)
548    void RegExp::printTraceData()
549    {
550        char formattedPattern[41];
551        char rawPattern[41];
552
553        strncpy(rawPattern, pattern().utf8().data(), 40);
554        rawPattern[40]= '\0';
555
556        int pattLen = strlen(rawPattern);
557
558        snprintf(formattedPattern, 41, (pattLen <= 38) ? "/%.38s/" : "/%.36s...", rawPattern);
559
560#if ENABLE(YARR_JIT)
561        Yarr::YarrCodeBlock& codeBlock = m_regExpJITCode;
562
563        const size_t jitAddrSize = 20;
564        char jitAddr[jitAddrSize];
565        if (m_state == JITCode)
566            snprintf(jitAddr, jitAddrSize, "fallback");
567        else
568            snprintf(jitAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.getAddr()));
569#else
570        const char* jitAddr = "JIT Off";
571#endif
572
573        printf("%-40.40s %16.16s %10d %10d\n", formattedPattern, jitAddr, m_rtMatchCallCount, m_rtMatchFoundCount);
574    }
575#endif
576
577} // namespace JSC
578