1/*
2 *  Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org)
3 *  Copyright (c) 2007, 2008 Apple Inc. All rights reserved.
4 *  Copyright (C) 2009 Torch Mobile, Inc.
5 *  Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
6 *
7 *  This library is free software; you can redistribute it and/or
8 *  modify it under the terms of the GNU Lesser General Public
9 *  License as published by the Free Software Foundation; either
10 *  version 2 of the License, or (at your option) any later version.
11 *
12 *  This library is distributed in the hope that it will be useful,
13 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 *  Lesser General Public License for more details.
16 *
17 *  You should have received a copy of the GNU Lesser General Public
18 *  License along with this library; if not, write to the Free Software
19 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
20 *
21 */
22
23#include "config.h"
24#include "RegExp.h"
25
26#include "Lexer.h"
27#include "JSCInlines.h"
28#include "RegExpCache.h"
29#include "Yarr.h"
30#include "YarrJIT.h"
31#include <wtf/Assertions.h>
32
33#define REGEXP_FUNC_TEST_DATA_GEN 0
34
35#if REGEXP_FUNC_TEST_DATA_GEN
36#include <stdio.h>
37#include <stdlib.h>
38#include <string.h>
39#endif
40
41namespace JSC {
42
43const ClassInfo RegExp::s_info = { "RegExp", 0, 0, 0, CREATE_METHOD_TABLE(RegExp) };
44
45RegExpFlags regExpFlags(const String& string)
46{
47    RegExpFlags flags = NoFlags;
48
49    for (unsigned i = 0; i < string.length(); ++i) {
50        switch (string[i]) {
51        case 'g':
52            if (flags & FlagGlobal)
53                return InvalidFlags;
54            flags = static_cast<RegExpFlags>(flags | FlagGlobal);
55            break;
56
57        case 'i':
58            if (flags & FlagIgnoreCase)
59                return InvalidFlags;
60            flags = static_cast<RegExpFlags>(flags | FlagIgnoreCase);
61            break;
62
63        case 'm':
64            if (flags & FlagMultiline)
65                return InvalidFlags;
66            flags = static_cast<RegExpFlags>(flags | FlagMultiline);
67            break;
68
69        default:
70            return InvalidFlags;
71        }
72    }
73
74    return flags;
75}
76
77#if REGEXP_FUNC_TEST_DATA_GEN
78class RegExpFunctionalTestCollector {
79    // This class is not thread safe.
80protected:
81    static const char* const s_fileName;
82
83public:
84    static RegExpFunctionalTestCollector* get();
85
86    ~RegExpFunctionalTestCollector();
87
88    void outputOneTest(RegExp*, String, int, int*, int);
89    void clearRegExp(RegExp* regExp)
90    {
91        if (regExp == m_lastRegExp)
92            m_lastRegExp = 0;
93    }
94
95private:
96    RegExpFunctionalTestCollector();
97
98    void outputEscapedString(const String&, bool escapeSlash = false);
99
100    static RegExpFunctionalTestCollector* s_instance;
101    FILE* m_file;
102    RegExp* m_lastRegExp;
103};
104
105const char* const RegExpFunctionalTestCollector::s_fileName = "/tmp/RegExpTestsData";
106RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::s_instance = 0;
107
108RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::get()
109{
110    if (!s_instance)
111        s_instance = new RegExpFunctionalTestCollector();
112
113    return s_instance;
114}
115
116void RegExpFunctionalTestCollector::outputOneTest(RegExp* regExp, String s, int startOffset, int* ovector, int result)
117{
118    if ((!m_lastRegExp) || (m_lastRegExp != regExp)) {
119        m_lastRegExp = regExp;
120        fputc('/', m_file);
121        outputEscapedString(regExp->pattern(), true);
122        fputc('/', m_file);
123        if (regExp->global())
124            fputc('g', m_file);
125        if (regExp->ignoreCase())
126            fputc('i', m_file);
127        if (regExp->multiline())
128            fputc('m', m_file);
129        fprintf(m_file, "\n");
130    }
131
132    fprintf(m_file, " \"");
133    outputEscapedString(s);
134    fprintf(m_file, "\", %d, %d, (", startOffset, result);
135    for (unsigned i = 0; i <= regExp->numSubpatterns(); i++) {
136        int subpatternBegin = ovector[i * 2];
137        int subpatternEnd = ovector[i * 2 + 1];
138        if (subpatternBegin == -1)
139            subpatternEnd = -1;
140        fprintf(m_file, "%d, %d", subpatternBegin, subpatternEnd);
141        if (i < regExp->numSubpatterns())
142            fputs(", ", m_file);
143    }
144
145    fprintf(m_file, ")\n");
146    fflush(m_file);
147}
148
149RegExpFunctionalTestCollector::RegExpFunctionalTestCollector()
150{
151    m_file = fopen(s_fileName, "r+");
152    if  (!m_file)
153        m_file = fopen(s_fileName, "w+");
154
155    fseek(m_file, 0L, SEEK_END);
156}
157
158RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector()
159{
160    fclose(m_file);
161    s_instance = 0;
162}
163
164void RegExpFunctionalTestCollector::outputEscapedString(const String& s, bool escapeSlash)
165{
166    int len = s.length();
167
168    for (int i = 0; i < len; ++i) {
169        UChar c = s[i];
170
171        switch (c) {
172        case '\0':
173            fputs("\\0", m_file);
174            break;
175        case '\a':
176            fputs("\\a", m_file);
177            break;
178        case '\b':
179            fputs("\\b", m_file);
180            break;
181        case '\f':
182            fputs("\\f", m_file);
183            break;
184        case '\n':
185            fputs("\\n", m_file);
186            break;
187        case '\r':
188            fputs("\\r", m_file);
189            break;
190        case '\t':
191            fputs("\\t", m_file);
192            break;
193        case '\v':
194            fputs("\\v", m_file);
195            break;
196        case '/':
197            if (escapeSlash)
198                fputs("\\/", m_file);
199            else
200                fputs("/", m_file);
201            break;
202        case '\"':
203            fputs("\\\"", m_file);
204            break;
205        case '\\':
206            fputs("\\\\", m_file);
207            break;
208        case '\?':
209            fputs("\?", m_file);
210            break;
211        default:
212            if (c > 0x7f)
213                fprintf(m_file, "\\u%04x", c);
214            else
215                fputc(c, m_file);
216            break;
217        }
218    }
219}
220#endif
221
222RegExp::RegExp(VM& vm, const String& patternString, RegExpFlags flags)
223    : JSCell(vm, vm.regExpStructure.get())
224    , m_state(NotCompiled)
225    , m_patternString(patternString)
226    , m_flags(flags)
227    , m_constructionError(0)
228    , m_numSubpatterns(0)
229#if ENABLE(REGEXP_TRACING)
230    , m_rtMatchOnlyTotalSubjectStringLen(0.0)
231    , m_rtMatchTotalSubjectStringLen(0.0)
232    , m_rtMatchOnlyCallCount(0)
233    , m_rtMatchOnlyFoundCount(0)
234    , m_rtMatchCallCount(0)
235    , m_rtMatchFoundCount(0)
236#endif
237{
238}
239
240void RegExp::finishCreation(VM& vm)
241{
242    Base::finishCreation(vm);
243    Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
244    if (m_constructionError)
245        m_state = ParseError;
246    else
247        m_numSubpatterns = pattern.m_numSubpatterns;
248}
249
250void RegExp::destroy(JSCell* cell)
251{
252    RegExp* thisObject = static_cast<RegExp*>(cell);
253#if REGEXP_FUNC_TEST_DATA_GEN
254    RegExpFunctionalTestCollector::get()->clearRegExp(this);
255#endif
256    thisObject->RegExp::~RegExp();
257}
258
259RegExp* RegExp::createWithoutCaching(VM& vm, const String& patternString, RegExpFlags flags)
260{
261    RegExp* regExp = new (NotNull, allocateCell<RegExp>(vm.heap)) RegExp(vm, patternString, flags);
262    regExp->finishCreation(vm);
263    return regExp;
264}
265
266RegExp* RegExp::create(VM& vm, const String& patternString, RegExpFlags flags)
267{
268    return vm.regExpCache()->lookupOrCreate(patternString, flags);
269}
270
271void RegExp::compile(VM* vm, Yarr::YarrCharSize charSize)
272{
273    Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
274    if (m_constructionError) {
275        RELEASE_ASSERT_NOT_REACHED();
276        m_state = ParseError;
277        return;
278    }
279    ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
280
281    if (!hasCode()) {
282        ASSERT(m_state == NotCompiled);
283        vm->regExpCache()->addToStrongCache(this);
284        m_state = ByteCode;
285    }
286
287#if ENABLE(YARR_JIT)
288    if (!pattern.m_containsBackreferences && !pattern.containsUnsignedLengthPattern() && vm->canUseRegExpJIT()) {
289        Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode);
290#if ENABLE(YARR_JIT_DEBUG)
291        if (!m_regExpJITCode.isFallBack())
292            m_state = JITCode;
293        else
294            m_state = ByteCode;
295#else
296        if (!m_regExpJITCode.isFallBack()) {
297            m_state = JITCode;
298            return;
299        }
300#endif
301    }
302#else
303    UNUSED_PARAM(charSize);
304#endif
305
306    m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator);
307}
308
309void RegExp::compileIfNecessary(VM& vm, Yarr::YarrCharSize charSize)
310{
311    if (hasCode()) {
312#if ENABLE(YARR_JIT)
313        if (m_state != JITCode)
314            return;
315        if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCode()))
316            return;
317        if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCode()))
318            return;
319#else
320        return;
321#endif
322    }
323
324    compile(&vm, charSize);
325}
326
327int RegExp::match(VM& vm, const String& s, unsigned startOffset, Vector<int, 32>& ovector)
328{
329#if ENABLE(REGEXP_TRACING)
330    m_rtMatchCallCount++;
331    m_rtMatchTotalSubjectStringLen += (double)(s.length() - startOffset);
332#endif
333
334    ASSERT(m_state != ParseError);
335    compileIfNecessary(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16);
336
337    int offsetVectorSize = (m_numSubpatterns + 1) * 2;
338    ovector.resize(offsetVectorSize);
339    int* offsetVector = ovector.data();
340
341    int result;
342#if ENABLE(YARR_JIT)
343    if (m_state == JITCode) {
344        if (s.is8Bit())
345            result = m_regExpJITCode.execute(s.characters8(), startOffset, s.length(), offsetVector).start;
346        else
347            result = m_regExpJITCode.execute(s.characters16(), startOffset, s.length(), offsetVector).start;
348#if ENABLE(YARR_JIT_DEBUG)
349        matchCompareWithInterpreter(s, startOffset, offsetVector, result);
350#endif
351    } else
352#endif
353        result = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector));
354
355    // FIXME: The YARR engine should handle unsigned or size_t length matches.
356    // The YARR Interpreter is "unsigned" clean, while the YARR JIT hasn't been addressed.
357    // The offset vector handling needs to change as well.
358    // Right now we convert a match where the offsets overflowed into match failure.
359    // There are two places in WebCore that call the interpreter directly that need to
360    // have their offsets changed to int as well. They are yarr/RegularExpression.cpp
361    // and inspector/ContentSearchUtilities.cpp
362    if (s.length() > INT_MAX) {
363        bool overflowed = false;
364
365        if (result < -1)
366            overflowed = true;
367
368        for (unsigned i = 0; i <= m_numSubpatterns; i++) {
369            if ((offsetVector[i*2] < -1) || ((offsetVector[i*2] >= 0) && (offsetVector[i*2+1] < -1))) {
370                overflowed = true;
371                offsetVector[i*2] = -1;
372                offsetVector[i*2+1] = -1;
373            }
374        }
375
376        if (overflowed)
377            result = -1;
378    }
379
380    ASSERT(result >= -1);
381
382#if REGEXP_FUNC_TEST_DATA_GEN
383    RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result);
384#endif
385
386#if ENABLE(REGEXP_TRACING)
387    if (result != -1)
388        m_rtMatchFoundCount++;
389#endif
390
391    return result;
392}
393
394void RegExp::compileMatchOnly(VM* vm, Yarr::YarrCharSize charSize)
395{
396    Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
397    if (m_constructionError) {
398        RELEASE_ASSERT_NOT_REACHED();
399        m_state = ParseError;
400        return;
401    }
402    ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
403
404    if (!hasCode()) {
405        ASSERT(m_state == NotCompiled);
406        vm->regExpCache()->addToStrongCache(this);
407        m_state = ByteCode;
408    }
409
410#if ENABLE(YARR_JIT)
411    if (!pattern.m_containsBackreferences && !pattern.containsUnsignedLengthPattern() && vm->canUseRegExpJIT()) {
412        Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode, Yarr::MatchOnly);
413#if ENABLE(YARR_JIT_DEBUG)
414        if (!m_regExpJITCode.isFallBack())
415            m_state = JITCode;
416        else
417            m_state = ByteCode;
418#else
419        if (!m_regExpJITCode.isFallBack()) {
420            m_state = JITCode;
421            return;
422        }
423#endif
424    }
425#else
426    UNUSED_PARAM(charSize);
427#endif
428
429    m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator);
430}
431
432void RegExp::compileIfNecessaryMatchOnly(VM& vm, Yarr::YarrCharSize charSize)
433{
434    if (hasCode()) {
435#if ENABLE(YARR_JIT)
436        if (m_state != JITCode)
437            return;
438        if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCodeMatchOnly()))
439            return;
440        if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCodeMatchOnly()))
441            return;
442#else
443        return;
444#endif
445    }
446
447    compileMatchOnly(&vm, charSize);
448}
449
450MatchResult RegExp::match(VM& vm, const String& s, unsigned startOffset)
451{
452#if ENABLE(REGEXP_TRACING)
453    m_rtMatchOnlyCallCount++;
454    m_rtMatchOnlyTotalSubjectStringLen += (double)(s.length() - startOffset);
455#endif
456
457    ASSERT(m_state != ParseError);
458    compileIfNecessaryMatchOnly(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16);
459
460#if ENABLE(YARR_JIT)
461    if (m_state == JITCode) {
462        MatchResult result = s.is8Bit() ?
463            m_regExpJITCode.execute(s.characters8(), startOffset, s.length()) :
464            m_regExpJITCode.execute(s.characters16(), startOffset, s.length());
465#if ENABLE(REGEXP_TRACING)
466        if (!result)
467            m_rtMatchOnlyFoundCount++;
468#endif
469        return result;
470    }
471#endif
472
473    int offsetVectorSize = (m_numSubpatterns + 1) * 2;
474    int* offsetVector;
475    Vector<int, 32> nonReturnedOvector;
476    nonReturnedOvector.resize(offsetVectorSize);
477    offsetVector = nonReturnedOvector.data();
478    int r = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector));
479#if REGEXP_FUNC_TEST_DATA_GEN
480    RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result);
481#endif
482
483    if (r >= 0) {
484#if ENABLE(REGEXP_TRACING)
485        m_rtMatchOnlyFoundCount++;
486#endif
487        return MatchResult(r, reinterpret_cast<unsigned*>(offsetVector)[1]);
488    }
489
490    return MatchResult::failed();
491}
492
493void RegExp::invalidateCode()
494{
495    if (!hasCode())
496        return;
497    m_state = NotCompiled;
498#if ENABLE(YARR_JIT)
499    m_regExpJITCode.clear();
500#endif
501    m_regExpBytecode.clear();
502}
503
504#if ENABLE(YARR_JIT_DEBUG)
505void RegExp::matchCompareWithInterpreter(const String& s, int startOffset, int* offsetVector, int jitResult)
506{
507    int offsetVectorSize = (m_numSubpatterns + 1) * 2;
508    Vector<int, 32> interpreterOvector;
509    interpreterOvector.resize(offsetVectorSize);
510    int* interpreterOffsetVector = interpreterOvector.data();
511    int interpreterResult = 0;
512    int differences = 0;
513
514    // Initialize interpreterOffsetVector with the return value (index 0) and the
515    // first subpattern start indicies (even index values) set to -1.
516    // No need to init the subpattern end indicies.
517    for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++)
518        interpreterOffsetVector[j] = -1;
519
520    interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, interpreterOffsetVector);
521
522    if (jitResult != interpreterResult)
523        differences++;
524
525    for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++)
526        if ((offsetVector[j] != interpreterOffsetVector[j])
527            || ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1])))
528            differences++;
529
530    if (differences) {
531        dataLogF("RegExp Discrepency for /%s/\n    string input ", pattern().utf8().data());
532        unsigned segmentLen = s.length() - static_cast<unsigned>(startOffset);
533
534        dataLogF((segmentLen < 150) ? "\"%s\"\n" : "\"%148s...\"\n", s.utf8().data() + startOffset);
535
536        if (jitResult != interpreterResult) {
537            dataLogF("    JIT result = %d, blah interpreted result = %d\n", jitResult, interpreterResult);
538            differences--;
539        } else {
540            dataLogF("    Correct result = %d\n", jitResult);
541        }
542
543        if (differences) {
544            for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) {
545                if (offsetVector[j] != interpreterOffsetVector[j])
546                    dataLogF("    JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j, offsetVector[j], j, interpreterOffsetVector[j]);
547                if ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1]))
548                    dataLogF("    JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j+1, offsetVector[j+1], j+1, interpreterOffsetVector[j+1]);
549            }
550        }
551    }
552}
553#endif
554
555#if ENABLE(REGEXP_TRACING)
556    void RegExp::printTraceData()
557    {
558        char formattedPattern[41];
559        char rawPattern[41];
560
561        strncpy(rawPattern, pattern().utf8().data(), 40);
562        rawPattern[40]= '\0';
563
564        int pattLen = strlen(rawPattern);
565
566        snprintf(formattedPattern, 41, (pattLen <= 38) ? "/%.38s/" : "/%.36s...", rawPattern);
567
568#if ENABLE(YARR_JIT)
569        Yarr::YarrCodeBlock& codeBlock = m_regExpJITCode;
570
571        const size_t jitAddrSize = 20;
572        char jit8BitMatchOnlyAddr[jitAddrSize];
573        char jit16BitMatchOnlyAddr[jitAddrSize];
574        char jit8BitMatchAddr[jitAddrSize];
575        char jit16BitMatchAddr[jitAddrSize];
576        if (m_state == ByteCode) {
577            snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "fallback    ");
578            snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "----      ");
579            snprintf(jit8BitMatchAddr, jitAddrSize, "fallback    ");
580            snprintf(jit16BitMatchAddr, jitAddrSize, "----      ");
581        } else {
582            snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get8BitMatchOnlyAddr()));
583            snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get16BitMatchOnlyAddr()));
584            snprintf(jit8BitMatchAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get8BitMatchAddr()));
585            snprintf(jit16BitMatchAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get16BitMatchAddr()));
586        }
587#else
588        const char* jit8BitMatchOnlyAddr = "JIT Off";
589        const char* jit16BitMatchOnlyAddr = "";
590        const char* jit8BitMatchAddr = "JIT Off";
591        const char* jit16BitMatchAddr = "";
592#endif
593        unsigned averageMatchOnlyStringLen = (unsigned)(m_rtMatchOnlyTotalSubjectStringLen / m_rtMatchOnlyCallCount);
594        unsigned averageMatchStringLen = (unsigned)(m_rtMatchTotalSubjectStringLen / m_rtMatchCallCount);
595
596        printf("%-40.40s %16.16s %16.16s %10d %10d %10u\n", formattedPattern, jit8BitMatchOnlyAddr, jit16BitMatchOnlyAddr, m_rtMatchOnlyCallCount, m_rtMatchOnlyFoundCount, averageMatchOnlyStringLen);
597        printf("                                         %16.16s %16.16s %10d %10d %10u\n", jit8BitMatchAddr, jit16BitMatchAddr, m_rtMatchCallCount, m_rtMatchFoundCount, averageMatchStringLen);
598    }
599#endif
600
601} // namespace JSC
602