1/* 2 * Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org) 3 * Copyright (c) 2007, 2008 Apple Inc. All rights reserved. 4 * Copyright (C) 2009 Torch Mobile, Inc. 5 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged 6 * 7 * This library is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2 of the License, or (at your option) any later version. 11 * 12 * This library is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with this library; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 * 21 */ 22 23#include "config.h" 24#include "RegExp.h" 25 26#include "Lexer.h" 27#include "JSCInlines.h" 28#include "RegExpCache.h" 29#include "Yarr.h" 30#include "YarrJIT.h" 31#include <wtf/Assertions.h> 32 33#define REGEXP_FUNC_TEST_DATA_GEN 0 34 35#if REGEXP_FUNC_TEST_DATA_GEN 36#include <stdio.h> 37#include <stdlib.h> 38#include <string.h> 39#endif 40 41namespace JSC { 42 43const ClassInfo RegExp::s_info = { "RegExp", 0, 0, 0, CREATE_METHOD_TABLE(RegExp) }; 44 45RegExpFlags regExpFlags(const String& string) 46{ 47 RegExpFlags flags = NoFlags; 48 49 for (unsigned i = 0; i < string.length(); ++i) { 50 switch (string[i]) { 51 case 'g': 52 if (flags & FlagGlobal) 53 return InvalidFlags; 54 flags = static_cast<RegExpFlags>(flags | FlagGlobal); 55 break; 56 57 case 'i': 58 if (flags & FlagIgnoreCase) 59 return InvalidFlags; 60 flags = static_cast<RegExpFlags>(flags | FlagIgnoreCase); 61 break; 62 63 case 'm': 64 if (flags & FlagMultiline) 65 return InvalidFlags; 66 flags = static_cast<RegExpFlags>(flags | FlagMultiline); 67 break; 68 69 default: 70 return InvalidFlags; 71 } 72 } 73 74 return flags; 75} 76 77#if REGEXP_FUNC_TEST_DATA_GEN 78class RegExpFunctionalTestCollector { 79 // This class is not thread safe. 80protected: 81 static const char* const s_fileName; 82 83public: 84 static RegExpFunctionalTestCollector* get(); 85 86 ~RegExpFunctionalTestCollector(); 87 88 void outputOneTest(RegExp*, String, int, int*, int); 89 void clearRegExp(RegExp* regExp) 90 { 91 if (regExp == m_lastRegExp) 92 m_lastRegExp = 0; 93 } 94 95private: 96 RegExpFunctionalTestCollector(); 97 98 void outputEscapedString(const String&, bool escapeSlash = false); 99 100 static RegExpFunctionalTestCollector* s_instance; 101 FILE* m_file; 102 RegExp* m_lastRegExp; 103}; 104 105const char* const RegExpFunctionalTestCollector::s_fileName = "/tmp/RegExpTestsData"; 106RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::s_instance = 0; 107 108RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::get() 109{ 110 if (!s_instance) 111 s_instance = new RegExpFunctionalTestCollector(); 112 113 return s_instance; 114} 115 116void RegExpFunctionalTestCollector::outputOneTest(RegExp* regExp, String s, int startOffset, int* ovector, int result) 117{ 118 if ((!m_lastRegExp) || (m_lastRegExp != regExp)) { 119 m_lastRegExp = regExp; 120 fputc('/', m_file); 121 outputEscapedString(regExp->pattern(), true); 122 fputc('/', m_file); 123 if (regExp->global()) 124 fputc('g', m_file); 125 if (regExp->ignoreCase()) 126 fputc('i', m_file); 127 if (regExp->multiline()) 128 fputc('m', m_file); 129 fprintf(m_file, "\n"); 130 } 131 132 fprintf(m_file, " \""); 133 outputEscapedString(s); 134 fprintf(m_file, "\", %d, %d, (", startOffset, result); 135 for (unsigned i = 0; i <= regExp->numSubpatterns(); i++) { 136 int subpatternBegin = ovector[i * 2]; 137 int subpatternEnd = ovector[i * 2 + 1]; 138 if (subpatternBegin == -1) 139 subpatternEnd = -1; 140 fprintf(m_file, "%d, %d", subpatternBegin, subpatternEnd); 141 if (i < regExp->numSubpatterns()) 142 fputs(", ", m_file); 143 } 144 145 fprintf(m_file, ")\n"); 146 fflush(m_file); 147} 148 149RegExpFunctionalTestCollector::RegExpFunctionalTestCollector() 150{ 151 m_file = fopen(s_fileName, "r+"); 152 if (!m_file) 153 m_file = fopen(s_fileName, "w+"); 154 155 fseek(m_file, 0L, SEEK_END); 156} 157 158RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector() 159{ 160 fclose(m_file); 161 s_instance = 0; 162} 163 164void RegExpFunctionalTestCollector::outputEscapedString(const String& s, bool escapeSlash) 165{ 166 int len = s.length(); 167 168 for (int i = 0; i < len; ++i) { 169 UChar c = s[i]; 170 171 switch (c) { 172 case '\0': 173 fputs("\\0", m_file); 174 break; 175 case '\a': 176 fputs("\\a", m_file); 177 break; 178 case '\b': 179 fputs("\\b", m_file); 180 break; 181 case '\f': 182 fputs("\\f", m_file); 183 break; 184 case '\n': 185 fputs("\\n", m_file); 186 break; 187 case '\r': 188 fputs("\\r", m_file); 189 break; 190 case '\t': 191 fputs("\\t", m_file); 192 break; 193 case '\v': 194 fputs("\\v", m_file); 195 break; 196 case '/': 197 if (escapeSlash) 198 fputs("\\/", m_file); 199 else 200 fputs("/", m_file); 201 break; 202 case '\"': 203 fputs("\\\"", m_file); 204 break; 205 case '\\': 206 fputs("\\\\", m_file); 207 break; 208 case '\?': 209 fputs("\?", m_file); 210 break; 211 default: 212 if (c > 0x7f) 213 fprintf(m_file, "\\u%04x", c); 214 else 215 fputc(c, m_file); 216 break; 217 } 218 } 219} 220#endif 221 222RegExp::RegExp(VM& vm, const String& patternString, RegExpFlags flags) 223 : JSCell(vm, vm.regExpStructure.get()) 224 , m_state(NotCompiled) 225 , m_patternString(patternString) 226 , m_flags(flags) 227 , m_constructionError(0) 228 , m_numSubpatterns(0) 229#if ENABLE(REGEXP_TRACING) 230 , m_rtMatchOnlyTotalSubjectStringLen(0.0) 231 , m_rtMatchTotalSubjectStringLen(0.0) 232 , m_rtMatchOnlyCallCount(0) 233 , m_rtMatchOnlyFoundCount(0) 234 , m_rtMatchCallCount(0) 235 , m_rtMatchFoundCount(0) 236#endif 237{ 238} 239 240void RegExp::finishCreation(VM& vm) 241{ 242 Base::finishCreation(vm); 243 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError); 244 if (m_constructionError) 245 m_state = ParseError; 246 else 247 m_numSubpatterns = pattern.m_numSubpatterns; 248} 249 250void RegExp::destroy(JSCell* cell) 251{ 252 RegExp* thisObject = static_cast<RegExp*>(cell); 253#if REGEXP_FUNC_TEST_DATA_GEN 254 RegExpFunctionalTestCollector::get()->clearRegExp(this); 255#endif 256 thisObject->RegExp::~RegExp(); 257} 258 259RegExp* RegExp::createWithoutCaching(VM& vm, const String& patternString, RegExpFlags flags) 260{ 261 RegExp* regExp = new (NotNull, allocateCell<RegExp>(vm.heap)) RegExp(vm, patternString, flags); 262 regExp->finishCreation(vm); 263 return regExp; 264} 265 266RegExp* RegExp::create(VM& vm, const String& patternString, RegExpFlags flags) 267{ 268 return vm.regExpCache()->lookupOrCreate(patternString, flags); 269} 270 271void RegExp::compile(VM* vm, Yarr::YarrCharSize charSize) 272{ 273 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError); 274 if (m_constructionError) { 275 RELEASE_ASSERT_NOT_REACHED(); 276 m_state = ParseError; 277 return; 278 } 279 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns); 280 281 if (!hasCode()) { 282 ASSERT(m_state == NotCompiled); 283 vm->regExpCache()->addToStrongCache(this); 284 m_state = ByteCode; 285 } 286 287#if ENABLE(YARR_JIT) 288 if (!pattern.m_containsBackreferences && !pattern.containsUnsignedLengthPattern() && vm->canUseRegExpJIT()) { 289 Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode); 290#if ENABLE(YARR_JIT_DEBUG) 291 if (!m_regExpJITCode.isFallBack()) 292 m_state = JITCode; 293 else 294 m_state = ByteCode; 295#else 296 if (!m_regExpJITCode.isFallBack()) { 297 m_state = JITCode; 298 return; 299 } 300#endif 301 } 302#else 303 UNUSED_PARAM(charSize); 304#endif 305 306 m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator); 307} 308 309void RegExp::compileIfNecessary(VM& vm, Yarr::YarrCharSize charSize) 310{ 311 if (hasCode()) { 312#if ENABLE(YARR_JIT) 313 if (m_state != JITCode) 314 return; 315 if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCode())) 316 return; 317 if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCode())) 318 return; 319#else 320 return; 321#endif 322 } 323 324 compile(&vm, charSize); 325} 326 327int RegExp::match(VM& vm, const String& s, unsigned startOffset, Vector<int, 32>& ovector) 328{ 329#if ENABLE(REGEXP_TRACING) 330 m_rtMatchCallCount++; 331 m_rtMatchTotalSubjectStringLen += (double)(s.length() - startOffset); 332#endif 333 334 ASSERT(m_state != ParseError); 335 compileIfNecessary(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16); 336 337 int offsetVectorSize = (m_numSubpatterns + 1) * 2; 338 ovector.resize(offsetVectorSize); 339 int* offsetVector = ovector.data(); 340 341 int result; 342#if ENABLE(YARR_JIT) 343 if (m_state == JITCode) { 344 if (s.is8Bit()) 345 result = m_regExpJITCode.execute(s.characters8(), startOffset, s.length(), offsetVector).start; 346 else 347 result = m_regExpJITCode.execute(s.characters16(), startOffset, s.length(), offsetVector).start; 348#if ENABLE(YARR_JIT_DEBUG) 349 matchCompareWithInterpreter(s, startOffset, offsetVector, result); 350#endif 351 } else 352#endif 353 result = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector)); 354 355 // FIXME: The YARR engine should handle unsigned or size_t length matches. 356 // The YARR Interpreter is "unsigned" clean, while the YARR JIT hasn't been addressed. 357 // The offset vector handling needs to change as well. 358 // Right now we convert a match where the offsets overflowed into match failure. 359 // There are two places in WebCore that call the interpreter directly that need to 360 // have their offsets changed to int as well. They are yarr/RegularExpression.cpp 361 // and inspector/ContentSearchUtilities.cpp 362 if (s.length() > INT_MAX) { 363 bool overflowed = false; 364 365 if (result < -1) 366 overflowed = true; 367 368 for (unsigned i = 0; i <= m_numSubpatterns; i++) { 369 if ((offsetVector[i*2] < -1) || ((offsetVector[i*2] >= 0) && (offsetVector[i*2+1] < -1))) { 370 overflowed = true; 371 offsetVector[i*2] = -1; 372 offsetVector[i*2+1] = -1; 373 } 374 } 375 376 if (overflowed) 377 result = -1; 378 } 379 380 ASSERT(result >= -1); 381 382#if REGEXP_FUNC_TEST_DATA_GEN 383 RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result); 384#endif 385 386#if ENABLE(REGEXP_TRACING) 387 if (result != -1) 388 m_rtMatchFoundCount++; 389#endif 390 391 return result; 392} 393 394void RegExp::compileMatchOnly(VM* vm, Yarr::YarrCharSize charSize) 395{ 396 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError); 397 if (m_constructionError) { 398 RELEASE_ASSERT_NOT_REACHED(); 399 m_state = ParseError; 400 return; 401 } 402 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns); 403 404 if (!hasCode()) { 405 ASSERT(m_state == NotCompiled); 406 vm->regExpCache()->addToStrongCache(this); 407 m_state = ByteCode; 408 } 409 410#if ENABLE(YARR_JIT) 411 if (!pattern.m_containsBackreferences && !pattern.containsUnsignedLengthPattern() && vm->canUseRegExpJIT()) { 412 Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode, Yarr::MatchOnly); 413#if ENABLE(YARR_JIT_DEBUG) 414 if (!m_regExpJITCode.isFallBack()) 415 m_state = JITCode; 416 else 417 m_state = ByteCode; 418#else 419 if (!m_regExpJITCode.isFallBack()) { 420 m_state = JITCode; 421 return; 422 } 423#endif 424 } 425#else 426 UNUSED_PARAM(charSize); 427#endif 428 429 m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator); 430} 431 432void RegExp::compileIfNecessaryMatchOnly(VM& vm, Yarr::YarrCharSize charSize) 433{ 434 if (hasCode()) { 435#if ENABLE(YARR_JIT) 436 if (m_state != JITCode) 437 return; 438 if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCodeMatchOnly())) 439 return; 440 if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCodeMatchOnly())) 441 return; 442#else 443 return; 444#endif 445 } 446 447 compileMatchOnly(&vm, charSize); 448} 449 450MatchResult RegExp::match(VM& vm, const String& s, unsigned startOffset) 451{ 452#if ENABLE(REGEXP_TRACING) 453 m_rtMatchOnlyCallCount++; 454 m_rtMatchOnlyTotalSubjectStringLen += (double)(s.length() - startOffset); 455#endif 456 457 ASSERT(m_state != ParseError); 458 compileIfNecessaryMatchOnly(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16); 459 460#if ENABLE(YARR_JIT) 461 if (m_state == JITCode) { 462 MatchResult result = s.is8Bit() ? 463 m_regExpJITCode.execute(s.characters8(), startOffset, s.length()) : 464 m_regExpJITCode.execute(s.characters16(), startOffset, s.length()); 465#if ENABLE(REGEXP_TRACING) 466 if (!result) 467 m_rtMatchOnlyFoundCount++; 468#endif 469 return result; 470 } 471#endif 472 473 int offsetVectorSize = (m_numSubpatterns + 1) * 2; 474 int* offsetVector; 475 Vector<int, 32> nonReturnedOvector; 476 nonReturnedOvector.resize(offsetVectorSize); 477 offsetVector = nonReturnedOvector.data(); 478 int r = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector)); 479#if REGEXP_FUNC_TEST_DATA_GEN 480 RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result); 481#endif 482 483 if (r >= 0) { 484#if ENABLE(REGEXP_TRACING) 485 m_rtMatchOnlyFoundCount++; 486#endif 487 return MatchResult(r, reinterpret_cast<unsigned*>(offsetVector)[1]); 488 } 489 490 return MatchResult::failed(); 491} 492 493void RegExp::invalidateCode() 494{ 495 if (!hasCode()) 496 return; 497 m_state = NotCompiled; 498#if ENABLE(YARR_JIT) 499 m_regExpJITCode.clear(); 500#endif 501 m_regExpBytecode.clear(); 502} 503 504#if ENABLE(YARR_JIT_DEBUG) 505void RegExp::matchCompareWithInterpreter(const String& s, int startOffset, int* offsetVector, int jitResult) 506{ 507 int offsetVectorSize = (m_numSubpatterns + 1) * 2; 508 Vector<int, 32> interpreterOvector; 509 interpreterOvector.resize(offsetVectorSize); 510 int* interpreterOffsetVector = interpreterOvector.data(); 511 int interpreterResult = 0; 512 int differences = 0; 513 514 // Initialize interpreterOffsetVector with the return value (index 0) and the 515 // first subpattern start indicies (even index values) set to -1. 516 // No need to init the subpattern end indicies. 517 for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++) 518 interpreterOffsetVector[j] = -1; 519 520 interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, interpreterOffsetVector); 521 522 if (jitResult != interpreterResult) 523 differences++; 524 525 for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) 526 if ((offsetVector[j] != interpreterOffsetVector[j]) 527 || ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1]))) 528 differences++; 529 530 if (differences) { 531 dataLogF("RegExp Discrepency for /%s/\n string input ", pattern().utf8().data()); 532 unsigned segmentLen = s.length() - static_cast<unsigned>(startOffset); 533 534 dataLogF((segmentLen < 150) ? "\"%s\"\n" : "\"%148s...\"\n", s.utf8().data() + startOffset); 535 536 if (jitResult != interpreterResult) { 537 dataLogF(" JIT result = %d, blah interpreted result = %d\n", jitResult, interpreterResult); 538 differences--; 539 } else { 540 dataLogF(" Correct result = %d\n", jitResult); 541 } 542 543 if (differences) { 544 for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) { 545 if (offsetVector[j] != interpreterOffsetVector[j]) 546 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j, offsetVector[j], j, interpreterOffsetVector[j]); 547 if ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1])) 548 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j+1, offsetVector[j+1], j+1, interpreterOffsetVector[j+1]); 549 } 550 } 551 } 552} 553#endif 554 555#if ENABLE(REGEXP_TRACING) 556 void RegExp::printTraceData() 557 { 558 char formattedPattern[41]; 559 char rawPattern[41]; 560 561 strncpy(rawPattern, pattern().utf8().data(), 40); 562 rawPattern[40]= '\0'; 563 564 int pattLen = strlen(rawPattern); 565 566 snprintf(formattedPattern, 41, (pattLen <= 38) ? "/%.38s/" : "/%.36s...", rawPattern); 567 568#if ENABLE(YARR_JIT) 569 Yarr::YarrCodeBlock& codeBlock = m_regExpJITCode; 570 571 const size_t jitAddrSize = 20; 572 char jit8BitMatchOnlyAddr[jitAddrSize]; 573 char jit16BitMatchOnlyAddr[jitAddrSize]; 574 char jit8BitMatchAddr[jitAddrSize]; 575 char jit16BitMatchAddr[jitAddrSize]; 576 if (m_state == ByteCode) { 577 snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "fallback "); 578 snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "---- "); 579 snprintf(jit8BitMatchAddr, jitAddrSize, "fallback "); 580 snprintf(jit16BitMatchAddr, jitAddrSize, "---- "); 581 } else { 582 snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get8BitMatchOnlyAddr())); 583 snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get16BitMatchOnlyAddr())); 584 snprintf(jit8BitMatchAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get8BitMatchAddr())); 585 snprintf(jit16BitMatchAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get16BitMatchAddr())); 586 } 587#else 588 const char* jit8BitMatchOnlyAddr = "JIT Off"; 589 const char* jit16BitMatchOnlyAddr = ""; 590 const char* jit8BitMatchAddr = "JIT Off"; 591 const char* jit16BitMatchAddr = ""; 592#endif 593 unsigned averageMatchOnlyStringLen = (unsigned)(m_rtMatchOnlyTotalSubjectStringLen / m_rtMatchOnlyCallCount); 594 unsigned averageMatchStringLen = (unsigned)(m_rtMatchTotalSubjectStringLen / m_rtMatchCallCount); 595 596 printf("%-40.40s %16.16s %16.16s %10d %10d %10u\n", formattedPattern, jit8BitMatchOnlyAddr, jit16BitMatchOnlyAddr, m_rtMatchOnlyCallCount, m_rtMatchOnlyFoundCount, averageMatchOnlyStringLen); 597 printf(" %16.16s %16.16s %10d %10d %10u\n", jit8BitMatchAddr, jit16BitMatchAddr, m_rtMatchCallCount, m_rtMatchFoundCount, averageMatchStringLen); 598 } 599#endif 600 601} // namespace JSC 602