1/* 2 * Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org) 3 * Copyright (c) 2007, 2008 Apple Inc. All rights reserved. 4 * Copyright (C) 2009 Torch Mobile, Inc. 5 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged 6 * 7 * This library is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2 of the License, or (at your option) any later version. 11 * 12 * This library is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with this library; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 * 21 */ 22 23#include "config.h" 24#include "RegExp.h" 25 26#include "Lexer.h" 27#include "Operations.h" 28#include "RegExpCache.h" 29#include "Yarr.h" 30#include "YarrJIT.h" 31#include <stdio.h> 32#include <stdlib.h> 33#include <string.h> 34#include <wtf/Assertions.h> 35#include <wtf/OwnArrayPtr.h> 36 37#define REGEXP_FUNC_TEST_DATA_GEN 0 38 39namespace JSC { 40 41const ClassInfo RegExp::s_info = { "RegExp", 0, 0, 0, CREATE_METHOD_TABLE(RegExp) }; 42 43RegExpFlags regExpFlags(const String& string) 44{ 45 RegExpFlags flags = NoFlags; 46 47 for (unsigned i = 0; i < string.length(); ++i) { 48 switch (string[i]) { 49 case 'g': 50 if (flags & FlagGlobal) 51 return InvalidFlags; 52 flags = static_cast<RegExpFlags>(flags | FlagGlobal); 53 break; 54 55 case 'i': 56 if (flags & FlagIgnoreCase) 57 return InvalidFlags; 58 flags = static_cast<RegExpFlags>(flags | FlagIgnoreCase); 59 break; 60 61 case 'm': 62 if (flags & FlagMultiline) 63 return InvalidFlags; 64 flags = static_cast<RegExpFlags>(flags | FlagMultiline); 65 break; 66 67 default: 68 return InvalidFlags; 69 } 70 } 71 72 return flags; 73} 74 75#if REGEXP_FUNC_TEST_DATA_GEN 76class RegExpFunctionalTestCollector { 77 // This class is not thread safe. 78protected: 79 static const char* const s_fileName; 80 81public: 82 static RegExpFunctionalTestCollector* get(); 83 84 ~RegExpFunctionalTestCollector(); 85 86 void outputOneTest(RegExp*, String, int, int*, int); 87 void clearRegExp(RegExp* regExp) 88 { 89 if (regExp == m_lastRegExp) 90 m_lastRegExp = 0; 91 } 92 93private: 94 RegExpFunctionalTestCollector(); 95 96 void outputEscapedString(const String&, bool escapeSlash = false); 97 98 static RegExpFunctionalTestCollector* s_instance; 99 FILE* m_file; 100 RegExp* m_lastRegExp; 101}; 102 103const char* const RegExpFunctionalTestCollector::s_fileName = "/tmp/RegExpTestsData"; 104RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::s_instance = 0; 105 106RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::get() 107{ 108 if (!s_instance) 109 s_instance = new RegExpFunctionalTestCollector(); 110 111 return s_instance; 112} 113 114void RegExpFunctionalTestCollector::outputOneTest(RegExp* regExp, String s, int startOffset, int* ovector, int result) 115{ 116 if ((!m_lastRegExp) || (m_lastRegExp != regExp)) { 117 m_lastRegExp = regExp; 118 fputc('/', m_file); 119 outputEscapedString(regExp->pattern(), true); 120 fputc('/', m_file); 121 if (regExp->global()) 122 fputc('g', m_file); 123 if (regExp->ignoreCase()) 124 fputc('i', m_file); 125 if (regExp->multiline()) 126 fputc('m', m_file); 127 fprintf(m_file, "\n"); 128 } 129 130 fprintf(m_file, " \""); 131 outputEscapedString(s); 132 fprintf(m_file, "\", %d, %d, (", startOffset, result); 133 for (unsigned i = 0; i <= regExp->numSubpatterns(); i++) { 134 int subpatternBegin = ovector[i * 2]; 135 int subpatternEnd = ovector[i * 2 + 1]; 136 if (subpatternBegin == -1) 137 subpatternEnd = -1; 138 fprintf(m_file, "%d, %d", subpatternBegin, subpatternEnd); 139 if (i < regExp->numSubpatterns()) 140 fputs(", ", m_file); 141 } 142 143 fprintf(m_file, ")\n"); 144 fflush(m_file); 145} 146 147RegExpFunctionalTestCollector::RegExpFunctionalTestCollector() 148{ 149 m_file = fopen(s_fileName, "r+"); 150 if (!m_file) 151 m_file = fopen(s_fileName, "w+"); 152 153 fseek(m_file, 0L, SEEK_END); 154} 155 156RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector() 157{ 158 fclose(m_file); 159 s_instance = 0; 160} 161 162void RegExpFunctionalTestCollector::outputEscapedString(const String& s, bool escapeSlash) 163{ 164 int len = s.length(); 165 166 for (int i = 0; i < len; ++i) { 167 UChar c = s[i]; 168 169 switch (c) { 170 case '\0': 171 fputs("\\0", m_file); 172 break; 173 case '\a': 174 fputs("\\a", m_file); 175 break; 176 case '\b': 177 fputs("\\b", m_file); 178 break; 179 case '\f': 180 fputs("\\f", m_file); 181 break; 182 case '\n': 183 fputs("\\n", m_file); 184 break; 185 case '\r': 186 fputs("\\r", m_file); 187 break; 188 case '\t': 189 fputs("\\t", m_file); 190 break; 191 case '\v': 192 fputs("\\v", m_file); 193 break; 194 case '/': 195 if (escapeSlash) 196 fputs("\\/", m_file); 197 else 198 fputs("/", m_file); 199 break; 200 case '\"': 201 fputs("\\\"", m_file); 202 break; 203 case '\\': 204 fputs("\\\\", m_file); 205 break; 206 case '\?': 207 fputs("\?", m_file); 208 break; 209 default: 210 if (c > 0x7f) 211 fprintf(m_file, "\\u%04x", c); 212 else 213 fputc(c, m_file); 214 break; 215 } 216 } 217} 218#endif 219 220RegExp::RegExp(VM& vm, const String& patternString, RegExpFlags flags) 221 : JSCell(vm, vm.regExpStructure.get()) 222 , m_state(NotCompiled) 223 , m_patternString(patternString) 224 , m_flags(flags) 225 , m_constructionError(0) 226 , m_numSubpatterns(0) 227#if ENABLE(REGEXP_TRACING) 228 , m_rtMatchCallCount(0) 229 , m_rtMatchFoundCount(0) 230#endif 231{ 232} 233 234void RegExp::finishCreation(VM& vm) 235{ 236 Base::finishCreation(vm); 237 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError); 238 if (m_constructionError) 239 m_state = ParseError; 240 else 241 m_numSubpatterns = pattern.m_numSubpatterns; 242} 243 244void RegExp::destroy(JSCell* cell) 245{ 246 RegExp* thisObject = static_cast<RegExp*>(cell); 247#if REGEXP_FUNC_TEST_DATA_GEN 248 RegExpFunctionalTestCollector::get()->clearRegExp(this); 249#endif 250 thisObject->RegExp::~RegExp(); 251} 252 253RegExp* RegExp::createWithoutCaching(VM& vm, const String& patternString, RegExpFlags flags) 254{ 255 RegExp* regExp = new (NotNull, allocateCell<RegExp>(vm.heap)) RegExp(vm, patternString, flags); 256 regExp->finishCreation(vm); 257 return regExp; 258} 259 260RegExp* RegExp::create(VM& vm, const String& patternString, RegExpFlags flags) 261{ 262 return vm.regExpCache()->lookupOrCreate(patternString, flags); 263} 264 265void RegExp::compile(VM* vm, Yarr::YarrCharSize charSize) 266{ 267 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError); 268 if (m_constructionError) { 269 RELEASE_ASSERT_NOT_REACHED(); 270 m_state = ParseError; 271 return; 272 } 273 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns); 274 275 if (!hasCode()) { 276 ASSERT(m_state == NotCompiled); 277 vm->regExpCache()->addToStrongCache(this); 278 m_state = ByteCode; 279 } 280 281#if ENABLE(YARR_JIT) 282 if (!pattern.m_containsBackreferences && vm->canUseRegExpJIT()) { 283 Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode); 284#if ENABLE(YARR_JIT_DEBUG) 285 if (!m_regExpJITCode.isFallBack()) 286 m_state = JITCode; 287 else 288 m_state = ByteCode; 289#else 290 if (!m_regExpJITCode.isFallBack()) { 291 m_state = JITCode; 292 return; 293 } 294#endif 295 } 296#else 297 UNUSED_PARAM(charSize); 298#endif 299 300 m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator); 301} 302 303void RegExp::compileIfNecessary(VM& vm, Yarr::YarrCharSize charSize) 304{ 305 if (hasCode()) { 306#if ENABLE(YARR_JIT) 307 if (m_state != JITCode) 308 return; 309 if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCode())) 310 return; 311 if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCode())) 312 return; 313#else 314 return; 315#endif 316 } 317 318 compile(&vm, charSize); 319} 320 321int RegExp::match(VM& vm, const String& s, unsigned startOffset, Vector<int, 32>& ovector) 322{ 323#if ENABLE(REGEXP_TRACING) 324 m_rtMatchCallCount++; 325#endif 326 327 ASSERT(m_state != ParseError); 328 compileIfNecessary(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16); 329 330 int offsetVectorSize = (m_numSubpatterns + 1) * 2; 331 ovector.resize(offsetVectorSize); 332 int* offsetVector = ovector.data(); 333 334 int result; 335#if ENABLE(YARR_JIT) 336 if (m_state == JITCode) { 337 if (s.is8Bit()) 338 result = m_regExpJITCode.execute(s.characters8(), startOffset, s.length(), offsetVector).start; 339 else 340 result = m_regExpJITCode.execute(s.characters16(), startOffset, s.length(), offsetVector).start; 341#if ENABLE(YARR_JIT_DEBUG) 342 matchCompareWithInterpreter(s, startOffset, offsetVector, result); 343#endif 344 } else 345#endif 346 result = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector)); 347 348 // FIXME: The YARR engine should handle unsigned or size_t length matches. 349 // The YARR Interpreter is "unsigned" clean, while the YARR JIT hasn't been addressed. 350 // The offset vector handling needs to change as well. 351 // Right now we convert a match where the offsets overflowed into match failure. 352 // There are two places in WebCore that call the interpreter directly that need to 353 // have their offsets changed to int as well. They are platform/text/RegularExpression.cpp 354 // and inspector/ContentSearchUtils.cpp. 355 if (s.length() > INT_MAX) { 356 bool overflowed = false; 357 358 if (result < -1) 359 overflowed = true; 360 361 for (unsigned i = 0; i <= m_numSubpatterns; i++) { 362 if ((offsetVector[i*2] < -1) || ((offsetVector[i*2] >= 0) && (offsetVector[i*2+1] < -1))) { 363 overflowed = true; 364 offsetVector[i*2] = -1; 365 offsetVector[i*2+1] = -1; 366 } 367 } 368 369 if (overflowed) 370 result = -1; 371 } 372 373 ASSERT(result >= -1); 374 375#if REGEXP_FUNC_TEST_DATA_GEN 376 RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result); 377#endif 378 379#if ENABLE(REGEXP_TRACING) 380 if (result != -1) 381 m_rtMatchFoundCount++; 382#endif 383 384 return result; 385} 386 387void RegExp::compileMatchOnly(VM* vm, Yarr::YarrCharSize charSize) 388{ 389 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError); 390 if (m_constructionError) { 391 RELEASE_ASSERT_NOT_REACHED(); 392 m_state = ParseError; 393 return; 394 } 395 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns); 396 397 if (!hasCode()) { 398 ASSERT(m_state == NotCompiled); 399 vm->regExpCache()->addToStrongCache(this); 400 m_state = ByteCode; 401 } 402 403#if ENABLE(YARR_JIT) 404 if (!pattern.m_containsBackreferences && vm->canUseRegExpJIT()) { 405 Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode, Yarr::MatchOnly); 406#if ENABLE(YARR_JIT_DEBUG) 407 if (!m_regExpJITCode.isFallBack()) 408 m_state = JITCode; 409 else 410 m_state = ByteCode; 411#else 412 if (!m_regExpJITCode.isFallBack()) { 413 m_state = JITCode; 414 return; 415 } 416#endif 417 } 418#else 419 UNUSED_PARAM(charSize); 420#endif 421 422 m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator); 423} 424 425void RegExp::compileIfNecessaryMatchOnly(VM& vm, Yarr::YarrCharSize charSize) 426{ 427 if (hasCode()) { 428#if ENABLE(YARR_JIT) 429 if (m_state != JITCode) 430 return; 431 if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCodeMatchOnly())) 432 return; 433 if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCodeMatchOnly())) 434 return; 435#else 436 return; 437#endif 438 } 439 440 compileMatchOnly(&vm, charSize); 441} 442 443MatchResult RegExp::match(VM& vm, const String& s, unsigned startOffset) 444{ 445#if ENABLE(REGEXP_TRACING) 446 m_rtMatchCallCount++; 447#endif 448 449 ASSERT(m_state != ParseError); 450 compileIfNecessaryMatchOnly(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16); 451 452#if ENABLE(YARR_JIT) 453 if (m_state == JITCode) { 454 MatchResult result = s.is8Bit() ? 455 m_regExpJITCode.execute(s.characters8(), startOffset, s.length()) : 456 m_regExpJITCode.execute(s.characters16(), startOffset, s.length()); 457#if ENABLE(REGEXP_TRACING) 458 if (!result) 459 m_rtMatchFoundCount++; 460#endif 461 return result; 462 } 463#endif 464 465 int offsetVectorSize = (m_numSubpatterns + 1) * 2; 466 int* offsetVector; 467 Vector<int, 32> nonReturnedOvector; 468 nonReturnedOvector.resize(offsetVectorSize); 469 offsetVector = nonReturnedOvector.data(); 470 int r = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector)); 471#if REGEXP_FUNC_TEST_DATA_GEN 472 RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result); 473#endif 474 475 if (r >= 0) { 476#if ENABLE(REGEXP_TRACING) 477 m_rtMatchFoundCount++; 478#endif 479 return MatchResult(r, reinterpret_cast<unsigned*>(offsetVector)[1]); 480 } 481 482 return MatchResult::failed(); 483} 484 485void RegExp::invalidateCode() 486{ 487 if (!hasCode()) 488 return; 489 m_state = NotCompiled; 490#if ENABLE(YARR_JIT) 491 m_regExpJITCode.clear(); 492#endif 493 m_regExpBytecode.clear(); 494} 495 496#if ENABLE(YARR_JIT_DEBUG) 497void RegExp::matchCompareWithInterpreter(const String& s, int startOffset, int* offsetVector, int jitResult) 498{ 499 int offsetVectorSize = (m_numSubpatterns + 1) * 2; 500 Vector<int, 32> interpreterOvector; 501 interpreterOvector.resize(offsetVectorSize); 502 int* interpreterOffsetVector = interpreterOvector.data(); 503 int interpreterResult = 0; 504 int differences = 0; 505 506 // Initialize interpreterOffsetVector with the return value (index 0) and the 507 // first subpattern start indicies (even index values) set to -1. 508 // No need to init the subpattern end indicies. 509 for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++) 510 interpreterOffsetVector[j] = -1; 511 512 interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, interpreterOffsetVector); 513 514 if (jitResult != interpreterResult) 515 differences++; 516 517 for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) 518 if ((offsetVector[j] != interpreterOffsetVector[j]) 519 || ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1]))) 520 differences++; 521 522 if (differences) { 523 dataLogF("RegExp Discrepency for /%s/\n string input ", pattern().utf8().data()); 524 unsigned segmentLen = s.length() - static_cast<unsigned>(startOffset); 525 526 dataLogF((segmentLen < 150) ? "\"%s\"\n" : "\"%148s...\"\n", s.utf8().data() + startOffset); 527 528 if (jitResult != interpreterResult) { 529 dataLogF(" JIT result = %d, blah interpreted result = %d\n", jitResult, interpreterResult); 530 differences--; 531 } else { 532 dataLogF(" Correct result = %d\n", jitResult); 533 } 534 535 if (differences) { 536 for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) { 537 if (offsetVector[j] != interpreterOffsetVector[j]) 538 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j, offsetVector[j], j, interpreterOffsetVector[j]); 539 if ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1])) 540 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j+1, offsetVector[j+1], j+1, interpreterOffsetVector[j+1]); 541 } 542 } 543 } 544} 545#endif 546 547#if ENABLE(REGEXP_TRACING) 548 void RegExp::printTraceData() 549 { 550 char formattedPattern[41]; 551 char rawPattern[41]; 552 553 strncpy(rawPattern, pattern().utf8().data(), 40); 554 rawPattern[40]= '\0'; 555 556 int pattLen = strlen(rawPattern); 557 558 snprintf(formattedPattern, 41, (pattLen <= 38) ? "/%.38s/" : "/%.36s...", rawPattern); 559 560#if ENABLE(YARR_JIT) 561 Yarr::YarrCodeBlock& codeBlock = m_regExpJITCode; 562 563 const size_t jitAddrSize = 20; 564 char jitAddr[jitAddrSize]; 565 if (m_state == JITCode) 566 snprintf(jitAddr, jitAddrSize, "fallback"); 567 else 568 snprintf(jitAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.getAddr())); 569#else 570 const char* jitAddr = "JIT Off"; 571#endif 572 573 printf("%-40.40s %16.16s %10d %10d\n", formattedPattern, jitAddr, m_rtMatchCallCount, m_rtMatchFoundCount); 574 } 575#endif 576 577} // namespace JSC 578