1/* 2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org) 3 * Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All rights reserved. 4 * Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu) 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Library General Public 8 * License as published by the Free Software Foundation; either 9 * version 2 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Library General Public License for more details. 15 * 16 * You should have received a copy of the GNU Library General Public License 17 * along with this library; see the file COPYING.LIB. If not, write to 18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 19 * Boston, MA 02110-1301, USA. 20 * 21 */ 22 23#ifndef Lexer_h 24#define Lexer_h 25 26#include "Lookup.h" 27#include "ParserArena.h" 28#include "ParserTokens.h" 29#include "SourceCode.h" 30#include <wtf/ASCIICType.h> 31#include <wtf/SegmentedVector.h> 32#include <wtf/Vector.h> 33 34namespace JSC { 35 36class Keywords { 37public: 38 bool isKeyword(const Identifier& ident) const 39 { 40 return m_keywordTable.entry(m_vm, ident); 41 } 42 43 const HashTableValue* getKeyword(const Identifier& ident) const 44 { 45 return m_keywordTable.entry(m_vm, ident); 46 } 47 48 ~Keywords() 49 { 50 m_keywordTable.deleteTable(); 51 } 52 53private: 54 friend class VM; 55 56 explicit Keywords(VM&); 57 58 VM& m_vm; 59 const HashTable m_keywordTable; 60}; 61 62enum LexerFlags { 63 LexerFlagsIgnoreReservedWords = 1, 64 LexerFlagsDontBuildStrings = 2, 65 LexexFlagsDontBuildKeywords = 4 66}; 67 68template <typename T> 69class Lexer { 70 WTF_MAKE_NONCOPYABLE(Lexer); 71 WTF_MAKE_FAST_ALLOCATED; 72 73public: 74 Lexer(VM*, JSParserStrictness); 75 ~Lexer(); 76 77 // Character manipulation functions. 78 static bool isWhiteSpace(T character); 79 static bool isLineTerminator(T character); 80 static unsigned char convertHex(int c1, int c2); 81 static UChar convertUnicode(int c1, int c2, int c3, int c4); 82 83 // Functions to set up parsing. 84 void setCode(const SourceCode&, ParserArena*); 85 void setIsReparsing() { m_isReparsing = true; } 86 bool isReparsing() const { return m_isReparsing; } 87 88 JSTokenType lex(JSToken*, unsigned, bool strictMode); 89 bool nextTokenIsColon(); 90 int lineNumber() const { return m_lineNumber; } 91 ALWAYS_INLINE int currentOffset() const { return offsetFromSourcePtr(m_code); } 92 ALWAYS_INLINE int currentLineStartOffset() const { return offsetFromSourcePtr(m_lineStart); } 93 ALWAYS_INLINE JSTextPosition currentPosition() const 94 { 95 return JSTextPosition(m_lineNumber, currentOffset(), currentLineStartOffset()); 96 } 97 JSTextPosition positionBeforeLastNewline() const { return m_positionBeforeLastNewline; } 98 void setLastLineNumber(int lastLineNumber) { m_lastLineNumber = lastLineNumber; } 99 int lastLineNumber() const { return m_lastLineNumber; } 100 bool prevTerminator() const { return m_terminator; } 101 bool scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix = 0); 102 bool skipRegExp(); 103 104 // Functions for use after parsing. 105 bool sawError() const { return m_error; } 106 String getErrorMessage() const { return m_lexErrorMessage; } 107 void clear(); 108 void setOffset(int offset, int lineStartOffset) 109 { 110 m_error = 0; 111 m_lexErrorMessage = String(); 112 113 m_code = sourcePtrFromOffset(offset); 114 m_lineStart = sourcePtrFromOffset(lineStartOffset); 115 ASSERT(currentOffset() >= currentLineStartOffset()); 116 117 m_buffer8.resize(0); 118 m_buffer16.resize(0); 119 if (LIKELY(m_code < m_codeEnd)) 120 m_current = *m_code; 121 else 122 m_current = 0; 123 } 124 void setLineNumber(int line) 125 { 126 m_lineNumber = line; 127 } 128 129 SourceProvider* sourceProvider() const { return m_source->provider(); } 130 131 JSTokenType lexExpectIdentifier(JSToken*, unsigned, bool strictMode); 132 133private: 134 void record8(int); 135 void append8(const T*, size_t); 136 void record16(int); 137 void record16(T); 138 void append16(const LChar*, size_t); 139 void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); } 140 141 ALWAYS_INLINE void shift(); 142 ALWAYS_INLINE bool atEnd() const; 143 ALWAYS_INLINE T peek(int offset) const; 144 struct UnicodeHexValue { 145 146 enum ValueType { ValidHex, IncompleteHex, InvalidHex }; 147 148 explicit UnicodeHexValue(int value) 149 : m_value(value) 150 { 151 } 152 explicit UnicodeHexValue(ValueType type) 153 : m_value(type == IncompleteHex ? -2 : -1) 154 { 155 } 156 157 ValueType valueType() const 158 { 159 if (m_value >= 0) 160 return ValidHex; 161 return m_value == -2 ? IncompleteHex : InvalidHex; 162 } 163 bool isValid() const { return m_value >= 0; } 164 int value() const 165 { 166 ASSERT(m_value >= 0); 167 return m_value; 168 } 169 170 private: 171 int m_value; 172 }; 173 UnicodeHexValue parseFourDigitUnicodeHex(); 174 void shiftLineTerminator(); 175 176 ALWAYS_INLINE int offsetFromSourcePtr(const T* ptr) const { return ptr - m_codeStart; } 177 ALWAYS_INLINE const T* sourcePtrFromOffset(int offset) const { return m_codeStart + offset; } 178 179 String invalidCharacterMessage() const; 180 ALWAYS_INLINE const T* currentSourcePtr() const; 181 ALWAYS_INLINE void setOffsetFromSourcePtr(const T* sourcePtr, unsigned lineStartOffset) { setOffset(offsetFromSourcePtr(sourcePtr), lineStartOffset); } 182 183 ALWAYS_INLINE void setCodeStart(const StringImpl*); 184 185 ALWAYS_INLINE const Identifier* makeIdentifier(const LChar* characters, size_t length); 186 ALWAYS_INLINE const Identifier* makeIdentifier(const UChar* characters, size_t length); 187 ALWAYS_INLINE const Identifier* makeLCharIdentifier(const LChar* characters, size_t length); 188 ALWAYS_INLINE const Identifier* makeLCharIdentifier(const UChar* characters, size_t length); 189 ALWAYS_INLINE const Identifier* makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars); 190 ALWAYS_INLINE const Identifier* makeIdentifierLCharFromUChar(const UChar* characters, size_t length); 191 192 ALWAYS_INLINE bool lastTokenWasRestrKeyword() const; 193 194 template <int shiftAmount> void internalShift(); 195 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*); 196 template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, unsigned lexerFlags, bool strictMode); 197 template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, unsigned lexerFlags, bool strictMode); 198 enum StringParseResult { 199 StringParsedSuccessfully, 200 StringUnterminated, 201 StringCannotBeParsed 202 }; 203 template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseString(JSTokenData*, bool strictMode); 204 template <bool shouldBuildStrings> NEVER_INLINE StringParseResult parseStringSlowCase(JSTokenData*, bool strictMode); 205 ALWAYS_INLINE void parseHex(double& returnValue); 206 ALWAYS_INLINE bool parseOctal(double& returnValue); 207 ALWAYS_INLINE bool parseDecimal(double& returnValue); 208 ALWAYS_INLINE void parseNumberAfterDecimalPoint(); 209 ALWAYS_INLINE bool parseNumberAfterExponentIndicator(); 210 ALWAYS_INLINE bool parseMultilineComment(); 211 212 static const size_t initialReadBufferCapacity = 32; 213 214 int m_lineNumber; 215 int m_lastLineNumber; 216 217 Vector<LChar> m_buffer8; 218 Vector<UChar> m_buffer16; 219 bool m_terminator; 220 int m_lastToken; 221 222 const SourceCode* m_source; 223 unsigned m_sourceOffset; 224 const T* m_code; 225 const T* m_codeStart; 226 const T* m_codeEnd; 227 const T* m_codeStartPlusOffset; 228 const T* m_lineStart; 229 JSTextPosition m_positionBeforeLastNewline; 230 bool m_isReparsing; 231 bool m_atLineStart; 232 bool m_error; 233 String m_lexErrorMessage; 234 235 T m_current; 236 237 IdentifierArena* m_arena; 238 239 VM* m_vm; 240 bool m_parsingBuiltinFunction; 241}; 242 243template <> 244ALWAYS_INLINE bool Lexer<LChar>::isWhiteSpace(LChar ch) 245{ 246 return ch == ' ' || ch == '\t' || ch == 0xB || ch == 0xC || ch == 0xA0; 247} 248 249template <> 250ALWAYS_INLINE bool Lexer<UChar>::isWhiteSpace(UChar ch) 251{ 252 // 0x180E used to be in Zs category before Unicode 6.3, and EcmaScript says that we should keep treating it as such. 253 return (ch < 256) ? Lexer<LChar>::isWhiteSpace(static_cast<LChar>(ch)) : (u_charType(ch) == U_SPACE_SEPARATOR || ch == 0x180E || ch == 0xFEFF); 254} 255 256template <> 257ALWAYS_INLINE bool Lexer<LChar>::isLineTerminator(LChar ch) 258{ 259 return ch == '\r' || ch == '\n'; 260} 261 262template <> 263ALWAYS_INLINE bool Lexer<UChar>::isLineTerminator(UChar ch) 264{ 265 return ch == '\r' || ch == '\n' || (ch & ~1) == 0x2028; 266} 267 268template <typename T> 269inline unsigned char Lexer<T>::convertHex(int c1, int c2) 270{ 271 return (toASCIIHexValue(c1) << 4) | toASCIIHexValue(c2); 272} 273 274template <typename T> 275inline UChar Lexer<T>::convertUnicode(int c1, int c2, int c3, int c4) 276{ 277 return (convertHex(c1, c2) << 8) | convertHex(c3, c4); 278} 279 280template <typename T> 281ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const LChar* characters, size_t length) 282{ 283 return &m_arena->makeIdentifier(m_vm, characters, length); 284} 285 286template <typename T> 287ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const UChar* characters, size_t length) 288{ 289 return &m_arena->makeIdentifier(m_vm, characters, length); 290} 291 292template <> 293ALWAYS_INLINE const Identifier* Lexer<LChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar) 294{ 295 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length); 296} 297 298template <> 299ALWAYS_INLINE const Identifier* Lexer<UChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars) 300{ 301 if (!(orAllChars & ~0xff)) 302 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length); 303 304 return &m_arena->makeIdentifier(m_vm, characters, length); 305} 306 307template <> 308ALWAYS_INLINE void Lexer<LChar>::setCodeStart(const StringImpl* sourceString) 309{ 310 ASSERT(sourceString->is8Bit()); 311 m_codeStart = sourceString->characters8(); 312} 313 314template <> 315ALWAYS_INLINE void Lexer<UChar>::setCodeStart(const StringImpl* sourceString) 316{ 317 ASSERT(!sourceString->is8Bit()); 318 m_codeStart = sourceString->characters16(); 319} 320 321template <typename T> 322ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifierLCharFromUChar(const UChar* characters, size_t length) 323{ 324 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length); 325} 326 327template <typename T> 328ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const LChar* characters, size_t length) 329{ 330 return &m_arena->makeIdentifier(m_vm, characters, length); 331} 332 333template <typename T> 334ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const UChar* characters, size_t length) 335{ 336 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length); 337} 338 339#if ASSERT_DISABLED 340ALWAYS_INLINE bool isSafeBuiltinIdentifier(VM&, const Identifier*) { return true; } 341#else 342bool isSafeBuiltinIdentifier(VM&, const Identifier*); 343#endif 344 345template <typename T> 346ALWAYS_INLINE JSTokenType Lexer<T>::lexExpectIdentifier(JSToken* tokenRecord, unsigned lexerFlags, bool strictMode) 347{ 348 JSTokenData* tokenData = &tokenRecord->m_data; 349 JSTokenLocation* tokenLocation = &tokenRecord->m_location; 350 ASSERT((lexerFlags & LexerFlagsIgnoreReservedWords)); 351 const T* start = m_code; 352 const T* ptr = start; 353 const T* end = m_codeEnd; 354 JSTextPosition startPosition = currentPosition(); 355 if (ptr >= end) { 356 ASSERT(ptr == end); 357 goto slowCase; 358 } 359 if (!WTF::isASCIIAlpha(*ptr)) 360 goto slowCase; 361 ++ptr; 362 while (ptr < end) { 363 if (!WTF::isASCIIAlphanumeric(*ptr)) 364 break; 365 ++ptr; 366 } 367 368 // Here's the shift 369 if (ptr < end) { 370 if ((!WTF::isASCII(*ptr)) || (*ptr == '\\') || (*ptr == '_') || (*ptr == '$')) 371 goto slowCase; 372 m_current = *ptr; 373 } else 374 m_current = 0; 375 376 m_code = ptr; 377 ASSERT(currentOffset() >= currentLineStartOffset()); 378 379 // Create the identifier if needed 380 if (lexerFlags & LexexFlagsDontBuildKeywords 381#if !ASSERT_DISABLED 382 && !m_parsingBuiltinFunction 383#endif 384 ) 385 tokenData->ident = 0; 386 else 387 tokenData->ident = makeLCharIdentifier(start, ptr - start); 388 389 tokenLocation->line = m_lineNumber; 390 tokenLocation->lineStartOffset = currentLineStartOffset(); 391 tokenLocation->startOffset = offsetFromSourcePtr(start); 392 tokenLocation->endOffset = currentOffset(); 393 ASSERT(tokenLocation->startOffset >= tokenLocation->lineStartOffset); 394 tokenRecord->m_startPosition = startPosition; 395 tokenRecord->m_endPosition = currentPosition(); 396#if !ASSERT_DISABLED 397 if (m_parsingBuiltinFunction) { 398 if (!isSafeBuiltinIdentifier(*m_vm, tokenData->ident)) 399 return ERRORTOK; 400 } 401#endif 402 403 m_lastToken = IDENT; 404 return IDENT; 405 406slowCase: 407 return lex(tokenRecord, lexerFlags, strictMode); 408} 409 410} // namespace JSC 411 412#endif // Lexer_h 413