1/*
2 *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 *  Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All rights reserved.
4 *  Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
5 *
6 *  This library is free software; you can redistribute it and/or
7 *  modify it under the terms of the GNU Library General Public
8 *  License as published by the Free Software Foundation; either
9 *  version 2 of the License, or (at your option) any later version.
10 *
11 *  This library is distributed in the hope that it will be useful,
12 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 *  Library General Public License for more details.
15 *
16 *  You should have received a copy of the GNU Library General Public License
17 *  along with this library; see the file COPYING.LIB.  If not, write to
18 *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 *  Boston, MA 02110-1301, USA.
20 *
21 */
22
23#ifndef Lexer_h
24#define Lexer_h
25
26#include "Lookup.h"
27#include "ParserArena.h"
28#include "ParserTokens.h"
29#include "SourceCode.h"
30#include <wtf/ASCIICType.h>
31#include <wtf/SegmentedVector.h>
32#include <wtf/Vector.h>
33
34namespace JSC {
35
36class Keywords {
37public:
38    bool isKeyword(const Identifier& ident) const
39    {
40        return m_keywordTable.entry(m_vm, ident);
41    }
42
43    const HashTableValue* getKeyword(const Identifier& ident) const
44    {
45        return m_keywordTable.entry(m_vm, ident);
46    }
47
48    ~Keywords()
49    {
50        m_keywordTable.deleteTable();
51    }
52
53private:
54    friend class VM;
55
56    explicit Keywords(VM&);
57
58    VM& m_vm;
59    const HashTable m_keywordTable;
60};
61
// Bit flags that can be OR-ed together and passed to the lexer as an unsigned
// value. Note: "LexexFlagsDontBuildKeywords" is spelled this way on purpose
// here; the name is referenced by code in this file, so it is kept as is.
enum LexerFlags {
    LexerFlagsIgnoreReservedWords = 1 << 0,
    LexerFlagsDontBuildStrings = 1 << 1,
    LexexFlagsDontBuildKeywords = 1 << 2
};
67
68template <typename T>
69class Lexer {
70    WTF_MAKE_NONCOPYABLE(Lexer);
71    WTF_MAKE_FAST_ALLOCATED;
72
73public:
74    Lexer(VM*, JSParserStrictness);
75    ~Lexer();
76
77    // Character manipulation functions.
78    static bool isWhiteSpace(T character);
79    static bool isLineTerminator(T character);
80    static unsigned char convertHex(int c1, int c2);
81    static UChar convertUnicode(int c1, int c2, int c3, int c4);
82
83    // Functions to set up parsing.
84    void setCode(const SourceCode&, ParserArena*);
85    void setIsReparsing() { m_isReparsing = true; }
86    bool isReparsing() const { return m_isReparsing; }
87
88    JSTokenType lex(JSToken*, unsigned, bool strictMode);
89    bool nextTokenIsColon();
90    int lineNumber() const { return m_lineNumber; }
91    ALWAYS_INLINE int currentOffset() const { return offsetFromSourcePtr(m_code); }
92    ALWAYS_INLINE int currentLineStartOffset() const { return offsetFromSourcePtr(m_lineStart); }
93    ALWAYS_INLINE JSTextPosition currentPosition() const
94    {
95        return JSTextPosition(m_lineNumber, currentOffset(), currentLineStartOffset());
96    }
97    JSTextPosition positionBeforeLastNewline() const { return m_positionBeforeLastNewline; }
98    void setLastLineNumber(int lastLineNumber) { m_lastLineNumber = lastLineNumber; }
99    int lastLineNumber() const { return m_lastLineNumber; }
100    bool prevTerminator() const { return m_terminator; }
101    bool scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix = 0);
102    bool skipRegExp();
103
104    // Functions for use after parsing.
105    bool sawError() const { return m_error; }
106    String getErrorMessage() const { return m_lexErrorMessage; }
107    void clear();
108    void setOffset(int offset, int lineStartOffset)
109    {
110        m_error = 0;
111        m_lexErrorMessage = String();
112
113        m_code = sourcePtrFromOffset(offset);
114        m_lineStart = sourcePtrFromOffset(lineStartOffset);
115        ASSERT(currentOffset() >= currentLineStartOffset());
116
117        m_buffer8.resize(0);
118        m_buffer16.resize(0);
119        if (LIKELY(m_code < m_codeEnd))
120            m_current = *m_code;
121        else
122            m_current = 0;
123    }
124    void setLineNumber(int line)
125    {
126        m_lineNumber = line;
127    }
128
129    SourceProvider* sourceProvider() const { return m_source->provider(); }
130
131    JSTokenType lexExpectIdentifier(JSToken*, unsigned, bool strictMode);
132
133private:
134    void record8(int);
135    void append8(const T*, size_t);
136    void record16(int);
137    void record16(T);
138    void append16(const LChar*, size_t);
139    void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }
140
141    ALWAYS_INLINE void shift();
142    ALWAYS_INLINE bool atEnd() const;
143    ALWAYS_INLINE T peek(int offset) const;
144    struct UnicodeHexValue {
145
146        enum ValueType { ValidHex, IncompleteHex, InvalidHex };
147
148        explicit UnicodeHexValue(int value)
149            : m_value(value)
150        {
151        }
152        explicit UnicodeHexValue(ValueType type)
153            : m_value(type == IncompleteHex ? -2 : -1)
154        {
155        }
156
157        ValueType valueType() const
158        {
159            if (m_value >= 0)
160                return ValidHex;
161            return m_value == -2 ? IncompleteHex : InvalidHex;
162        }
163        bool isValid() const { return m_value >= 0; }
164        int value() const
165        {
166            ASSERT(m_value >= 0);
167            return m_value;
168        }
169
170    private:
171        int m_value;
172    };
173    UnicodeHexValue parseFourDigitUnicodeHex();
174    void shiftLineTerminator();
175
176    ALWAYS_INLINE int offsetFromSourcePtr(const T* ptr) const { return ptr - m_codeStart; }
177    ALWAYS_INLINE const T* sourcePtrFromOffset(int offset) const { return m_codeStart + offset; }
178
179    String invalidCharacterMessage() const;
180    ALWAYS_INLINE const T* currentSourcePtr() const;
181    ALWAYS_INLINE void setOffsetFromSourcePtr(const T* sourcePtr, unsigned lineStartOffset) { setOffset(offsetFromSourcePtr(sourcePtr), lineStartOffset); }
182
183    ALWAYS_INLINE void setCodeStart(const StringImpl*);
184
185    ALWAYS_INLINE const Identifier* makeIdentifier(const LChar* characters, size_t length);
186    ALWAYS_INLINE const Identifier* makeIdentifier(const UChar* characters, size_t length);
187    ALWAYS_INLINE const Identifier* makeLCharIdentifier(const LChar* characters, size_t length);
188    ALWAYS_INLINE const Identifier* makeLCharIdentifier(const UChar* characters, size_t length);
189    ALWAYS_INLINE const Identifier* makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars);
190    ALWAYS_INLINE const Identifier* makeIdentifierLCharFromUChar(const UChar* characters, size_t length);
191
192    ALWAYS_INLINE bool lastTokenWasRestrKeyword() const;
193
194    template <int shiftAmount> void internalShift();
195    template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
196    template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, unsigned lexerFlags, bool strictMode);
197    template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, unsigned lexerFlags, bool strictMode);
198    enum StringParseResult {
199        StringParsedSuccessfully,
200        StringUnterminated,
201        StringCannotBeParsed
202    };
203    template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseString(JSTokenData*, bool strictMode);
204    template <bool shouldBuildStrings> NEVER_INLINE StringParseResult parseStringSlowCase(JSTokenData*, bool strictMode);
205    ALWAYS_INLINE void parseHex(double& returnValue);
206    ALWAYS_INLINE bool parseOctal(double& returnValue);
207    ALWAYS_INLINE bool parseDecimal(double& returnValue);
208    ALWAYS_INLINE void parseNumberAfterDecimalPoint();
209    ALWAYS_INLINE bool parseNumberAfterExponentIndicator();
210    ALWAYS_INLINE bool parseMultilineComment();
211
212    static const size_t initialReadBufferCapacity = 32;
213
214    int m_lineNumber;
215    int m_lastLineNumber;
216
217    Vector<LChar> m_buffer8;
218    Vector<UChar> m_buffer16;
219    bool m_terminator;
220    int m_lastToken;
221
222    const SourceCode* m_source;
223    unsigned m_sourceOffset;
224    const T* m_code;
225    const T* m_codeStart;
226    const T* m_codeEnd;
227    const T* m_codeStartPlusOffset;
228    const T* m_lineStart;
229    JSTextPosition m_positionBeforeLastNewline;
230    bool m_isReparsing;
231    bool m_atLineStart;
232    bool m_error;
233    String m_lexErrorMessage;
234
235    T m_current;
236
237    IdentifierArena* m_arena;
238
239    VM* m_vm;
240    bool m_parsingBuiltinFunction;
241};
242
243template <>
244ALWAYS_INLINE bool Lexer<LChar>::isWhiteSpace(LChar ch)
245{
246    return ch == ' ' || ch == '\t' || ch == 0xB || ch == 0xC || ch == 0xA0;
247}
248
249template <>
250ALWAYS_INLINE bool Lexer<UChar>::isWhiteSpace(UChar ch)
251{
252    // 0x180E used to be in Zs category before Unicode 6.3, and EcmaScript says that we should keep treating it as such.
253    return (ch < 256) ? Lexer<LChar>::isWhiteSpace(static_cast<LChar>(ch)) : (u_charType(ch) == U_SPACE_SEPARATOR || ch == 0x180E || ch == 0xFEFF);
254}
255
256template <>
257ALWAYS_INLINE bool Lexer<LChar>::isLineTerminator(LChar ch)
258{
259    return ch == '\r' || ch == '\n';
260}
261
262template <>
263ALWAYS_INLINE bool Lexer<UChar>::isLineTerminator(UChar ch)
264{
265    return ch == '\r' || ch == '\n' || (ch & ~1) == 0x2028;
266}
267
268template <typename T>
269inline unsigned char Lexer<T>::convertHex(int c1, int c2)
270{
271    return (toASCIIHexValue(c1) << 4) | toASCIIHexValue(c2);
272}
273
274template <typename T>
275inline UChar Lexer<T>::convertUnicode(int c1, int c2, int c3, int c4)
276{
277    return (convertHex(c1, c2) << 8) | convertHex(c3, c4);
278}
279
280template <typename T>
281ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const LChar* characters, size_t length)
282{
283    return &m_arena->makeIdentifier(m_vm, characters, length);
284}
285
286template <typename T>
287ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const UChar* characters, size_t length)
288{
289    return &m_arena->makeIdentifier(m_vm, characters, length);
290}
291
292template <>
293ALWAYS_INLINE const Identifier* Lexer<LChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar)
294{
295    return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
296}
297
298template <>
299ALWAYS_INLINE const Identifier* Lexer<UChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars)
300{
301    if (!(orAllChars & ~0xff))
302        return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
303
304    return &m_arena->makeIdentifier(m_vm, characters, length);
305}
306
307template <>
308ALWAYS_INLINE void Lexer<LChar>::setCodeStart(const StringImpl* sourceString)
309{
310    ASSERT(sourceString->is8Bit());
311    m_codeStart = sourceString->characters8();
312}
313
314template <>
315ALWAYS_INLINE void Lexer<UChar>::setCodeStart(const StringImpl* sourceString)
316{
317    ASSERT(!sourceString->is8Bit());
318    m_codeStart = sourceString->characters16();
319}
320
321template <typename T>
322ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifierLCharFromUChar(const UChar* characters, size_t length)
323{
324    return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
325}
326
327template <typename T>
328ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const LChar* characters, size_t length)
329{
330    return &m_arena->makeIdentifier(m_vm, characters, length);
331}
332
333template <typename T>
334ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const UChar* characters, size_t length)
335{
336    return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
337}
338
339#if ASSERT_DISABLED
340ALWAYS_INLINE bool isSafeBuiltinIdentifier(VM&, const Identifier*) { return true; }
341#else
342bool isSafeBuiltinIdentifier(VM&, const Identifier*);
343#endif
344
345template <typename T>
346ALWAYS_INLINE JSTokenType Lexer<T>::lexExpectIdentifier(JSToken* tokenRecord, unsigned lexerFlags, bool strictMode)
347{
348    JSTokenData* tokenData = &tokenRecord->m_data;
349    JSTokenLocation* tokenLocation = &tokenRecord->m_location;
350    ASSERT((lexerFlags & LexerFlagsIgnoreReservedWords));
351    const T* start = m_code;
352    const T* ptr = start;
353    const T* end = m_codeEnd;
354    JSTextPosition startPosition = currentPosition();
355    if (ptr >= end) {
356        ASSERT(ptr == end);
357        goto slowCase;
358    }
359    if (!WTF::isASCIIAlpha(*ptr))
360        goto slowCase;
361    ++ptr;
362    while (ptr < end) {
363        if (!WTF::isASCIIAlphanumeric(*ptr))
364            break;
365        ++ptr;
366    }
367
368    // Here's the shift
369    if (ptr < end) {
370        if ((!WTF::isASCII(*ptr)) || (*ptr == '\\') || (*ptr == '_') || (*ptr == '$'))
371            goto slowCase;
372        m_current = *ptr;
373    } else
374        m_current = 0;
375
376    m_code = ptr;
377    ASSERT(currentOffset() >= currentLineStartOffset());
378
379    // Create the identifier if needed
380    if (lexerFlags & LexexFlagsDontBuildKeywords
381#if !ASSERT_DISABLED
382        && !m_parsingBuiltinFunction
383#endif
384        )
385        tokenData->ident = 0;
386    else
387        tokenData->ident = makeLCharIdentifier(start, ptr - start);
388
389    tokenLocation->line = m_lineNumber;
390    tokenLocation->lineStartOffset = currentLineStartOffset();
391    tokenLocation->startOffset = offsetFromSourcePtr(start);
392    tokenLocation->endOffset = currentOffset();
393    ASSERT(tokenLocation->startOffset >= tokenLocation->lineStartOffset);
394    tokenRecord->m_startPosition = startPosition;
395    tokenRecord->m_endPosition = currentPosition();
396#if !ASSERT_DISABLED
397    if (m_parsingBuiltinFunction) {
398        if (!isSafeBuiltinIdentifier(*m_vm, tokenData->ident))
399            return ERRORTOK;
400    }
401#endif
402
403    m_lastToken = IDENT;
404    return IDENT;
405
406slowCase:
407    return lex(tokenRecord, lexerFlags, strictMode);
408}
409
410} // namespace JSC
411
412#endif // Lexer_h
413