1/*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#ifndef HTMLTokenizer_h
28#define HTMLTokenizer_h
29
30#include "HTMLParserOptions.h"
31#include "HTMLToken.h"
32#include "InputStreamPreprocessor.h"
33#include "SegmentedString.h"
34
35namespace WebCore {
36
37class HTMLTokenizer {
38    WTF_MAKE_NONCOPYABLE(HTMLTokenizer);
39    WTF_MAKE_FAST_ALLOCATED;
40public:
41    explicit HTMLTokenizer(const HTMLParserOptions&);
42    ~HTMLTokenizer();
43
44    void reset();
45
46    enum State {
47        DataState,
48        CharacterReferenceInDataState,
49        RCDATAState,
50        CharacterReferenceInRCDATAState,
51        RAWTEXTState,
52        ScriptDataState,
53        PLAINTEXTState,
54        TagOpenState,
55        EndTagOpenState,
56        TagNameState,
57        RCDATALessThanSignState,
58        RCDATAEndTagOpenState,
59        RCDATAEndTagNameState,
60        RAWTEXTLessThanSignState,
61        RAWTEXTEndTagOpenState,
62        RAWTEXTEndTagNameState,
63        ScriptDataLessThanSignState,
64        ScriptDataEndTagOpenState,
65        ScriptDataEndTagNameState,
66        ScriptDataEscapeStartState,
67        ScriptDataEscapeStartDashState,
68        ScriptDataEscapedState,
69        ScriptDataEscapedDashState,
70        ScriptDataEscapedDashDashState,
71        ScriptDataEscapedLessThanSignState,
72        ScriptDataEscapedEndTagOpenState,
73        ScriptDataEscapedEndTagNameState,
74        ScriptDataDoubleEscapeStartState,
75        ScriptDataDoubleEscapedState,
76        ScriptDataDoubleEscapedDashState,
77        ScriptDataDoubleEscapedDashDashState,
78        ScriptDataDoubleEscapedLessThanSignState,
79        ScriptDataDoubleEscapeEndState,
80        BeforeAttributeNameState,
81        AttributeNameState,
82        AfterAttributeNameState,
83        BeforeAttributeValueState,
84        AttributeValueDoubleQuotedState,
85        AttributeValueSingleQuotedState,
86        AttributeValueUnquotedState,
87        CharacterReferenceInAttributeValueState,
88        AfterAttributeValueQuotedState,
89        SelfClosingStartTagState,
90        BogusCommentState,
91        // The ContinueBogusCommentState is not in the HTML5 spec, but we use
92        // it internally to keep track of whether we've started the bogus
93        // comment token yet.
94        ContinueBogusCommentState,
95        MarkupDeclarationOpenState,
96        CommentStartState,
97        CommentStartDashState,
98        CommentState,
99        CommentEndDashState,
100        CommentEndState,
101        CommentEndBangState,
102        DOCTYPEState,
103        BeforeDOCTYPENameState,
104        DOCTYPENameState,
105        AfterDOCTYPENameState,
106        AfterDOCTYPEPublicKeywordState,
107        BeforeDOCTYPEPublicIdentifierState,
108        DOCTYPEPublicIdentifierDoubleQuotedState,
109        DOCTYPEPublicIdentifierSingleQuotedState,
110        AfterDOCTYPEPublicIdentifierState,
111        BetweenDOCTYPEPublicAndSystemIdentifiersState,
112        AfterDOCTYPESystemKeywordState,
113        BeforeDOCTYPESystemIdentifierState,
114        DOCTYPESystemIdentifierDoubleQuotedState,
115        DOCTYPESystemIdentifierSingleQuotedState,
116        AfterDOCTYPESystemIdentifierState,
117        BogusDOCTYPEState,
118        CDATASectionState,
119        // These CDATA states are not in the HTML5 spec, but we use them internally.
120        CDATASectionRightSquareBracketState,
121        CDATASectionDoubleRightSquareBracketState,
122    };
123
124    // This function returns true if it emits a token. Otherwise, callers
125    // must provide the same (in progress) token on the next call (unless
126    // they call reset() first).
127    bool nextToken(SegmentedString&, HTMLToken&);
128
129    // Returns a copy of any characters buffered internally by the tokenizer.
130    // The tokenizer buffers characters when searching for the </script> token
131    // that terminates a script element.
132    String bufferedCharacters() const;
133
134    size_t numberOfBufferedCharacters() const
135    {
136        // Notice that we add 2 to the length of the m_temporaryBuffer to
137        // account for the "</" characters, which are effecitvely buffered in
138        // the tokenizer's state machine.
139        return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0;
140    }
141
142    // Updates the tokenizer's state according to the given tag name. This is
143    // an approximation of how the tree builder would update the tokenizer's
144    // state. This method is useful for approximating HTML tokenization. To
145    // get exactly the correct tokenization, you need the real tree builder.
146    //
147    // The main failures in the approximation are as follows:
148    //
149    //  * The first set of character tokens emitted for a <pre> element might
150    //    contain an extra leading newline.
151    //  * The replacement of U+0000 with U+FFFD will not be sensitive to the
152    //    tree builder's insertion mode.
153    //  * CDATA sections in foreign content will be tokenized as bogus comments
154    //    instead of as character tokens.
155    //
156    void updateStateFor(const AtomicString& tagName);
157
158    bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
159    void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
160
161    bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
162    void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }
163
164    State state() const { return m_state; }
165    void setState(State state) { m_state = state; }
166
167    inline bool shouldSkipNullCharacters() const
168    {
169        return !m_forceNullCharacterReplacement
170            && (m_state == HTMLTokenizer::DataState
171                || m_state == HTMLTokenizer::RCDATAState
172                || m_state == HTMLTokenizer::RAWTEXTState);
173    }
174
175private:
176    inline bool processEntity(SegmentedString&);
177
178    inline void parseError();
179
180    inline void bufferCharacter(UChar character)
181    {
182        ASSERT(character != kEndOfFileMarker);
183        m_token->ensureIsCharacterToken();
184        m_token->appendToCharacter(character);
185    }
186
187    inline bool emitAndResumeIn(SegmentedString& source, State state)
188    {
189        saveEndTagNameIfNeeded();
190        m_state = state;
191        source.advanceAndUpdateLineNumber();
192        return true;
193    }
194
195    inline bool emitAndReconsumeIn(SegmentedString&, State state)
196    {
197        saveEndTagNameIfNeeded();
198        m_state = state;
199        return true;
200    }
201
202    inline bool emitEndOfFile(SegmentedString& source)
203    {
204        if (haveBufferedCharacterToken())
205            return true;
206        m_state = HTMLTokenizer::DataState;
207        source.advanceAndUpdateLineNumber();
208        m_token->clear();
209        m_token->makeEndOfFile();
210        return true;
211    }
212
213    inline bool flushEmitAndResumeIn(SegmentedString&, State);
214
215    // Return whether we need to emit a character token before dealing with
216    // the buffered end tag.
217    inline bool flushBufferedEndTag(SegmentedString&);
218    inline bool temporaryBufferIs(const String&);
219
220    // Sometimes we speculatively consume input characters and we don't
221    // know whether they represent end tags or RCDATA, etc. These
222    // functions help manage these state.
223    inline void addToPossibleEndTag(LChar cc);
224
225    inline void saveEndTagNameIfNeeded()
226    {
227        ASSERT(m_token->type() != HTMLToken::Uninitialized);
228        if (m_token->type() == HTMLToken::StartTag)
229            m_appropriateEndTagName = m_token->name();
230    }
231    inline bool isAppropriateEndTag();
232
233
234    inline bool haveBufferedCharacterToken()
235    {
236        return m_token->type() == HTMLToken::Character;
237    }
238
239    State m_state;
240    bool m_forceNullCharacterReplacement;
241    bool m_shouldAllowCDATA;
242
243    // m_token is owned by the caller. If nextToken is not on the stack,
244    // this member might be pointing to unallocated memory.
245    HTMLToken* m_token;
246
247    // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
248    UChar m_additionalAllowedCharacter;
249
250    // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
251    InputStreamPreprocessor<HTMLTokenizer> m_inputStreamPreprocessor;
252
253    Vector<UChar, 32> m_appropriateEndTagName;
254
255    // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
256    Vector<LChar, 32> m_temporaryBuffer;
257
258    // We occationally want to emit both a character token and an end tag
259    // token (e.g., when lexing script). We buffer the name of the end tag
260    // token here so we remember it next time we re-enter the tokenizer.
261    Vector<LChar, 32> m_bufferedEndTagName;
262
263    HTMLParserOptions m_options;
264};
265
266}
267
268#endif
269