1/*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "HTMLTokenizer.h"
30
31#include "HTMLEntityParser.h"
32#include "HTMLTreeBuilder.h"
33#include "MarkupTokenizerInlines.h"
34#include "NotImplemented.h"
35#include <wtf/ASCIICType.h>
36#include <wtf/CurrentTime.h>
37#include <wtf/text/CString.h>
38
39using namespace WTF;
40
41namespace WebCore {
42
43using namespace HTMLNames;
44
45// This has to go in a .cpp file, as the linker doesn't like it being included more than once.
46// We don't have an HTMLToken.cpp though, so this is the next best place.
47QualifiedName AtomicHTMLToken::nameForAttribute(const HTMLToken::Attribute& attribute) const
48{
49    return QualifiedName(nullAtom, AtomicString(attribute.name), nullAtom);
50}
51
52bool AtomicHTMLToken::usesName() const
53{
54    return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE;
55}
56
57bool AtomicHTMLToken::usesAttributes() const
58{
59    return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
60}
61
62static inline UChar toLowerCase(UChar cc)
63{
64    ASSERT(isASCIIUpper(cc));
65    const int lowerCaseOffset = 0x20;
66    return cc + lowerCaseOffset;
67}
68
69static inline bool vectorEqualsString(const Vector<LChar, 32>& vector, const String& string)
70{
71    if (vector.size() != string.length())
72        return false;
73
74    if (!string.length())
75        return true;
76
77    return equal(string.impl(), vector.data(), vector.size());
78}
79
80static inline bool isEndTagBufferingState(HTMLTokenizer::State state)
81{
82    switch (state) {
83    case HTMLTokenizer::RCDATAEndTagOpenState:
84    case HTMLTokenizer::RCDATAEndTagNameState:
85    case HTMLTokenizer::RAWTEXTEndTagOpenState:
86    case HTMLTokenizer::RAWTEXTEndTagNameState:
87    case HTMLTokenizer::ScriptDataEndTagOpenState:
88    case HTMLTokenizer::ScriptDataEndTagNameState:
89    case HTMLTokenizer::ScriptDataEscapedEndTagOpenState:
90    case HTMLTokenizer::ScriptDataEscapedEndTagNameState:
91        return true;
92    default:
93        return false;
94    }
95}
96
// Shorthands that bind the generic state-machine macros (BEGIN_STATE,
// RECONSUME_IN, ADVANCE_TO, SWITCH_TO -- defined outside this file) to the
// HTMLTokenizer class, so each state only has to name itself.
#define HTML_BEGIN_STATE(state) BEGIN_STATE(HTMLTokenizer, state)
#define HTML_RECONSUME_IN(state) RECONSUME_IN(HTMLTokenizer, state)
#define HTML_ADVANCE_TO(state) ADVANCE_TO(HTMLTokenizer, state)
#define HTML_SWITCH_TO(state) SWITCH_TO(HTMLTokenizer, state)
101
102HTMLTokenizer::HTMLTokenizer(const HTMLParserOptions& options)
103    : m_inputStreamPreprocessor(this)
104    , m_options(options)
105{
106    reset();
107}
108
109HTMLTokenizer::~HTMLTokenizer()
110{
111}
112
113void HTMLTokenizer::reset()
114{
115    m_state = HTMLTokenizer::DataState;
116    m_token = 0;
117    m_forceNullCharacterReplacement = false;
118    m_shouldAllowCDATA = false;
119    m_additionalAllowedCharacter = '\0';
120}
121
122inline bool HTMLTokenizer::processEntity(SegmentedString& source)
123{
124    bool notEnoughCharacters = false;
125    StringBuilder decodedEntity;
126    bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
127    if (notEnoughCharacters)
128        return false;
129    if (!success) {
130        ASSERT(decodedEntity.isEmpty());
131        bufferCharacter('&');
132    } else {
133        for (unsigned i = 0; i < decodedEntity.length(); ++i)
134            bufferCharacter(decodedEntity[i]);
135    }
136    return true;
137}
138
139bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
140{
141    ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized);
142    source.advanceAndUpdateLineNumber();
143    if (m_token->type() == HTMLToken::Character)
144        return true;
145    m_token->beginEndTag(m_bufferedEndTagName);
146    m_bufferedEndTagName.clear();
147    m_appropriateEndTagName.clear();
148    m_temporaryBuffer.clear();
149    return false;
150}
151
// Records |state| as the current state, flushes the buffered end tag, and
// then continues tokenizing in |state|.
//   - If flushBufferedEndTag() returns true, the enclosing function returns
//     true immediately.
//   - If the source is empty or the preprocessor cannot peek a character,
//     the enclosing function returns haveBufferedCharacterToken().
//   - Otherwise the next input character is loaded into cc and control jumps
//     to the label named |state|.
// Expects |source| and |cc| to be in scope at the point of use.
#define FLUSH_AND_ADVANCE_TO(state)                                              \
    do {                                                                         \
        m_state = HTMLTokenizer::state;                                          \
        if (flushBufferedEndTag(source))                                         \
            return true;                                                         \
        if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source))         \
            return haveBufferedCharacterToken();                                 \
        cc = m_inputStreamPreprocessor.nextInputCharacter();                     \
        goto state;                                                              \
    } while (false)
163
164bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state)
165{
166    m_state = state;
167    flushBufferedEndTag(source);
168    return true;
169}
170
171bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
172{
173    // If we have a token in progress, then we're supposed to be called back
174    // with the same token so we can finish it.
175    ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized);
176    m_token = &token;
177
178    if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
179        // FIXME: This should call flushBufferedEndTag().
180        // We started an end tag during our last iteration.
181        m_token->beginEndTag(m_bufferedEndTagName);
182        m_bufferedEndTagName.clear();
183        m_appropriateEndTagName.clear();
184        m_temporaryBuffer.clear();
185        if (m_state == HTMLTokenizer::DataState) {
186            // We're back in the data state, so we must be done with the tag.
187            return true;
188        }
189    }
190
191    if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source))
192        return haveBufferedCharacterToken();
193    UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
194
195    // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
196    switch (m_state) {
197    HTML_BEGIN_STATE(DataState) {
198        if (cc == '&')
199            HTML_ADVANCE_TO(CharacterReferenceInDataState);
200        else if (cc == '<') {
201            if (m_token->type() == HTMLToken::Character) {
202                // We have a bunch of character tokens queued up that we
203                // are emitting lazily here.
204                return true;
205            }
206            HTML_ADVANCE_TO(TagOpenState);
207        } else if (cc == kEndOfFileMarker)
208            return emitEndOfFile(source);
209        else {
210            bufferCharacter(cc);
211            HTML_ADVANCE_TO(DataState);
212        }
213    }
214    END_STATE()
215
216    HTML_BEGIN_STATE(CharacterReferenceInDataState) {
217        if (!processEntity(source))
218            return haveBufferedCharacterToken();
219        HTML_SWITCH_TO(DataState);
220    }
221    END_STATE()
222
223    HTML_BEGIN_STATE(RCDATAState) {
224        if (cc == '&')
225            HTML_ADVANCE_TO(CharacterReferenceInRCDATAState);
226        else if (cc == '<')
227            HTML_ADVANCE_TO(RCDATALessThanSignState);
228        else if (cc == kEndOfFileMarker)
229            return emitEndOfFile(source);
230        else {
231            bufferCharacter(cc);
232            HTML_ADVANCE_TO(RCDATAState);
233        }
234    }
235    END_STATE()
236
237    HTML_BEGIN_STATE(CharacterReferenceInRCDATAState) {
238        if (!processEntity(source))
239            return haveBufferedCharacterToken();
240        HTML_SWITCH_TO(RCDATAState);
241    }
242    END_STATE()
243
244    HTML_BEGIN_STATE(RAWTEXTState) {
245        if (cc == '<')
246            HTML_ADVANCE_TO(RAWTEXTLessThanSignState);
247        else if (cc == kEndOfFileMarker)
248            return emitEndOfFile(source);
249        else {
250            bufferCharacter(cc);
251            HTML_ADVANCE_TO(RAWTEXTState);
252        }
253    }
254    END_STATE()
255
256    HTML_BEGIN_STATE(ScriptDataState) {
257        if (cc == '<')
258            HTML_ADVANCE_TO(ScriptDataLessThanSignState);
259        else if (cc == kEndOfFileMarker)
260            return emitEndOfFile(source);
261        else {
262            bufferCharacter(cc);
263            HTML_ADVANCE_TO(ScriptDataState);
264        }
265    }
266    END_STATE()
267
268    HTML_BEGIN_STATE(PLAINTEXTState) {
269        if (cc == kEndOfFileMarker)
270            return emitEndOfFile(source);
271        bufferCharacter(cc);
272        HTML_ADVANCE_TO(PLAINTEXTState);
273    }
274    END_STATE()
275
276    HTML_BEGIN_STATE(TagOpenState) {
277        if (cc == '!')
278            HTML_ADVANCE_TO(MarkupDeclarationOpenState);
279        else if (cc == '/')
280            HTML_ADVANCE_TO(EndTagOpenState);
281        else if (isASCIIUpper(cc)) {
282            m_token->beginStartTag(toLowerCase(cc));
283            HTML_ADVANCE_TO(TagNameState);
284        } else if (isASCIILower(cc)) {
285            m_token->beginStartTag(cc);
286            HTML_ADVANCE_TO(TagNameState);
287        } else if (cc == '?') {
288            parseError();
289            // The spec consumes the current character before switching
290            // to the bogus comment state, but it's easier to implement
291            // if we reconsume the current character.
292            HTML_RECONSUME_IN(BogusCommentState);
293        } else {
294            parseError();
295            bufferCharacter('<');
296            HTML_RECONSUME_IN(DataState);
297        }
298    }
299    END_STATE()
300
301    HTML_BEGIN_STATE(EndTagOpenState) {
302        if (isASCIIUpper(cc)) {
303            m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc)));
304            m_appropriateEndTagName.clear();
305            HTML_ADVANCE_TO(TagNameState);
306        } else if (isASCIILower(cc)) {
307            m_token->beginEndTag(static_cast<LChar>(cc));
308            m_appropriateEndTagName.clear();
309            HTML_ADVANCE_TO(TagNameState);
310        } else if (cc == '>') {
311            parseError();
312            HTML_ADVANCE_TO(DataState);
313        } else if (cc == kEndOfFileMarker) {
314            parseError();
315            bufferCharacter('<');
316            bufferCharacter('/');
317            HTML_RECONSUME_IN(DataState);
318        } else {
319            parseError();
320            HTML_RECONSUME_IN(BogusCommentState);
321        }
322    }
323    END_STATE()
324
325    HTML_BEGIN_STATE(TagNameState) {
326        if (isTokenizerWhitespace(cc))
327            HTML_ADVANCE_TO(BeforeAttributeNameState);
328        else if (cc == '/')
329            HTML_ADVANCE_TO(SelfClosingStartTagState);
330        else if (cc == '>')
331            return emitAndResumeIn(source, HTMLTokenizer::DataState);
332        else if (m_options.usePreHTML5ParserQuirks && cc == '<')
333            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
334        else if (isASCIIUpper(cc)) {
335            m_token->appendToName(toLowerCase(cc));
336            HTML_ADVANCE_TO(TagNameState);
337        } else if (cc == kEndOfFileMarker) {
338            parseError();
339            HTML_RECONSUME_IN(DataState);
340        } else {
341            m_token->appendToName(cc);
342            HTML_ADVANCE_TO(TagNameState);
343        }
344    }
345    END_STATE()
346
347    HTML_BEGIN_STATE(RCDATALessThanSignState) {
348        if (cc == '/') {
349            m_temporaryBuffer.clear();
350            ASSERT(m_bufferedEndTagName.isEmpty());
351            HTML_ADVANCE_TO(RCDATAEndTagOpenState);
352        } else {
353            bufferCharacter('<');
354            HTML_RECONSUME_IN(RCDATAState);
355        }
356    }
357    END_STATE()
358
359    HTML_BEGIN_STATE(RCDATAEndTagOpenState) {
360        if (isASCIIUpper(cc)) {
361            m_temporaryBuffer.append(static_cast<LChar>(cc));
362            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
363            HTML_ADVANCE_TO(RCDATAEndTagNameState);
364        } else if (isASCIILower(cc)) {
365            m_temporaryBuffer.append(static_cast<LChar>(cc));
366            addToPossibleEndTag(static_cast<LChar>(cc));
367            HTML_ADVANCE_TO(RCDATAEndTagNameState);
368        } else {
369            bufferCharacter('<');
370            bufferCharacter('/');
371            HTML_RECONSUME_IN(RCDATAState);
372        }
373    }
374    END_STATE()
375
376    HTML_BEGIN_STATE(RCDATAEndTagNameState) {
377        if (isASCIIUpper(cc)) {
378            m_temporaryBuffer.append(static_cast<LChar>(cc));
379            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
380            HTML_ADVANCE_TO(RCDATAEndTagNameState);
381        } else if (isASCIILower(cc)) {
382            m_temporaryBuffer.append(static_cast<LChar>(cc));
383            addToPossibleEndTag(static_cast<LChar>(cc));
384            HTML_ADVANCE_TO(RCDATAEndTagNameState);
385        } else {
386            if (isTokenizerWhitespace(cc)) {
387                if (isAppropriateEndTag()) {
388                    m_temporaryBuffer.append(static_cast<LChar>(cc));
389                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
390                }
391            } else if (cc == '/') {
392                if (isAppropriateEndTag()) {
393                    m_temporaryBuffer.append(static_cast<LChar>(cc));
394                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
395                }
396            } else if (cc == '>') {
397                if (isAppropriateEndTag()) {
398                    m_temporaryBuffer.append(static_cast<LChar>(cc));
399                    return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
400                }
401            }
402            bufferCharacter('<');
403            bufferCharacter('/');
404            m_token->appendToCharacter(m_temporaryBuffer);
405            m_bufferedEndTagName.clear();
406            m_temporaryBuffer.clear();
407            HTML_RECONSUME_IN(RCDATAState);
408        }
409    }
410    END_STATE()
411
412    HTML_BEGIN_STATE(RAWTEXTLessThanSignState) {
413        if (cc == '/') {
414            m_temporaryBuffer.clear();
415            ASSERT(m_bufferedEndTagName.isEmpty());
416            HTML_ADVANCE_TO(RAWTEXTEndTagOpenState);
417        } else {
418            bufferCharacter('<');
419            HTML_RECONSUME_IN(RAWTEXTState);
420        }
421    }
422    END_STATE()
423
424    HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) {
425        if (isASCIIUpper(cc)) {
426            m_temporaryBuffer.append(static_cast<LChar>(cc));
427            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
428            HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
429        } else if (isASCIILower(cc)) {
430            m_temporaryBuffer.append(static_cast<LChar>(cc));
431            addToPossibleEndTag(static_cast<LChar>(cc));
432            HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
433        } else {
434            bufferCharacter('<');
435            bufferCharacter('/');
436            HTML_RECONSUME_IN(RAWTEXTState);
437        }
438    }
439    END_STATE()
440
441    HTML_BEGIN_STATE(RAWTEXTEndTagNameState) {
442        if (isASCIIUpper(cc)) {
443            m_temporaryBuffer.append(static_cast<LChar>(cc));
444            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
445            HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
446        } else if (isASCIILower(cc)) {
447            m_temporaryBuffer.append(static_cast<LChar>(cc));
448            addToPossibleEndTag(static_cast<LChar>(cc));
449            HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
450        } else {
451            if (isTokenizerWhitespace(cc)) {
452                if (isAppropriateEndTag()) {
453                    m_temporaryBuffer.append(static_cast<LChar>(cc));
454                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
455                }
456            } else if (cc == '/') {
457                if (isAppropriateEndTag()) {
458                    m_temporaryBuffer.append(static_cast<LChar>(cc));
459                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
460                }
461            } else if (cc == '>') {
462                if (isAppropriateEndTag()) {
463                    m_temporaryBuffer.append(static_cast<LChar>(cc));
464                    return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
465                }
466            }
467            bufferCharacter('<');
468            bufferCharacter('/');
469            m_token->appendToCharacter(m_temporaryBuffer);
470            m_bufferedEndTagName.clear();
471            m_temporaryBuffer.clear();
472            HTML_RECONSUME_IN(RAWTEXTState);
473        }
474    }
475    END_STATE()
476
477    HTML_BEGIN_STATE(ScriptDataLessThanSignState) {
478        if (cc == '/') {
479            m_temporaryBuffer.clear();
480            ASSERT(m_bufferedEndTagName.isEmpty());
481            HTML_ADVANCE_TO(ScriptDataEndTagOpenState);
482        } else if (cc == '!') {
483            bufferCharacter('<');
484            bufferCharacter('!');
485            HTML_ADVANCE_TO(ScriptDataEscapeStartState);
486        } else {
487            bufferCharacter('<');
488            HTML_RECONSUME_IN(ScriptDataState);
489        }
490    }
491    END_STATE()
492
493    HTML_BEGIN_STATE(ScriptDataEndTagOpenState) {
494        if (isASCIIUpper(cc)) {
495            m_temporaryBuffer.append(static_cast<LChar>(cc));
496            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
497            HTML_ADVANCE_TO(ScriptDataEndTagNameState);
498        } else if (isASCIILower(cc)) {
499            m_temporaryBuffer.append(static_cast<LChar>(cc));
500            addToPossibleEndTag(static_cast<LChar>(cc));
501            HTML_ADVANCE_TO(ScriptDataEndTagNameState);
502        } else {
503            bufferCharacter('<');
504            bufferCharacter('/');
505            HTML_RECONSUME_IN(ScriptDataState);
506        }
507    }
508    END_STATE()
509
510    HTML_BEGIN_STATE(ScriptDataEndTagNameState) {
511        if (isASCIIUpper(cc)) {
512            m_temporaryBuffer.append(static_cast<LChar>(cc));
513            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
514            HTML_ADVANCE_TO(ScriptDataEndTagNameState);
515        } else if (isASCIILower(cc)) {
516            m_temporaryBuffer.append(static_cast<LChar>(cc));
517            addToPossibleEndTag(static_cast<LChar>(cc));
518            HTML_ADVANCE_TO(ScriptDataEndTagNameState);
519        } else {
520            if (isTokenizerWhitespace(cc)) {
521                if (isAppropriateEndTag()) {
522                    m_temporaryBuffer.append(static_cast<LChar>(cc));
523                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
524                }
525            } else if (cc == '/') {
526                if (isAppropriateEndTag()) {
527                    m_temporaryBuffer.append(static_cast<LChar>(cc));
528                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
529                }
530            } else if (cc == '>') {
531                if (isAppropriateEndTag()) {
532                    m_temporaryBuffer.append(static_cast<LChar>(cc));
533                    return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
534                }
535            }
536            bufferCharacter('<');
537            bufferCharacter('/');
538            m_token->appendToCharacter(m_temporaryBuffer);
539            m_bufferedEndTagName.clear();
540            m_temporaryBuffer.clear();
541            HTML_RECONSUME_IN(ScriptDataState);
542        }
543    }
544    END_STATE()
545
546    HTML_BEGIN_STATE(ScriptDataEscapeStartState) {
547        if (cc == '-') {
548            bufferCharacter(cc);
549            HTML_ADVANCE_TO(ScriptDataEscapeStartDashState);
550        } else
551            HTML_RECONSUME_IN(ScriptDataState);
552    }
553    END_STATE()
554
555    HTML_BEGIN_STATE(ScriptDataEscapeStartDashState) {
556        if (cc == '-') {
557            bufferCharacter(cc);
558            HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
559        } else
560            HTML_RECONSUME_IN(ScriptDataState);
561    }
562    END_STATE()
563
564    HTML_BEGIN_STATE(ScriptDataEscapedState) {
565        if (cc == '-') {
566            bufferCharacter(cc);
567            HTML_ADVANCE_TO(ScriptDataEscapedDashState);
568        } else if (cc == '<')
569            HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
570        else if (cc == kEndOfFileMarker) {
571            parseError();
572            HTML_RECONSUME_IN(DataState);
573        } else {
574            bufferCharacter(cc);
575            HTML_ADVANCE_TO(ScriptDataEscapedState);
576        }
577    }
578    END_STATE()
579
580    HTML_BEGIN_STATE(ScriptDataEscapedDashState) {
581        if (cc == '-') {
582            bufferCharacter(cc);
583            HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
584        } else if (cc == '<')
585            HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
586        else if (cc == kEndOfFileMarker) {
587            parseError();
588            HTML_RECONSUME_IN(DataState);
589        } else {
590            bufferCharacter(cc);
591            HTML_ADVANCE_TO(ScriptDataEscapedState);
592        }
593    }
594    END_STATE()
595
596    HTML_BEGIN_STATE(ScriptDataEscapedDashDashState) {
597        if (cc == '-') {
598            bufferCharacter(cc);
599            HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
600        } else if (cc == '<')
601            HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
602        else if (cc == '>') {
603            bufferCharacter(cc);
604            HTML_ADVANCE_TO(ScriptDataState);
605        } else if (cc == kEndOfFileMarker) {
606            parseError();
607            HTML_RECONSUME_IN(DataState);
608        } else {
609            bufferCharacter(cc);
610            HTML_ADVANCE_TO(ScriptDataEscapedState);
611        }
612    }
613    END_STATE()
614
615    HTML_BEGIN_STATE(ScriptDataEscapedLessThanSignState) {
616        if (cc == '/') {
617            m_temporaryBuffer.clear();
618            ASSERT(m_bufferedEndTagName.isEmpty());
619            HTML_ADVANCE_TO(ScriptDataEscapedEndTagOpenState);
620        } else if (isASCIIUpper(cc)) {
621            bufferCharacter('<');
622            bufferCharacter(cc);
623            m_temporaryBuffer.clear();
624            m_temporaryBuffer.append(toLowerCase(cc));
625            HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
626        } else if (isASCIILower(cc)) {
627            bufferCharacter('<');
628            bufferCharacter(cc);
629            m_temporaryBuffer.clear();
630            m_temporaryBuffer.append(static_cast<LChar>(cc));
631            HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
632        } else {
633            bufferCharacter('<');
634            HTML_RECONSUME_IN(ScriptDataEscapedState);
635        }
636    }
637    END_STATE()
638
639    HTML_BEGIN_STATE(ScriptDataEscapedEndTagOpenState) {
640        if (isASCIIUpper(cc)) {
641            m_temporaryBuffer.append(static_cast<LChar>(cc));
642            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
643            HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
644        } else if (isASCIILower(cc)) {
645            m_temporaryBuffer.append(static_cast<LChar>(cc));
646            addToPossibleEndTag(static_cast<LChar>(cc));
647            HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
648        } else {
649            bufferCharacter('<');
650            bufferCharacter('/');
651            HTML_RECONSUME_IN(ScriptDataEscapedState);
652        }
653    }
654    END_STATE()
655
656    HTML_BEGIN_STATE(ScriptDataEscapedEndTagNameState) {
657        if (isASCIIUpper(cc)) {
658            m_temporaryBuffer.append(static_cast<LChar>(cc));
659            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
660            HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
661        } else if (isASCIILower(cc)) {
662            m_temporaryBuffer.append(static_cast<LChar>(cc));
663            addToPossibleEndTag(static_cast<LChar>(cc));
664            HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
665        } else {
666            if (isTokenizerWhitespace(cc)) {
667                if (isAppropriateEndTag()) {
668                    m_temporaryBuffer.append(static_cast<LChar>(cc));
669                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
670                }
671            } else if (cc == '/') {
672                if (isAppropriateEndTag()) {
673                    m_temporaryBuffer.append(static_cast<LChar>(cc));
674                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
675                }
676            } else if (cc == '>') {
677                if (isAppropriateEndTag()) {
678                    m_temporaryBuffer.append(static_cast<LChar>(cc));
679                    return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
680                }
681            }
682            bufferCharacter('<');
683            bufferCharacter('/');
684            m_token->appendToCharacter(m_temporaryBuffer);
685            m_bufferedEndTagName.clear();
686            m_temporaryBuffer.clear();
687            HTML_RECONSUME_IN(ScriptDataEscapedState);
688        }
689    }
690    END_STATE()
691
692    HTML_BEGIN_STATE(ScriptDataDoubleEscapeStartState) {
693        if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
694            bufferCharacter(cc);
695            if (temporaryBufferIs(scriptTag.localName()))
696                HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
697            else
698                HTML_ADVANCE_TO(ScriptDataEscapedState);
699        } else if (isASCIIUpper(cc)) {
700            bufferCharacter(cc);
701            m_temporaryBuffer.append(toLowerCase(cc));
702            HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
703        } else if (isASCIILower(cc)) {
704            bufferCharacter(cc);
705            m_temporaryBuffer.append(static_cast<LChar>(cc));
706            HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
707        } else
708            HTML_RECONSUME_IN(ScriptDataEscapedState);
709    }
710    END_STATE()
711
712    HTML_BEGIN_STATE(ScriptDataDoubleEscapedState) {
713        if (cc == '-') {
714            bufferCharacter(cc);
715            HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashState);
716        } else if (cc == '<') {
717            bufferCharacter(cc);
718            HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
719        } else if (cc == kEndOfFileMarker) {
720            parseError();
721            HTML_RECONSUME_IN(DataState);
722        } else {
723            bufferCharacter(cc);
724            HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
725        }
726    }
727    END_STATE()
728
729    HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashState) {
730        if (cc == '-') {
731            bufferCharacter(cc);
732            HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
733        } else if (cc == '<') {
734            bufferCharacter(cc);
735            HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
736        } else if (cc == kEndOfFileMarker) {
737            parseError();
738            HTML_RECONSUME_IN(DataState);
739        } else {
740            bufferCharacter(cc);
741            HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
742        }
743    }
744    END_STATE()
745
746    HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) {
747        if (cc == '-') {
748            bufferCharacter(cc);
749            HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
750        } else if (cc == '<') {
751            bufferCharacter(cc);
752            HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
753        } else if (cc == '>') {
754            bufferCharacter(cc);
755            HTML_ADVANCE_TO(ScriptDataState);
756        } else if (cc == kEndOfFileMarker) {
757            parseError();
758            HTML_RECONSUME_IN(DataState);
759        } else {
760            bufferCharacter(cc);
761            HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
762        }
763    }
764    END_STATE()
765
766    HTML_BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) {
767        if (cc == '/') {
768            bufferCharacter(cc);
769            m_temporaryBuffer.clear();
770            HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
771        } else
772            HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
773    }
774    END_STATE()
775
776    HTML_BEGIN_STATE(ScriptDataDoubleEscapeEndState) {
777        if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
778            bufferCharacter(cc);
779            if (temporaryBufferIs(scriptTag.localName()))
780                HTML_ADVANCE_TO(ScriptDataEscapedState);
781            else
782                HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
783        } else if (isASCIIUpper(cc)) {
784            bufferCharacter(cc);
785            m_temporaryBuffer.append(toLowerCase(cc));
786            HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
787        } else if (isASCIILower(cc)) {
788            bufferCharacter(cc);
789            m_temporaryBuffer.append(static_cast<LChar>(cc));
790            HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
791        } else
792            HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
793    }
794    END_STATE()
795
796    HTML_BEGIN_STATE(BeforeAttributeNameState) {
797        if (isTokenizerWhitespace(cc))
798            HTML_ADVANCE_TO(BeforeAttributeNameState);
799        else if (cc == '/')
800            HTML_ADVANCE_TO(SelfClosingStartTagState);
801        else if (cc == '>')
802            return emitAndResumeIn(source, HTMLTokenizer::DataState);
803        else if (m_options.usePreHTML5ParserQuirks && cc == '<')
804            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
805        else if (isASCIIUpper(cc)) {
806            m_token->addNewAttribute();
807            m_token->beginAttributeName(source.numberOfCharactersConsumed());
808            m_token->appendToAttributeName(toLowerCase(cc));
809            HTML_ADVANCE_TO(AttributeNameState);
810        } else if (cc == kEndOfFileMarker) {
811            parseError();
812            HTML_RECONSUME_IN(DataState);
813        } else {
814            if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
815                parseError();
816            m_token->addNewAttribute();
817            m_token->beginAttributeName(source.numberOfCharactersConsumed());
818            m_token->appendToAttributeName(cc);
819            HTML_ADVANCE_TO(AttributeNameState);
820        }
821    }
822    END_STATE()
823
824    HTML_BEGIN_STATE(AttributeNameState) {
825        if (isTokenizerWhitespace(cc)) {
826            m_token->endAttributeName(source.numberOfCharactersConsumed());
827            HTML_ADVANCE_TO(AfterAttributeNameState);
828        } else if (cc == '/') {
829            m_token->endAttributeName(source.numberOfCharactersConsumed());
830            HTML_ADVANCE_TO(SelfClosingStartTagState);
831        } else if (cc == '=') {
832            m_token->endAttributeName(source.numberOfCharactersConsumed());
833            HTML_ADVANCE_TO(BeforeAttributeValueState);
834        } else if (cc == '>') {
835            m_token->endAttributeName(source.numberOfCharactersConsumed());
836            return emitAndResumeIn(source, HTMLTokenizer::DataState);
837        } else if (m_options.usePreHTML5ParserQuirks && cc == '<') {
838            m_token->endAttributeName(source.numberOfCharactersConsumed());
839            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
840        } else if (isASCIIUpper(cc)) {
841            m_token->appendToAttributeName(toLowerCase(cc));
842            HTML_ADVANCE_TO(AttributeNameState);
843        } else if (cc == kEndOfFileMarker) {
844            parseError();
845            m_token->endAttributeName(source.numberOfCharactersConsumed());
846            HTML_RECONSUME_IN(DataState);
847        } else {
848            if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
849                parseError();
850            m_token->appendToAttributeName(cc);
851            HTML_ADVANCE_TO(AttributeNameState);
852        }
853    }
854    END_STATE()
855
856    HTML_BEGIN_STATE(AfterAttributeNameState) {
857        if (isTokenizerWhitespace(cc))
858            HTML_ADVANCE_TO(AfterAttributeNameState);
859        else if (cc == '/')
860            HTML_ADVANCE_TO(SelfClosingStartTagState);
861        else if (cc == '=')
862            HTML_ADVANCE_TO(BeforeAttributeValueState);
863        else if (cc == '>')
864            return emitAndResumeIn(source, HTMLTokenizer::DataState);
865        else if (m_options.usePreHTML5ParserQuirks && cc == '<')
866            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
867        else if (isASCIIUpper(cc)) {
868            m_token->addNewAttribute();
869            m_token->beginAttributeName(source.numberOfCharactersConsumed());
870            m_token->appendToAttributeName(toLowerCase(cc));
871            HTML_ADVANCE_TO(AttributeNameState);
872        } else if (cc == kEndOfFileMarker) {
873            parseError();
874            HTML_RECONSUME_IN(DataState);
875        } else {
876            if (cc == '"' || cc == '\'' || cc == '<')
877                parseError();
878            m_token->addNewAttribute();
879            m_token->beginAttributeName(source.numberOfCharactersConsumed());
880            m_token->appendToAttributeName(cc);
881            HTML_ADVANCE_TO(AttributeNameState);
882        }
883    }
884    END_STATE()
885
886    HTML_BEGIN_STATE(BeforeAttributeValueState) {
887        if (isTokenizerWhitespace(cc))
888            HTML_ADVANCE_TO(BeforeAttributeValueState);
889        else if (cc == '"') {
890            m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
891            HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
892        } else if (cc == '&') {
893            m_token->beginAttributeValue(source.numberOfCharactersConsumed());
894            HTML_RECONSUME_IN(AttributeValueUnquotedState);
895        } else if (cc == '\'') {
896            m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
897            HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
898        } else if (cc == '>') {
899            parseError();
900            return emitAndResumeIn(source, HTMLTokenizer::DataState);
901        } else if (cc == kEndOfFileMarker) {
902            parseError();
903            HTML_RECONSUME_IN(DataState);
904        } else {
905            if (cc == '<' || cc == '=' || cc == '`')
906                parseError();
907            m_token->beginAttributeValue(source.numberOfCharactersConsumed());
908            m_token->appendToAttributeValue(cc);
909            HTML_ADVANCE_TO(AttributeValueUnquotedState);
910        }
911    }
912    END_STATE()
913
914    HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) {
915        if (cc == '"') {
916            m_token->endAttributeValue(source.numberOfCharactersConsumed());
917            HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
918        } else if (cc == '&') {
919            m_additionalAllowedCharacter = '"';
920            HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
921        } else if (cc == kEndOfFileMarker) {
922            parseError();
923            m_token->endAttributeValue(source.numberOfCharactersConsumed());
924            HTML_RECONSUME_IN(DataState);
925        } else {
926            m_token->appendToAttributeValue(cc);
927            HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
928        }
929    }
930    END_STATE()
931
932    HTML_BEGIN_STATE(AttributeValueSingleQuotedState) {
933        if (cc == '\'') {
934            m_token->endAttributeValue(source.numberOfCharactersConsumed());
935            HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
936        } else if (cc == '&') {
937            m_additionalAllowedCharacter = '\'';
938            HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
939        } else if (cc == kEndOfFileMarker) {
940            parseError();
941            m_token->endAttributeValue(source.numberOfCharactersConsumed());
942            HTML_RECONSUME_IN(DataState);
943        } else {
944            m_token->appendToAttributeValue(cc);
945            HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
946        }
947    }
948    END_STATE()
949
950    HTML_BEGIN_STATE(AttributeValueUnquotedState) {
951        if (isTokenizerWhitespace(cc)) {
952            m_token->endAttributeValue(source.numberOfCharactersConsumed());
953            HTML_ADVANCE_TO(BeforeAttributeNameState);
954        } else if (cc == '&') {
955            m_additionalAllowedCharacter = '>';
956            HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
957        } else if (cc == '>') {
958            m_token->endAttributeValue(source.numberOfCharactersConsumed());
959            return emitAndResumeIn(source, HTMLTokenizer::DataState);
960        } else if (cc == kEndOfFileMarker) {
961            parseError();
962            m_token->endAttributeValue(source.numberOfCharactersConsumed());
963            HTML_RECONSUME_IN(DataState);
964        } else {
965            if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
966                parseError();
967            m_token->appendToAttributeValue(cc);
968            HTML_ADVANCE_TO(AttributeValueUnquotedState);
969        }
970    }
971    END_STATE()
972
973    HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
974        bool notEnoughCharacters = false;
975        StringBuilder decodedEntity;
976        bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
977        if (notEnoughCharacters)
978            return haveBufferedCharacterToken();
979        if (!success) {
980            ASSERT(decodedEntity.isEmpty());
981            m_token->appendToAttributeValue('&');
982        } else {
983            for (unsigned i = 0; i < decodedEntity.length(); ++i)
984                m_token->appendToAttributeValue(decodedEntity[i]);
985        }
986        // We're supposed to switch back to the attribute value state that
987        // we were in when we were switched into this state. Rather than
        // keeping track of this explicitly, we observe that the previous
989        // state can be determined by m_additionalAllowedCharacter.
990        if (m_additionalAllowedCharacter == '"')
991            HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
992        else if (m_additionalAllowedCharacter == '\'')
993            HTML_SWITCH_TO(AttributeValueSingleQuotedState);
994        else if (m_additionalAllowedCharacter == '>')
995            HTML_SWITCH_TO(AttributeValueUnquotedState);
996        else
997            ASSERT_NOT_REACHED();
998    }
999    END_STATE()
1000
1001    HTML_BEGIN_STATE(AfterAttributeValueQuotedState) {
1002        if (isTokenizerWhitespace(cc))
1003            HTML_ADVANCE_TO(BeforeAttributeNameState);
1004        else if (cc == '/')
1005            HTML_ADVANCE_TO(SelfClosingStartTagState);
1006        else if (cc == '>')
1007            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1008        else if (m_options.usePreHTML5ParserQuirks && cc == '<')
1009            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1010        else if (cc == kEndOfFileMarker) {
1011            parseError();
1012            HTML_RECONSUME_IN(DataState);
1013        } else {
1014            parseError();
1015            HTML_RECONSUME_IN(BeforeAttributeNameState);
1016        }
1017    }
1018    END_STATE()
1019
1020    HTML_BEGIN_STATE(SelfClosingStartTagState) {
1021        if (cc == '>') {
1022            m_token->setSelfClosing();
1023            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1024        } else if (cc == kEndOfFileMarker) {
1025            parseError();
1026            HTML_RECONSUME_IN(DataState);
1027        } else {
1028            parseError();
1029            HTML_RECONSUME_IN(BeforeAttributeNameState);
1030        }
1031    }
1032    END_STATE()
1033
1034    HTML_BEGIN_STATE(BogusCommentState) {
1035        m_token->beginComment();
1036        HTML_RECONSUME_IN(ContinueBogusCommentState);
1037    }
1038    END_STATE()
1039
1040    HTML_BEGIN_STATE(ContinueBogusCommentState) {
1041        if (cc == '>')
1042            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1043        else if (cc == kEndOfFileMarker)
1044            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1045        else {
1046            m_token->appendToComment(cc);
1047            HTML_ADVANCE_TO(ContinueBogusCommentState);
1048        }
1049    }
1050    END_STATE()
1051
1052    HTML_BEGIN_STATE(MarkupDeclarationOpenState) {
1053        DEPRECATED_DEFINE_STATIC_LOCAL(String, dashDashString, (ASCIILiteral("--")));
1054        DEPRECATED_DEFINE_STATIC_LOCAL(String, doctypeString, (ASCIILiteral("doctype")));
1055        DEPRECATED_DEFINE_STATIC_LOCAL(String, cdataString, (ASCIILiteral("[CDATA[")));
1056        if (cc == '-') {
1057            SegmentedString::LookAheadResult result = source.lookAhead(dashDashString);
1058            if (result == SegmentedString::DidMatch) {
1059                source.advanceAndASSERT('-');
1060                source.advanceAndASSERT('-');
1061                m_token->beginComment();
1062                HTML_SWITCH_TO(CommentStartState);
1063            } else if (result == SegmentedString::NotEnoughCharacters)
1064                return haveBufferedCharacterToken();
1065        } else if (cc == 'D' || cc == 'd') {
1066            SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString);
1067            if (result == SegmentedString::DidMatch) {
1068                advanceStringAndASSERTIgnoringCase(source, "doctype");
1069                HTML_SWITCH_TO(DOCTYPEState);
1070            } else if (result == SegmentedString::NotEnoughCharacters)
1071                return haveBufferedCharacterToken();
1072        } else if (cc == '[' && shouldAllowCDATA()) {
1073            SegmentedString::LookAheadResult result = source.lookAhead(cdataString);
1074            if (result == SegmentedString::DidMatch) {
1075                advanceStringAndASSERT(source, "[CDATA[");
1076                HTML_SWITCH_TO(CDATASectionState);
1077            } else if (result == SegmentedString::NotEnoughCharacters)
1078                return haveBufferedCharacterToken();
1079        }
1080        parseError();
1081        HTML_RECONSUME_IN(BogusCommentState);
1082    }
1083    END_STATE()
1084
1085    HTML_BEGIN_STATE(CommentStartState) {
1086        if (cc == '-')
1087            HTML_ADVANCE_TO(CommentStartDashState);
1088        else if (cc == '>') {
1089            parseError();
1090            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1091        } else if (cc == kEndOfFileMarker) {
1092            parseError();
1093            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1094        } else {
1095            m_token->appendToComment(cc);
1096            HTML_ADVANCE_TO(CommentState);
1097        }
1098    }
1099    END_STATE()
1100
1101    HTML_BEGIN_STATE(CommentStartDashState) {
1102        if (cc == '-')
1103            HTML_ADVANCE_TO(CommentEndState);
1104        else if (cc == '>') {
1105            parseError();
1106            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1107        } else if (cc == kEndOfFileMarker) {
1108            parseError();
1109            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1110        } else {
1111            m_token->appendToComment('-');
1112            m_token->appendToComment(cc);
1113            HTML_ADVANCE_TO(CommentState);
1114        }
1115    }
1116    END_STATE()
1117
1118    HTML_BEGIN_STATE(CommentState) {
1119        if (cc == '-')
1120            HTML_ADVANCE_TO(CommentEndDashState);
1121        else if (cc == kEndOfFileMarker) {
1122            parseError();
1123            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1124        } else {
1125            m_token->appendToComment(cc);
1126            HTML_ADVANCE_TO(CommentState);
1127        }
1128    }
1129    END_STATE()
1130
1131    HTML_BEGIN_STATE(CommentEndDashState) {
1132        if (cc == '-')
1133            HTML_ADVANCE_TO(CommentEndState);
1134        else if (cc == kEndOfFileMarker) {
1135            parseError();
1136            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1137        } else {
1138            m_token->appendToComment('-');
1139            m_token->appendToComment(cc);
1140            HTML_ADVANCE_TO(CommentState);
1141        }
1142    }
1143    END_STATE()
1144
1145    HTML_BEGIN_STATE(CommentEndState) {
1146        if (cc == '>')
1147            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1148        else if (cc == '!') {
1149            parseError();
1150            HTML_ADVANCE_TO(CommentEndBangState);
1151        } else if (cc == '-') {
1152            parseError();
1153            m_token->appendToComment('-');
1154            HTML_ADVANCE_TO(CommentEndState);
1155        } else if (cc == kEndOfFileMarker) {
1156            parseError();
1157            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1158        } else {
1159            parseError();
1160            m_token->appendToComment('-');
1161            m_token->appendToComment('-');
1162            m_token->appendToComment(cc);
1163            HTML_ADVANCE_TO(CommentState);
1164        }
1165    }
1166    END_STATE()
1167
1168    HTML_BEGIN_STATE(CommentEndBangState) {
1169        if (cc == '-') {
1170            m_token->appendToComment('-');
1171            m_token->appendToComment('-');
1172            m_token->appendToComment('!');
1173            HTML_ADVANCE_TO(CommentEndDashState);
1174        } else if (cc == '>')
1175            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1176        else if (cc == kEndOfFileMarker) {
1177            parseError();
1178            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1179        } else {
1180            m_token->appendToComment('-');
1181            m_token->appendToComment('-');
1182            m_token->appendToComment('!');
1183            m_token->appendToComment(cc);
1184            HTML_ADVANCE_TO(CommentState);
1185        }
1186    }
1187    END_STATE()
1188
1189    HTML_BEGIN_STATE(DOCTYPEState) {
1190        if (isTokenizerWhitespace(cc))
1191            HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1192        else if (cc == kEndOfFileMarker) {
1193            parseError();
1194            m_token->beginDOCTYPE();
1195            m_token->setForceQuirks();
1196            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1197        } else {
1198            parseError();
1199            HTML_RECONSUME_IN(BeforeDOCTYPENameState);
1200        }
1201    }
1202    END_STATE()
1203
1204    HTML_BEGIN_STATE(BeforeDOCTYPENameState) {
1205        if (isTokenizerWhitespace(cc))
1206            HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1207        else if (isASCIIUpper(cc)) {
1208            m_token->beginDOCTYPE(toLowerCase(cc));
1209            HTML_ADVANCE_TO(DOCTYPENameState);
1210        } else if (cc == '>') {
1211            parseError();
1212            m_token->beginDOCTYPE();
1213            m_token->setForceQuirks();
1214            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1215        } else if (cc == kEndOfFileMarker) {
1216            parseError();
1217            m_token->beginDOCTYPE();
1218            m_token->setForceQuirks();
1219            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1220        } else {
1221            m_token->beginDOCTYPE(cc);
1222            HTML_ADVANCE_TO(DOCTYPENameState);
1223        }
1224    }
1225    END_STATE()
1226
1227    HTML_BEGIN_STATE(DOCTYPENameState) {
1228        if (isTokenizerWhitespace(cc))
1229            HTML_ADVANCE_TO(AfterDOCTYPENameState);
1230        else if (cc == '>')
1231            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1232        else if (isASCIIUpper(cc)) {
1233            m_token->appendToName(toLowerCase(cc));
1234            HTML_ADVANCE_TO(DOCTYPENameState);
1235        } else if (cc == kEndOfFileMarker) {
1236            parseError();
1237            m_token->setForceQuirks();
1238            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1239        } else {
1240            m_token->appendToName(cc);
1241            HTML_ADVANCE_TO(DOCTYPENameState);
1242        }
1243    }
1244    END_STATE()
1245
1246    HTML_BEGIN_STATE(AfterDOCTYPENameState) {
1247        if (isTokenizerWhitespace(cc))
1248            HTML_ADVANCE_TO(AfterDOCTYPENameState);
1249        if (cc == '>')
1250            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1251        else if (cc == kEndOfFileMarker) {
1252            parseError();
1253            m_token->setForceQuirks();
1254            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1255        } else {
1256            DEPRECATED_DEFINE_STATIC_LOCAL(String, publicString, (ASCIILiteral("public")));
1257            DEPRECATED_DEFINE_STATIC_LOCAL(String, systemString, (ASCIILiteral("system")));
1258            if (cc == 'P' || cc == 'p') {
1259                SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString);
1260                if (result == SegmentedString::DidMatch) {
1261                    advanceStringAndASSERTIgnoringCase(source, "public");
1262                    HTML_SWITCH_TO(AfterDOCTYPEPublicKeywordState);
1263                } else if (result == SegmentedString::NotEnoughCharacters)
1264                    return haveBufferedCharacterToken();
1265            } else if (cc == 'S' || cc == 's') {
1266                SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString);
1267                if (result == SegmentedString::DidMatch) {
1268                    advanceStringAndASSERTIgnoringCase(source, "system");
1269                    HTML_SWITCH_TO(AfterDOCTYPESystemKeywordState);
1270                } else if (result == SegmentedString::NotEnoughCharacters)
1271                    return haveBufferedCharacterToken();
1272            }
1273            parseError();
1274            m_token->setForceQuirks();
1275            HTML_ADVANCE_TO(BogusDOCTYPEState);
1276        }
1277    }
1278    END_STATE()
1279
1280    HTML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState) {
1281        if (isTokenizerWhitespace(cc))
1282            HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1283        else if (cc == '"') {
1284            parseError();
1285            m_token->setPublicIdentifierToEmptyString();
1286            HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1287        } else if (cc == '\'') {
1288            parseError();
1289            m_token->setPublicIdentifierToEmptyString();
1290            HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1291        } else if (cc == '>') {
1292            parseError();
1293            m_token->setForceQuirks();
1294            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1295        } else if (cc == kEndOfFileMarker) {
1296            parseError();
1297            m_token->setForceQuirks();
1298            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1299        } else {
1300            parseError();
1301            m_token->setForceQuirks();
1302            HTML_ADVANCE_TO(BogusDOCTYPEState);
1303        }
1304    }
1305    END_STATE()
1306
1307    HTML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) {
1308        if (isTokenizerWhitespace(cc))
1309            HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1310        else if (cc == '"') {
1311            m_token->setPublicIdentifierToEmptyString();
1312            HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1313        } else if (cc == '\'') {
1314            m_token->setPublicIdentifierToEmptyString();
1315            HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1316        } else if (cc == '>') {
1317            parseError();
1318            m_token->setForceQuirks();
1319            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1320        } else if (cc == kEndOfFileMarker) {
1321            parseError();
1322            m_token->setForceQuirks();
1323            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1324        } else {
1325            parseError();
1326            m_token->setForceQuirks();
1327            HTML_ADVANCE_TO(BogusDOCTYPEState);
1328        }
1329    }
1330    END_STATE()
1331
1332    HTML_BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) {
1333        if (cc == '"')
1334            HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1335        else if (cc == '>') {
1336            parseError();
1337            m_token->setForceQuirks();
1338            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1339        } else if (cc == kEndOfFileMarker) {
1340            parseError();
1341            m_token->setForceQuirks();
1342            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1343        } else {
1344            m_token->appendToPublicIdentifier(cc);
1345            HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1346        }
1347    }
1348    END_STATE()
1349
1350    HTML_BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) {
1351        if (cc == '\'')
1352            HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1353        else if (cc == '>') {
1354            parseError();
1355            m_token->setForceQuirks();
1356            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1357        } else if (cc == kEndOfFileMarker) {
1358            parseError();
1359            m_token->setForceQuirks();
1360            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1361        } else {
1362            m_token->appendToPublicIdentifier(cc);
1363            HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1364        }
1365    }
1366    END_STATE()
1367
1368    HTML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) {
1369        if (isTokenizerWhitespace(cc))
1370            HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1371        else if (cc == '>')
1372            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1373        else if (cc == '"') {
1374            parseError();
1375            m_token->setSystemIdentifierToEmptyString();
1376            HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1377        } else if (cc == '\'') {
1378            parseError();
1379            m_token->setSystemIdentifierToEmptyString();
1380            HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1381        } else if (cc == kEndOfFileMarker) {
1382            parseError();
1383            m_token->setForceQuirks();
1384            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1385        } else {
1386            parseError();
1387            m_token->setForceQuirks();
1388            HTML_ADVANCE_TO(BogusDOCTYPEState);
1389        }
1390    }
1391    END_STATE()
1392
1393    HTML_BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) {
1394        if (isTokenizerWhitespace(cc))
1395            HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1396        else if (cc == '>')
1397            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1398        else if (cc == '"') {
1399            m_token->setSystemIdentifierToEmptyString();
1400            HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1401        } else if (cc == '\'') {
1402            m_token->setSystemIdentifierToEmptyString();
1403            HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1404        } else if (cc == kEndOfFileMarker) {
1405            parseError();
1406            m_token->setForceQuirks();
1407            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1408        } else {
1409            parseError();
1410            m_token->setForceQuirks();
1411            HTML_ADVANCE_TO(BogusDOCTYPEState);
1412        }
1413    }
1414    END_STATE()
1415
1416    HTML_BEGIN_STATE(AfterDOCTYPESystemKeywordState) {
1417        if (isTokenizerWhitespace(cc))
1418            HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1419        else if (cc == '"') {
1420            parseError();
1421            m_token->setSystemIdentifierToEmptyString();
1422            HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1423        } else if (cc == '\'') {
1424            parseError();
1425            m_token->setSystemIdentifierToEmptyString();
1426            HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1427        } else if (cc == '>') {
1428            parseError();
1429            m_token->setForceQuirks();
1430            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1431        } else if (cc == kEndOfFileMarker) {
1432            parseError();
1433            m_token->setForceQuirks();
1434            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1435        } else {
1436            parseError();
1437            m_token->setForceQuirks();
1438            HTML_ADVANCE_TO(BogusDOCTYPEState);
1439        }
1440    }
1441    END_STATE()
1442
1443    HTML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) {
1444        if (isTokenizerWhitespace(cc))
1445            HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1446        if (cc == '"') {
1447            m_token->setSystemIdentifierToEmptyString();
1448            HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1449        } else if (cc == '\'') {
1450            m_token->setSystemIdentifierToEmptyString();
1451            HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1452        } else if (cc == '>') {
1453            parseError();
1454            m_token->setForceQuirks();
1455            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1456        } else if (cc == kEndOfFileMarker) {
1457            parseError();
1458            m_token->setForceQuirks();
1459            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1460        } else {
1461            parseError();
1462            m_token->setForceQuirks();
1463            HTML_ADVANCE_TO(BogusDOCTYPEState);
1464        }
1465    }
1466    END_STATE()
1467
1468    HTML_BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) {
1469        if (cc == '"')
1470            HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1471        else if (cc == '>') {
1472            parseError();
1473            m_token->setForceQuirks();
1474            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1475        } else if (cc == kEndOfFileMarker) {
1476            parseError();
1477            m_token->setForceQuirks();
1478            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1479        } else {
1480            m_token->appendToSystemIdentifier(cc);
1481            HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1482        }
1483    }
1484    END_STATE()
1485
1486    HTML_BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) {
1487        if (cc == '\'')
1488            HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1489        else if (cc == '>') {
1490            parseError();
1491            m_token->setForceQuirks();
1492            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1493        } else if (cc == kEndOfFileMarker) {
1494            parseError();
1495            m_token->setForceQuirks();
1496            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1497        } else {
1498            m_token->appendToSystemIdentifier(cc);
1499            HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1500        }
1501    }
1502    END_STATE()
1503
1504    HTML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState) {
1505        if (isTokenizerWhitespace(cc))
1506            HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1507        else if (cc == '>')
1508            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1509        else if (cc == kEndOfFileMarker) {
1510            parseError();
1511            m_token->setForceQuirks();
1512            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1513        } else {
1514            parseError();
1515            HTML_ADVANCE_TO(BogusDOCTYPEState);
1516        }
1517    }
1518    END_STATE()
1519
1520    HTML_BEGIN_STATE(BogusDOCTYPEState) {
1521        if (cc == '>')
1522            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1523        else if (cc == kEndOfFileMarker)
1524            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1525        HTML_ADVANCE_TO(BogusDOCTYPEState);
1526    }
1527    END_STATE()
1528
1529    HTML_BEGIN_STATE(CDATASectionState) {
1530        if (cc == ']')
1531            HTML_ADVANCE_TO(CDATASectionRightSquareBracketState);
1532        else if (cc == kEndOfFileMarker)
1533            HTML_RECONSUME_IN(DataState);
1534        else {
1535            bufferCharacter(cc);
1536            HTML_ADVANCE_TO(CDATASectionState);
1537        }
1538    }
1539    END_STATE()
1540
1541    HTML_BEGIN_STATE(CDATASectionRightSquareBracketState) {
1542        if (cc == ']')
1543            HTML_ADVANCE_TO(CDATASectionDoubleRightSquareBracketState);
1544        else {
1545            bufferCharacter(']');
1546            HTML_RECONSUME_IN(CDATASectionState);
1547        }
1548    }
1549
1550    HTML_BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) {
1551        if (cc == '>')
1552            HTML_ADVANCE_TO(DataState);
1553        else {
1554            bufferCharacter(']');
1555            bufferCharacter(']');
1556            HTML_RECONSUME_IN(CDATASectionState);
1557        }
1558    }
1559    END_STATE()
1560
1561    }
1562
1563    ASSERT_NOT_REACHED();
1564    return false;
1565}
1566
1567String HTMLTokenizer::bufferedCharacters() const
1568{
1569    // FIXME: Add an assert about m_state.
1570    StringBuilder characters;
1571    characters.reserveCapacity(numberOfBufferedCharacters());
1572    characters.append('<');
1573    characters.append('/');
1574    characters.append(m_temporaryBuffer.data(), m_temporaryBuffer.size());
1575    return characters.toString();
1576}
1577
1578void HTMLTokenizer::updateStateFor(const AtomicString& tagName)
1579{
1580    if (tagName == textareaTag || tagName == titleTag)
1581        setState(HTMLTokenizer::RCDATAState);
1582    else if (tagName == plaintextTag)
1583        setState(HTMLTokenizer::PLAINTEXTState);
1584    else if (tagName == scriptTag)
1585        setState(HTMLTokenizer::ScriptDataState);
1586    else if (tagName == styleTag
1587        || tagName == iframeTag
1588        || tagName == xmpTag
1589        || (tagName == noembedTag && m_options.pluginsEnabled)
1590        || tagName == noframesTag
1591        || (tagName == noscriptTag && m_options.scriptEnabled))
1592        setState(HTMLTokenizer::RAWTEXTState);
1593}
1594
1595inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
1596{
1597    return vectorEqualsString(m_temporaryBuffer, expectedString);
1598}
1599
// Appends one character to m_bufferedEndTagName.
// Asserts that the current state (m_state) satisfies isEndTagBufferingState().
inline void HTMLTokenizer::addToPossibleEndTag(LChar cc)
{
    ASSERT(isEndTagBufferingState(m_state));
    m_bufferedEndTagName.append(cc);
}
1605
1606inline bool HTMLTokenizer::isAppropriateEndTag()
1607{
1608    if (m_bufferedEndTagName.size() != m_appropriateEndTagName.size())
1609        return false;
1610
1611    size_t numCharacters = m_bufferedEndTagName.size();
1612
1613    for (size_t i = 0; i < numCharacters; i++) {
1614        if (m_bufferedEndTagName[i] != m_appropriateEndTagName[i])
1615            return false;
1616    }
1617
1618    return true;
1619}
1620
// Called by the tokenizer states when they hit a parse error.
// Error reporting is not implemented: this only invokes notImplemented().
inline void HTMLTokenizer::parseError()
{
    notImplemented();
}
1625
1626}
1627