1/*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "HTMLTokenizer.h"
30
31#include "HTMLEntityParser.h"
32#include "HTMLToken.h"
33#include "HTMLTreeBuilder.h"
34#include "HTMLNames.h"
35#include "MarkupTokenizerInlines.h"
36#include "NotImplemented.h"
37#include <wtf/ASCIICType.h>
38#include <wtf/CurrentTime.h>
39#include <wtf/text/AtomicString.h>
40#include <wtf/text/CString.h>
41#include <wtf/unicode/Unicode.h>
42
43using namespace WTF;
44
45namespace WebCore {
46
47using namespace HTMLNames;
48
49// This has to go in a .cpp file, as the linker doesn't like it being included more than once.
50// We don't have an HTMLToken.cpp though, so this is the next best place.
51QualifiedName AtomicHTMLToken::nameForAttribute(const HTMLToken::Attribute& attribute) const
52{
53    return QualifiedName(nullAtom, AtomicString(attribute.name), nullAtom);
54}
55
56bool AtomicHTMLToken::usesName() const
57{
58    return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE;
59}
60
61bool AtomicHTMLToken::usesAttributes() const
62{
63    return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
64}
65
66static inline UChar toLowerCase(UChar cc)
67{
68    ASSERT(isASCIIUpper(cc));
69    const int lowerCaseOffset = 0x20;
70    return cc + lowerCaseOffset;
71}
72
73static inline bool vectorEqualsString(const Vector<LChar, 32>& vector, const String& string)
74{
75    if (vector.size() != string.length())
76        return false;
77
78    if (!string.length())
79        return true;
80
81    return equal(string.impl(), vector.data(), vector.size());
82}
83
84static inline bool isEndTagBufferingState(HTMLTokenizer::State state)
85{
86    switch (state) {
87    case HTMLTokenizer::RCDATAEndTagOpenState:
88    case HTMLTokenizer::RCDATAEndTagNameState:
89    case HTMLTokenizer::RAWTEXTEndTagOpenState:
90    case HTMLTokenizer::RAWTEXTEndTagNameState:
91    case HTMLTokenizer::ScriptDataEndTagOpenState:
92    case HTMLTokenizer::ScriptDataEndTagNameState:
93    case HTMLTokenizer::ScriptDataEscapedEndTagOpenState:
94    case HTMLTokenizer::ScriptDataEscapedEndTagNameState:
95        return true;
96    default:
97        return false;
98    }
99}
100
// Shorthands that bind the generic state-machine macros (BEGIN_STATE,
// RECONSUME_IN, ADVANCE_TO, SWITCH_TO) to the HTMLTokenizer class, so the
// state bodies only have to name the state itself.
#define HTML_BEGIN_STATE(state) BEGIN_STATE(HTMLTokenizer, state)
#define HTML_RECONSUME_IN(state) RECONSUME_IN(HTMLTokenizer, state)
#define HTML_ADVANCE_TO(state) ADVANCE_TO(HTMLTokenizer, state)
#define HTML_SWITCH_TO(state) SWITCH_TO(HTMLTokenizer, state)
105
// Constructs a tokenizer configured with |options|. The input stream
// preprocessor is handed a pointer to this tokenizer, and reset() puts the
// remaining state (current state, token pointer, flags) into its initial
// configuration.
HTMLTokenizer::HTMLTokenizer(const HTMLParserOptions& options)
    : m_inputStreamPreprocessor(this)
    , m_options(options)
{
    reset();
}
112
// Nothing to release explicitly; the body is intentionally empty.
HTMLTokenizer::~HTMLTokenizer()
{
}
116
117void HTMLTokenizer::reset()
118{
119    m_state = HTMLTokenizer::DataState;
120    m_token = 0;
121    m_forceNullCharacterReplacement = false;
122    m_shouldAllowCDATA = false;
123    m_additionalAllowedCharacter = '\0';
124}
125
126#if ENABLE(THREADED_HTML_PARSER)
127
128bool HTMLTokenizer::canCreateCheckpoint() const
129{
130    if (!m_appropriateEndTagName.isEmpty())
131        return false;
132    if (!m_temporaryBuffer.isEmpty())
133        return false;
134    if (!m_bufferedEndTagName.isEmpty())
135        return false;
136    return true;
137}
138
139void HTMLTokenizer::createCheckpoint(Checkpoint& result) const
140{
141    ASSERT(canCreateCheckpoint());
142    result.options = m_options;
143    result.state = m_state;
144    result.additionalAllowedCharacter = m_additionalAllowedCharacter;
145    result.skipNextNewLine = m_inputStreamPreprocessor.skipNextNewLine();
146    result.shouldAllowCDATA = m_shouldAllowCDATA;
147}
148
149void HTMLTokenizer::restoreFromCheckpoint(const Checkpoint& checkpoint)
150{
151    m_token = 0;
152    m_options = checkpoint.options;
153    m_state = checkpoint.state;
154    m_additionalAllowedCharacter = checkpoint.additionalAllowedCharacter;
155    m_inputStreamPreprocessor.reset(checkpoint.skipNextNewLine);
156    m_shouldAllowCDATA = checkpoint.shouldAllowCDATA;
157}
158
159#endif
160
161inline bool HTMLTokenizer::processEntity(SegmentedString& source)
162{
163    bool notEnoughCharacters = false;
164    StringBuilder decodedEntity;
165    bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
166    if (notEnoughCharacters)
167        return false;
168    if (!success) {
169        ASSERT(decodedEntity.isEmpty());
170        bufferCharacter('&');
171    } else {
172        for (unsigned i = 0; i < decodedEntity.length(); ++i)
173            bufferCharacter(decodedEntity[i]);
174    }
175    return true;
176}
177
178bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
179{
180    ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized);
181    source.advanceAndUpdateLineNumber();
182    if (m_token->type() == HTMLToken::Character)
183        return true;
184    m_token->beginEndTag(m_bufferedEndTagName);
185    m_bufferedEndTagName.clear();
186    m_appropriateEndTagName.clear();
187    m_temporaryBuffer.clear();
188    return false;
189}
190
// Switches to |targetState| after flushing the buffered end tag.
//
// Must be expanded inside a function that has |source| and |cc| in scope and
// a label named after the target state. The expansion:
//   - records the new state,
//   - returns true if flushBufferedEndTag() reports true,
//   - returns haveBufferedCharacterToken() if no further input can be peeked,
//   - otherwise loads the next input character into |cc| and jumps to the
//     target state's label.
#define FLUSH_AND_ADVANCE_TO(targetState)                                  \
    do {                                                                   \
        m_state = HTMLTokenizer::targetState;                              \
        if (flushBufferedEndTag(source))                                   \
            return true;                                                   \
        if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source))   \
            return haveBufferedCharacterToken();                           \
        cc = m_inputStreamPreprocessor.nextInputCharacter();               \
        goto targetState;                                                  \
    } while (false)
202
203bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state)
204{
205    m_state = state;
206    flushBufferedEndTag(source);
207    return true;
208}
209
210bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
211{
212    // If we have a token in progress, then we're supposed to be called back
213    // with the same token so we can finish it.
214    ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized);
215    m_token = &token;
216
217    if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
218        // FIXME: This should call flushBufferedEndTag().
219        // We started an end tag during our last iteration.
220        m_token->beginEndTag(m_bufferedEndTagName);
221        m_bufferedEndTagName.clear();
222        m_appropriateEndTagName.clear();
223        m_temporaryBuffer.clear();
224        if (m_state == HTMLTokenizer::DataState) {
225            // We're back in the data state, so we must be done with the tag.
226            return true;
227        }
228    }
229
230    if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source))
231        return haveBufferedCharacterToken();
232    UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
233
234    // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
235    switch (m_state) {
236    HTML_BEGIN_STATE(DataState) {
237        if (cc == '&')
238            HTML_ADVANCE_TO(CharacterReferenceInDataState);
239        else if (cc == '<') {
240            if (m_token->type() == HTMLToken::Character) {
241                // We have a bunch of character tokens queued up that we
242                // are emitting lazily here.
243                return true;
244            }
245            HTML_ADVANCE_TO(TagOpenState);
246        } else if (cc == kEndOfFileMarker)
247            return emitEndOfFile(source);
248        else {
249            bufferCharacter(cc);
250            HTML_ADVANCE_TO(DataState);
251        }
252    }
253    END_STATE()
254
255    HTML_BEGIN_STATE(CharacterReferenceInDataState) {
256        if (!processEntity(source))
257            return haveBufferedCharacterToken();
258        HTML_SWITCH_TO(DataState);
259    }
260    END_STATE()
261
262    HTML_BEGIN_STATE(RCDATAState) {
263        if (cc == '&')
264            HTML_ADVANCE_TO(CharacterReferenceInRCDATAState);
265        else if (cc == '<')
266            HTML_ADVANCE_TO(RCDATALessThanSignState);
267        else if (cc == kEndOfFileMarker)
268            return emitEndOfFile(source);
269        else {
270            bufferCharacter(cc);
271            HTML_ADVANCE_TO(RCDATAState);
272        }
273    }
274    END_STATE()
275
276    HTML_BEGIN_STATE(CharacterReferenceInRCDATAState) {
277        if (!processEntity(source))
278            return haveBufferedCharacterToken();
279        HTML_SWITCH_TO(RCDATAState);
280    }
281    END_STATE()
282
283    HTML_BEGIN_STATE(RAWTEXTState) {
284        if (cc == '<')
285            HTML_ADVANCE_TO(RAWTEXTLessThanSignState);
286        else if (cc == kEndOfFileMarker)
287            return emitEndOfFile(source);
288        else {
289            bufferCharacter(cc);
290            HTML_ADVANCE_TO(RAWTEXTState);
291        }
292    }
293    END_STATE()
294
295    HTML_BEGIN_STATE(ScriptDataState) {
296        if (cc == '<')
297            HTML_ADVANCE_TO(ScriptDataLessThanSignState);
298        else if (cc == kEndOfFileMarker)
299            return emitEndOfFile(source);
300        else {
301            bufferCharacter(cc);
302            HTML_ADVANCE_TO(ScriptDataState);
303        }
304    }
305    END_STATE()
306
307    HTML_BEGIN_STATE(PLAINTEXTState) {
308        if (cc == kEndOfFileMarker)
309            return emitEndOfFile(source);
310        bufferCharacter(cc);
311        HTML_ADVANCE_TO(PLAINTEXTState);
312    }
313    END_STATE()
314
315    HTML_BEGIN_STATE(TagOpenState) {
316        if (cc == '!')
317            HTML_ADVANCE_TO(MarkupDeclarationOpenState);
318        else if (cc == '/')
319            HTML_ADVANCE_TO(EndTagOpenState);
320        else if (isASCIIUpper(cc)) {
321            m_token->beginStartTag(toLowerCase(cc));
322            HTML_ADVANCE_TO(TagNameState);
323        } else if (isASCIILower(cc)) {
324            m_token->beginStartTag(cc);
325            HTML_ADVANCE_TO(TagNameState);
326        } else if (cc == '?') {
327            parseError();
328            // The spec consumes the current character before switching
329            // to the bogus comment state, but it's easier to implement
330            // if we reconsume the current character.
331            HTML_RECONSUME_IN(BogusCommentState);
332        } else {
333            parseError();
334            bufferCharacter('<');
335            HTML_RECONSUME_IN(DataState);
336        }
337    }
338    END_STATE()
339
340    HTML_BEGIN_STATE(EndTagOpenState) {
341        if (isASCIIUpper(cc)) {
342            m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc)));
343            m_appropriateEndTagName.clear();
344            HTML_ADVANCE_TO(TagNameState);
345        } else if (isASCIILower(cc)) {
346            m_token->beginEndTag(static_cast<LChar>(cc));
347            m_appropriateEndTagName.clear();
348            HTML_ADVANCE_TO(TagNameState);
349        } else if (cc == '>') {
350            parseError();
351            HTML_ADVANCE_TO(DataState);
352        } else if (cc == kEndOfFileMarker) {
353            parseError();
354            bufferCharacter('<');
355            bufferCharacter('/');
356            HTML_RECONSUME_IN(DataState);
357        } else {
358            parseError();
359            HTML_RECONSUME_IN(BogusCommentState);
360        }
361    }
362    END_STATE()
363
364    HTML_BEGIN_STATE(TagNameState) {
365        if (isTokenizerWhitespace(cc))
366            HTML_ADVANCE_TO(BeforeAttributeNameState);
367        else if (cc == '/')
368            HTML_ADVANCE_TO(SelfClosingStartTagState);
369        else if (cc == '>')
370            return emitAndResumeIn(source, HTMLTokenizer::DataState);
371        else if (m_options.usePreHTML5ParserQuirks && cc == '<')
372            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
373        else if (isASCIIUpper(cc)) {
374            m_token->appendToName(toLowerCase(cc));
375            HTML_ADVANCE_TO(TagNameState);
376        } else if (cc == kEndOfFileMarker) {
377            parseError();
378            HTML_RECONSUME_IN(DataState);
379        } else {
380            m_token->appendToName(cc);
381            HTML_ADVANCE_TO(TagNameState);
382        }
383    }
384    END_STATE()
385
386    HTML_BEGIN_STATE(RCDATALessThanSignState) {
387        if (cc == '/') {
388            m_temporaryBuffer.clear();
389            ASSERT(m_bufferedEndTagName.isEmpty());
390            HTML_ADVANCE_TO(RCDATAEndTagOpenState);
391        } else {
392            bufferCharacter('<');
393            HTML_RECONSUME_IN(RCDATAState);
394        }
395    }
396    END_STATE()
397
398    HTML_BEGIN_STATE(RCDATAEndTagOpenState) {
399        if (isASCIIUpper(cc)) {
400            m_temporaryBuffer.append(static_cast<LChar>(cc));
401            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
402            HTML_ADVANCE_TO(RCDATAEndTagNameState);
403        } else if (isASCIILower(cc)) {
404            m_temporaryBuffer.append(static_cast<LChar>(cc));
405            addToPossibleEndTag(static_cast<LChar>(cc));
406            HTML_ADVANCE_TO(RCDATAEndTagNameState);
407        } else {
408            bufferCharacter('<');
409            bufferCharacter('/');
410            HTML_RECONSUME_IN(RCDATAState);
411        }
412    }
413    END_STATE()
414
415    HTML_BEGIN_STATE(RCDATAEndTagNameState) {
416        if (isASCIIUpper(cc)) {
417            m_temporaryBuffer.append(static_cast<LChar>(cc));
418            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
419            HTML_ADVANCE_TO(RCDATAEndTagNameState);
420        } else if (isASCIILower(cc)) {
421            m_temporaryBuffer.append(static_cast<LChar>(cc));
422            addToPossibleEndTag(static_cast<LChar>(cc));
423            HTML_ADVANCE_TO(RCDATAEndTagNameState);
424        } else {
425            if (isTokenizerWhitespace(cc)) {
426                if (isAppropriateEndTag()) {
427                    m_temporaryBuffer.append(static_cast<LChar>(cc));
428                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
429                }
430            } else if (cc == '/') {
431                if (isAppropriateEndTag()) {
432                    m_temporaryBuffer.append(static_cast<LChar>(cc));
433                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
434                }
435            } else if (cc == '>') {
436                if (isAppropriateEndTag()) {
437                    m_temporaryBuffer.append(static_cast<LChar>(cc));
438                    return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
439                }
440            }
441            bufferCharacter('<');
442            bufferCharacter('/');
443            m_token->appendToCharacter(m_temporaryBuffer);
444            m_bufferedEndTagName.clear();
445            m_temporaryBuffer.clear();
446            HTML_RECONSUME_IN(RCDATAState);
447        }
448    }
449    END_STATE()
450
451    HTML_BEGIN_STATE(RAWTEXTLessThanSignState) {
452        if (cc == '/') {
453            m_temporaryBuffer.clear();
454            ASSERT(m_bufferedEndTagName.isEmpty());
455            HTML_ADVANCE_TO(RAWTEXTEndTagOpenState);
456        } else {
457            bufferCharacter('<');
458            HTML_RECONSUME_IN(RAWTEXTState);
459        }
460    }
461    END_STATE()
462
463    HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) {
464        if (isASCIIUpper(cc)) {
465            m_temporaryBuffer.append(static_cast<LChar>(cc));
466            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
467            HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
468        } else if (isASCIILower(cc)) {
469            m_temporaryBuffer.append(static_cast<LChar>(cc));
470            addToPossibleEndTag(static_cast<LChar>(cc));
471            HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
472        } else {
473            bufferCharacter('<');
474            bufferCharacter('/');
475            HTML_RECONSUME_IN(RAWTEXTState);
476        }
477    }
478    END_STATE()
479
480    HTML_BEGIN_STATE(RAWTEXTEndTagNameState) {
481        if (isASCIIUpper(cc)) {
482            m_temporaryBuffer.append(static_cast<LChar>(cc));
483            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
484            HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
485        } else if (isASCIILower(cc)) {
486            m_temporaryBuffer.append(static_cast<LChar>(cc));
487            addToPossibleEndTag(static_cast<LChar>(cc));
488            HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
489        } else {
490            if (isTokenizerWhitespace(cc)) {
491                if (isAppropriateEndTag()) {
492                    m_temporaryBuffer.append(static_cast<LChar>(cc));
493                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
494                }
495            } else if (cc == '/') {
496                if (isAppropriateEndTag()) {
497                    m_temporaryBuffer.append(static_cast<LChar>(cc));
498                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
499                }
500            } else if (cc == '>') {
501                if (isAppropriateEndTag()) {
502                    m_temporaryBuffer.append(static_cast<LChar>(cc));
503                    return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
504                }
505            }
506            bufferCharacter('<');
507            bufferCharacter('/');
508            m_token->appendToCharacter(m_temporaryBuffer);
509            m_bufferedEndTagName.clear();
510            m_temporaryBuffer.clear();
511            HTML_RECONSUME_IN(RAWTEXTState);
512        }
513    }
514    END_STATE()
515
516    HTML_BEGIN_STATE(ScriptDataLessThanSignState) {
517        if (cc == '/') {
518            m_temporaryBuffer.clear();
519            ASSERT(m_bufferedEndTagName.isEmpty());
520            HTML_ADVANCE_TO(ScriptDataEndTagOpenState);
521        } else if (cc == '!') {
522            bufferCharacter('<');
523            bufferCharacter('!');
524            HTML_ADVANCE_TO(ScriptDataEscapeStartState);
525        } else {
526            bufferCharacter('<');
527            HTML_RECONSUME_IN(ScriptDataState);
528        }
529    }
530    END_STATE()
531
532    HTML_BEGIN_STATE(ScriptDataEndTagOpenState) {
533        if (isASCIIUpper(cc)) {
534            m_temporaryBuffer.append(static_cast<LChar>(cc));
535            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
536            HTML_ADVANCE_TO(ScriptDataEndTagNameState);
537        } else if (isASCIILower(cc)) {
538            m_temporaryBuffer.append(static_cast<LChar>(cc));
539            addToPossibleEndTag(static_cast<LChar>(cc));
540            HTML_ADVANCE_TO(ScriptDataEndTagNameState);
541        } else {
542            bufferCharacter('<');
543            bufferCharacter('/');
544            HTML_RECONSUME_IN(ScriptDataState);
545        }
546    }
547    END_STATE()
548
549    HTML_BEGIN_STATE(ScriptDataEndTagNameState) {
550        if (isASCIIUpper(cc)) {
551            m_temporaryBuffer.append(static_cast<LChar>(cc));
552            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
553            HTML_ADVANCE_TO(ScriptDataEndTagNameState);
554        } else if (isASCIILower(cc)) {
555            m_temporaryBuffer.append(static_cast<LChar>(cc));
556            addToPossibleEndTag(static_cast<LChar>(cc));
557            HTML_ADVANCE_TO(ScriptDataEndTagNameState);
558        } else {
559            if (isTokenizerWhitespace(cc)) {
560                if (isAppropriateEndTag()) {
561                    m_temporaryBuffer.append(static_cast<LChar>(cc));
562                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
563                }
564            } else if (cc == '/') {
565                if (isAppropriateEndTag()) {
566                    m_temporaryBuffer.append(static_cast<LChar>(cc));
567                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
568                }
569            } else if (cc == '>') {
570                if (isAppropriateEndTag()) {
571                    m_temporaryBuffer.append(static_cast<LChar>(cc));
572                    return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
573                }
574            }
575            bufferCharacter('<');
576            bufferCharacter('/');
577            m_token->appendToCharacter(m_temporaryBuffer);
578            m_bufferedEndTagName.clear();
579            m_temporaryBuffer.clear();
580            HTML_RECONSUME_IN(ScriptDataState);
581        }
582    }
583    END_STATE()
584
585    HTML_BEGIN_STATE(ScriptDataEscapeStartState) {
586        if (cc == '-') {
587            bufferCharacter(cc);
588            HTML_ADVANCE_TO(ScriptDataEscapeStartDashState);
589        } else
590            HTML_RECONSUME_IN(ScriptDataState);
591    }
592    END_STATE()
593
594    HTML_BEGIN_STATE(ScriptDataEscapeStartDashState) {
595        if (cc == '-') {
596            bufferCharacter(cc);
597            HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
598        } else
599            HTML_RECONSUME_IN(ScriptDataState);
600    }
601    END_STATE()
602
603    HTML_BEGIN_STATE(ScriptDataEscapedState) {
604        if (cc == '-') {
605            bufferCharacter(cc);
606            HTML_ADVANCE_TO(ScriptDataEscapedDashState);
607        } else if (cc == '<')
608            HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
609        else if (cc == kEndOfFileMarker) {
610            parseError();
611            HTML_RECONSUME_IN(DataState);
612        } else {
613            bufferCharacter(cc);
614            HTML_ADVANCE_TO(ScriptDataEscapedState);
615        }
616    }
617    END_STATE()
618
619    HTML_BEGIN_STATE(ScriptDataEscapedDashState) {
620        if (cc == '-') {
621            bufferCharacter(cc);
622            HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
623        } else if (cc == '<')
624            HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
625        else if (cc == kEndOfFileMarker) {
626            parseError();
627            HTML_RECONSUME_IN(DataState);
628        } else {
629            bufferCharacter(cc);
630            HTML_ADVANCE_TO(ScriptDataEscapedState);
631        }
632    }
633    END_STATE()
634
635    HTML_BEGIN_STATE(ScriptDataEscapedDashDashState) {
636        if (cc == '-') {
637            bufferCharacter(cc);
638            HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
639        } else if (cc == '<')
640            HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
641        else if (cc == '>') {
642            bufferCharacter(cc);
643            HTML_ADVANCE_TO(ScriptDataState);
644        } else if (cc == kEndOfFileMarker) {
645            parseError();
646            HTML_RECONSUME_IN(DataState);
647        } else {
648            bufferCharacter(cc);
649            HTML_ADVANCE_TO(ScriptDataEscapedState);
650        }
651    }
652    END_STATE()
653
654    HTML_BEGIN_STATE(ScriptDataEscapedLessThanSignState) {
655        if (cc == '/') {
656            m_temporaryBuffer.clear();
657            ASSERT(m_bufferedEndTagName.isEmpty());
658            HTML_ADVANCE_TO(ScriptDataEscapedEndTagOpenState);
659        } else if (isASCIIUpper(cc)) {
660            bufferCharacter('<');
661            bufferCharacter(cc);
662            m_temporaryBuffer.clear();
663            m_temporaryBuffer.append(toLowerCase(cc));
664            HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
665        } else if (isASCIILower(cc)) {
666            bufferCharacter('<');
667            bufferCharacter(cc);
668            m_temporaryBuffer.clear();
669            m_temporaryBuffer.append(static_cast<LChar>(cc));
670            HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
671        } else {
672            bufferCharacter('<');
673            HTML_RECONSUME_IN(ScriptDataEscapedState);
674        }
675    }
676    END_STATE()
677
678    HTML_BEGIN_STATE(ScriptDataEscapedEndTagOpenState) {
679        if (isASCIIUpper(cc)) {
680            m_temporaryBuffer.append(static_cast<LChar>(cc));
681            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
682            HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
683        } else if (isASCIILower(cc)) {
684            m_temporaryBuffer.append(static_cast<LChar>(cc));
685            addToPossibleEndTag(static_cast<LChar>(cc));
686            HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
687        } else {
688            bufferCharacter('<');
689            bufferCharacter('/');
690            HTML_RECONSUME_IN(ScriptDataEscapedState);
691        }
692    }
693    END_STATE()
694
695    HTML_BEGIN_STATE(ScriptDataEscapedEndTagNameState) {
696        if (isASCIIUpper(cc)) {
697            m_temporaryBuffer.append(static_cast<LChar>(cc));
698            addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
699            HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
700        } else if (isASCIILower(cc)) {
701            m_temporaryBuffer.append(static_cast<LChar>(cc));
702            addToPossibleEndTag(static_cast<LChar>(cc));
703            HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
704        } else {
705            if (isTokenizerWhitespace(cc)) {
706                if (isAppropriateEndTag()) {
707                    m_temporaryBuffer.append(static_cast<LChar>(cc));
708                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
709                }
710            } else if (cc == '/') {
711                if (isAppropriateEndTag()) {
712                    m_temporaryBuffer.append(static_cast<LChar>(cc));
713                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
714                }
715            } else if (cc == '>') {
716                if (isAppropriateEndTag()) {
717                    m_temporaryBuffer.append(static_cast<LChar>(cc));
718                    return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
719                }
720            }
721            bufferCharacter('<');
722            bufferCharacter('/');
723            m_token->appendToCharacter(m_temporaryBuffer);
724            m_bufferedEndTagName.clear();
725            m_temporaryBuffer.clear();
726            HTML_RECONSUME_IN(ScriptDataEscapedState);
727        }
728    }
729    END_STATE()
730
731    HTML_BEGIN_STATE(ScriptDataDoubleEscapeStartState) {
732        if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
733            bufferCharacter(cc);
734            if (temporaryBufferIs(scriptTag.localName()))
735                HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
736            else
737                HTML_ADVANCE_TO(ScriptDataEscapedState);
738        } else if (isASCIIUpper(cc)) {
739            bufferCharacter(cc);
740            m_temporaryBuffer.append(toLowerCase(cc));
741            HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
742        } else if (isASCIILower(cc)) {
743            bufferCharacter(cc);
744            m_temporaryBuffer.append(static_cast<LChar>(cc));
745            HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
746        } else
747            HTML_RECONSUME_IN(ScriptDataEscapedState);
748    }
749    END_STATE()
750
751    HTML_BEGIN_STATE(ScriptDataDoubleEscapedState) {
752        if (cc == '-') {
753            bufferCharacter(cc);
754            HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashState);
755        } else if (cc == '<') {
756            bufferCharacter(cc);
757            HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
758        } else if (cc == kEndOfFileMarker) {
759            parseError();
760            HTML_RECONSUME_IN(DataState);
761        } else {
762            bufferCharacter(cc);
763            HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
764        }
765    }
766    END_STATE()
767
768    HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashState) {
769        if (cc == '-') {
770            bufferCharacter(cc);
771            HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
772        } else if (cc == '<') {
773            bufferCharacter(cc);
774            HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
775        } else if (cc == kEndOfFileMarker) {
776            parseError();
777            HTML_RECONSUME_IN(DataState);
778        } else {
779            bufferCharacter(cc);
780            HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
781        }
782    }
783    END_STATE()
784
785    HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) {
786        if (cc == '-') {
787            bufferCharacter(cc);
788            HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
789        } else if (cc == '<') {
790            bufferCharacter(cc);
791            HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
792        } else if (cc == '>') {
793            bufferCharacter(cc);
794            HTML_ADVANCE_TO(ScriptDataState);
795        } else if (cc == kEndOfFileMarker) {
796            parseError();
797            HTML_RECONSUME_IN(DataState);
798        } else {
799            bufferCharacter(cc);
800            HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
801        }
802    }
803    END_STATE()
804
805    HTML_BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) {
806        if (cc == '/') {
807            bufferCharacter(cc);
808            m_temporaryBuffer.clear();
809            HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
810        } else
811            HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
812    }
813    END_STATE()
814
815    HTML_BEGIN_STATE(ScriptDataDoubleEscapeEndState) {
816        if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
817            bufferCharacter(cc);
818            if (temporaryBufferIs(scriptTag.localName()))
819                HTML_ADVANCE_TO(ScriptDataEscapedState);
820            else
821                HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
822        } else if (isASCIIUpper(cc)) {
823            bufferCharacter(cc);
824            m_temporaryBuffer.append(toLowerCase(cc));
825            HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
826        } else if (isASCIILower(cc)) {
827            bufferCharacter(cc);
828            m_temporaryBuffer.append(static_cast<LChar>(cc));
829            HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
830        } else
831            HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
832    }
833    END_STATE()
834
835    HTML_BEGIN_STATE(BeforeAttributeNameState) {
836        if (isTokenizerWhitespace(cc))
837            HTML_ADVANCE_TO(BeforeAttributeNameState);
838        else if (cc == '/')
839            HTML_ADVANCE_TO(SelfClosingStartTagState);
840        else if (cc == '>')
841            return emitAndResumeIn(source, HTMLTokenizer::DataState);
842        else if (m_options.usePreHTML5ParserQuirks && cc == '<')
843            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
844        else if (isASCIIUpper(cc)) {
845            m_token->addNewAttribute();
846            m_token->beginAttributeName(source.numberOfCharactersConsumed());
847            m_token->appendToAttributeName(toLowerCase(cc));
848            HTML_ADVANCE_TO(AttributeNameState);
849        } else if (cc == kEndOfFileMarker) {
850            parseError();
851            HTML_RECONSUME_IN(DataState);
852        } else {
853            if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
854                parseError();
855            m_token->addNewAttribute();
856            m_token->beginAttributeName(source.numberOfCharactersConsumed());
857            m_token->appendToAttributeName(cc);
858            HTML_ADVANCE_TO(AttributeNameState);
859        }
860    }
861    END_STATE()
862
863    HTML_BEGIN_STATE(AttributeNameState) {
864        if (isTokenizerWhitespace(cc)) {
865            m_token->endAttributeName(source.numberOfCharactersConsumed());
866            HTML_ADVANCE_TO(AfterAttributeNameState);
867        } else if (cc == '/') {
868            m_token->endAttributeName(source.numberOfCharactersConsumed());
869            HTML_ADVANCE_TO(SelfClosingStartTagState);
870        } else if (cc == '=') {
871            m_token->endAttributeName(source.numberOfCharactersConsumed());
872            HTML_ADVANCE_TO(BeforeAttributeValueState);
873        } else if (cc == '>') {
874            m_token->endAttributeName(source.numberOfCharactersConsumed());
875            return emitAndResumeIn(source, HTMLTokenizer::DataState);
876        } else if (m_options.usePreHTML5ParserQuirks && cc == '<') {
877            m_token->endAttributeName(source.numberOfCharactersConsumed());
878            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
879        } else if (isASCIIUpper(cc)) {
880            m_token->appendToAttributeName(toLowerCase(cc));
881            HTML_ADVANCE_TO(AttributeNameState);
882        } else if (cc == kEndOfFileMarker) {
883            parseError();
884            m_token->endAttributeName(source.numberOfCharactersConsumed());
885            HTML_RECONSUME_IN(DataState);
886        } else {
887            if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
888                parseError();
889            m_token->appendToAttributeName(cc);
890            HTML_ADVANCE_TO(AttributeNameState);
891        }
892    }
893    END_STATE()
894
895    HTML_BEGIN_STATE(AfterAttributeNameState) {
896        if (isTokenizerWhitespace(cc))
897            HTML_ADVANCE_TO(AfterAttributeNameState);
898        else if (cc == '/')
899            HTML_ADVANCE_TO(SelfClosingStartTagState);
900        else if (cc == '=')
901            HTML_ADVANCE_TO(BeforeAttributeValueState);
902        else if (cc == '>')
903            return emitAndResumeIn(source, HTMLTokenizer::DataState);
904        else if (m_options.usePreHTML5ParserQuirks && cc == '<')
905            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
906        else if (isASCIIUpper(cc)) {
907            m_token->addNewAttribute();
908            m_token->beginAttributeName(source.numberOfCharactersConsumed());
909            m_token->appendToAttributeName(toLowerCase(cc));
910            HTML_ADVANCE_TO(AttributeNameState);
911        } else if (cc == kEndOfFileMarker) {
912            parseError();
913            HTML_RECONSUME_IN(DataState);
914        } else {
915            if (cc == '"' || cc == '\'' || cc == '<')
916                parseError();
917            m_token->addNewAttribute();
918            m_token->beginAttributeName(source.numberOfCharactersConsumed());
919            m_token->appendToAttributeName(cc);
920            HTML_ADVANCE_TO(AttributeNameState);
921        }
922    }
923    END_STATE()
924
925    HTML_BEGIN_STATE(BeforeAttributeValueState) {
926        if (isTokenizerWhitespace(cc))
927            HTML_ADVANCE_TO(BeforeAttributeValueState);
928        else if (cc == '"') {
929            m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
930            HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
931        } else if (cc == '&') {
932            m_token->beginAttributeValue(source.numberOfCharactersConsumed());
933            HTML_RECONSUME_IN(AttributeValueUnquotedState);
934        } else if (cc == '\'') {
935            m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
936            HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
937        } else if (cc == '>') {
938            parseError();
939            return emitAndResumeIn(source, HTMLTokenizer::DataState);
940        } else if (cc == kEndOfFileMarker) {
941            parseError();
942            HTML_RECONSUME_IN(DataState);
943        } else {
944            if (cc == '<' || cc == '=' || cc == '`')
945                parseError();
946            m_token->beginAttributeValue(source.numberOfCharactersConsumed());
947            m_token->appendToAttributeValue(cc);
948            HTML_ADVANCE_TO(AttributeValueUnquotedState);
949        }
950    }
951    END_STATE()
952
953    HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) {
954        if (cc == '"') {
955            m_token->endAttributeValue(source.numberOfCharactersConsumed());
956            HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
957        } else if (cc == '&') {
958            m_additionalAllowedCharacter = '"';
959            HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
960        } else if (cc == kEndOfFileMarker) {
961            parseError();
962            m_token->endAttributeValue(source.numberOfCharactersConsumed());
963            HTML_RECONSUME_IN(DataState);
964        } else {
965            m_token->appendToAttributeValue(cc);
966            HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
967        }
968    }
969    END_STATE()
970
971    HTML_BEGIN_STATE(AttributeValueSingleQuotedState) {
972        if (cc == '\'') {
973            m_token->endAttributeValue(source.numberOfCharactersConsumed());
974            HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
975        } else if (cc == '&') {
976            m_additionalAllowedCharacter = '\'';
977            HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
978        } else if (cc == kEndOfFileMarker) {
979            parseError();
980            m_token->endAttributeValue(source.numberOfCharactersConsumed());
981            HTML_RECONSUME_IN(DataState);
982        } else {
983            m_token->appendToAttributeValue(cc);
984            HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
985        }
986    }
987    END_STATE()
988
989    HTML_BEGIN_STATE(AttributeValueUnquotedState) {
990        if (isTokenizerWhitespace(cc)) {
991            m_token->endAttributeValue(source.numberOfCharactersConsumed());
992            HTML_ADVANCE_TO(BeforeAttributeNameState);
993        } else if (cc == '&') {
994            m_additionalAllowedCharacter = '>';
995            HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
996        } else if (cc == '>') {
997            m_token->endAttributeValue(source.numberOfCharactersConsumed());
998            return emitAndResumeIn(source, HTMLTokenizer::DataState);
999        } else if (cc == kEndOfFileMarker) {
1000            parseError();
1001            m_token->endAttributeValue(source.numberOfCharactersConsumed());
1002            HTML_RECONSUME_IN(DataState);
1003        } else {
1004            if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
1005                parseError();
1006            m_token->appendToAttributeValue(cc);
1007            HTML_ADVANCE_TO(AttributeValueUnquotedState);
1008        }
1009    }
1010    END_STATE()
1011
1012    HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
1013        bool notEnoughCharacters = false;
1014        StringBuilder decodedEntity;
1015        bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
1016        if (notEnoughCharacters)
1017            return haveBufferedCharacterToken();
1018        if (!success) {
1019            ASSERT(decodedEntity.isEmpty());
1020            m_token->appendToAttributeValue('&');
1021        } else {
1022            for (unsigned i = 0; i < decodedEntity.length(); ++i)
1023                m_token->appendToAttributeValue(decodedEntity[i]);
1024        }
1025        // We're supposed to switch back to the attribute value state that
1026        // we were in when we were switched into this state. Rather than
        // keeping track of this explicitly, we observe that the previous
1028        // state can be determined by m_additionalAllowedCharacter.
1029        if (m_additionalAllowedCharacter == '"')
1030            HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
1031        else if (m_additionalAllowedCharacter == '\'')
1032            HTML_SWITCH_TO(AttributeValueSingleQuotedState);
1033        else if (m_additionalAllowedCharacter == '>')
1034            HTML_SWITCH_TO(AttributeValueUnquotedState);
1035        else
1036            ASSERT_NOT_REACHED();
1037    }
1038    END_STATE()
1039
1040    HTML_BEGIN_STATE(AfterAttributeValueQuotedState) {
1041        if (isTokenizerWhitespace(cc))
1042            HTML_ADVANCE_TO(BeforeAttributeNameState);
1043        else if (cc == '/')
1044            HTML_ADVANCE_TO(SelfClosingStartTagState);
1045        else if (cc == '>')
1046            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1047        else if (m_options.usePreHTML5ParserQuirks && cc == '<')
1048            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1049        else if (cc == kEndOfFileMarker) {
1050            parseError();
1051            HTML_RECONSUME_IN(DataState);
1052        } else {
1053            parseError();
1054            HTML_RECONSUME_IN(BeforeAttributeNameState);
1055        }
1056    }
1057    END_STATE()
1058
1059    HTML_BEGIN_STATE(SelfClosingStartTagState) {
1060        if (cc == '>') {
1061            m_token->setSelfClosing();
1062            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1063        } else if (cc == kEndOfFileMarker) {
1064            parseError();
1065            HTML_RECONSUME_IN(DataState);
1066        } else {
1067            parseError();
1068            HTML_RECONSUME_IN(BeforeAttributeNameState);
1069        }
1070    }
1071    END_STATE()
1072
1073    HTML_BEGIN_STATE(BogusCommentState) {
1074        m_token->beginComment();
1075        HTML_RECONSUME_IN(ContinueBogusCommentState);
1076    }
1077    END_STATE()
1078
1079    HTML_BEGIN_STATE(ContinueBogusCommentState) {
1080        if (cc == '>')
1081            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1082        else if (cc == kEndOfFileMarker)
1083            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1084        else {
1085            m_token->appendToComment(cc);
1086            HTML_ADVANCE_TO(ContinueBogusCommentState);
1087        }
1088    }
1089    END_STATE()
1090
1091    HTML_BEGIN_STATE(MarkupDeclarationOpenState) {
1092        DEFINE_STATIC_LOCAL(String, dashDashString, (ASCIILiteral("--")));
1093        DEFINE_STATIC_LOCAL(String, doctypeString, (ASCIILiteral("doctype")));
1094        DEFINE_STATIC_LOCAL(String, cdataString, (ASCIILiteral("[CDATA[")));
1095        if (cc == '-') {
1096            SegmentedString::LookAheadResult result = source.lookAhead(dashDashString);
1097            if (result == SegmentedString::DidMatch) {
1098                source.advanceAndASSERT('-');
1099                source.advanceAndASSERT('-');
1100                m_token->beginComment();
1101                HTML_SWITCH_TO(CommentStartState);
1102            } else if (result == SegmentedString::NotEnoughCharacters)
1103                return haveBufferedCharacterToken();
1104        } else if (cc == 'D' || cc == 'd') {
1105            SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString);
1106            if (result == SegmentedString::DidMatch) {
1107                advanceStringAndASSERTIgnoringCase(source, "doctype");
1108                HTML_SWITCH_TO(DOCTYPEState);
1109            } else if (result == SegmentedString::NotEnoughCharacters)
1110                return haveBufferedCharacterToken();
1111        } else if (cc == '[' && shouldAllowCDATA()) {
1112            SegmentedString::LookAheadResult result = source.lookAhead(cdataString);
1113            if (result == SegmentedString::DidMatch) {
1114                advanceStringAndASSERT(source, "[CDATA[");
1115                HTML_SWITCH_TO(CDATASectionState);
1116            } else if (result == SegmentedString::NotEnoughCharacters)
1117                return haveBufferedCharacterToken();
1118        }
1119        parseError();
1120        HTML_RECONSUME_IN(BogusCommentState);
1121    }
1122    END_STATE()
1123
1124    HTML_BEGIN_STATE(CommentStartState) {
1125        if (cc == '-')
1126            HTML_ADVANCE_TO(CommentStartDashState);
1127        else if (cc == '>') {
1128            parseError();
1129            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1130        } else if (cc == kEndOfFileMarker) {
1131            parseError();
1132            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1133        } else {
1134            m_token->appendToComment(cc);
1135            HTML_ADVANCE_TO(CommentState);
1136        }
1137    }
1138    END_STATE()
1139
1140    HTML_BEGIN_STATE(CommentStartDashState) {
1141        if (cc == '-')
1142            HTML_ADVANCE_TO(CommentEndState);
1143        else if (cc == '>') {
1144            parseError();
1145            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1146        } else if (cc == kEndOfFileMarker) {
1147            parseError();
1148            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1149        } else {
1150            m_token->appendToComment('-');
1151            m_token->appendToComment(cc);
1152            HTML_ADVANCE_TO(CommentState);
1153        }
1154    }
1155    END_STATE()
1156
1157    HTML_BEGIN_STATE(CommentState) {
1158        if (cc == '-')
1159            HTML_ADVANCE_TO(CommentEndDashState);
1160        else if (cc == kEndOfFileMarker) {
1161            parseError();
1162            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1163        } else {
1164            m_token->appendToComment(cc);
1165            HTML_ADVANCE_TO(CommentState);
1166        }
1167    }
1168    END_STATE()
1169
1170    HTML_BEGIN_STATE(CommentEndDashState) {
1171        if (cc == '-')
1172            HTML_ADVANCE_TO(CommentEndState);
1173        else if (cc == kEndOfFileMarker) {
1174            parseError();
1175            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1176        } else {
1177            m_token->appendToComment('-');
1178            m_token->appendToComment(cc);
1179            HTML_ADVANCE_TO(CommentState);
1180        }
1181    }
1182    END_STATE()
1183
1184    HTML_BEGIN_STATE(CommentEndState) {
1185        if (cc == '>')
1186            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1187        else if (cc == '!') {
1188            parseError();
1189            HTML_ADVANCE_TO(CommentEndBangState);
1190        } else if (cc == '-') {
1191            parseError();
1192            m_token->appendToComment('-');
1193            HTML_ADVANCE_TO(CommentEndState);
1194        } else if (cc == kEndOfFileMarker) {
1195            parseError();
1196            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1197        } else {
1198            parseError();
1199            m_token->appendToComment('-');
1200            m_token->appendToComment('-');
1201            m_token->appendToComment(cc);
1202            HTML_ADVANCE_TO(CommentState);
1203        }
1204    }
1205    END_STATE()
1206
1207    HTML_BEGIN_STATE(CommentEndBangState) {
1208        if (cc == '-') {
1209            m_token->appendToComment('-');
1210            m_token->appendToComment('-');
1211            m_token->appendToComment('!');
1212            HTML_ADVANCE_TO(CommentEndDashState);
1213        } else if (cc == '>')
1214            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1215        else if (cc == kEndOfFileMarker) {
1216            parseError();
1217            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1218        } else {
1219            m_token->appendToComment('-');
1220            m_token->appendToComment('-');
1221            m_token->appendToComment('!');
1222            m_token->appendToComment(cc);
1223            HTML_ADVANCE_TO(CommentState);
1224        }
1225    }
1226    END_STATE()
1227
1228    HTML_BEGIN_STATE(DOCTYPEState) {
1229        if (isTokenizerWhitespace(cc))
1230            HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1231        else if (cc == kEndOfFileMarker) {
1232            parseError();
1233            m_token->beginDOCTYPE();
1234            m_token->setForceQuirks();
1235            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1236        } else {
1237            parseError();
1238            HTML_RECONSUME_IN(BeforeDOCTYPENameState);
1239        }
1240    }
1241    END_STATE()
1242
1243    HTML_BEGIN_STATE(BeforeDOCTYPENameState) {
1244        if (isTokenizerWhitespace(cc))
1245            HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1246        else if (isASCIIUpper(cc)) {
1247            m_token->beginDOCTYPE(toLowerCase(cc));
1248            HTML_ADVANCE_TO(DOCTYPENameState);
1249        } else if (cc == '>') {
1250            parseError();
1251            m_token->beginDOCTYPE();
1252            m_token->setForceQuirks();
1253            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1254        } else if (cc == kEndOfFileMarker) {
1255            parseError();
1256            m_token->beginDOCTYPE();
1257            m_token->setForceQuirks();
1258            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1259        } else {
1260            m_token->beginDOCTYPE(cc);
1261            HTML_ADVANCE_TO(DOCTYPENameState);
1262        }
1263    }
1264    END_STATE()
1265
1266    HTML_BEGIN_STATE(DOCTYPENameState) {
1267        if (isTokenizerWhitespace(cc))
1268            HTML_ADVANCE_TO(AfterDOCTYPENameState);
1269        else if (cc == '>')
1270            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1271        else if (isASCIIUpper(cc)) {
1272            m_token->appendToName(toLowerCase(cc));
1273            HTML_ADVANCE_TO(DOCTYPENameState);
1274        } else if (cc == kEndOfFileMarker) {
1275            parseError();
1276            m_token->setForceQuirks();
1277            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1278        } else {
1279            m_token->appendToName(cc);
1280            HTML_ADVANCE_TO(DOCTYPENameState);
1281        }
1282    }
1283    END_STATE()
1284
1285    HTML_BEGIN_STATE(AfterDOCTYPENameState) {
1286        if (isTokenizerWhitespace(cc))
1287            HTML_ADVANCE_TO(AfterDOCTYPENameState);
1288        if (cc == '>')
1289            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1290        else if (cc == kEndOfFileMarker) {
1291            parseError();
1292            m_token->setForceQuirks();
1293            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1294        } else {
1295            DEFINE_STATIC_LOCAL(String, publicString, (ASCIILiteral("public")));
1296            DEFINE_STATIC_LOCAL(String, systemString, (ASCIILiteral("system")));
1297            if (cc == 'P' || cc == 'p') {
1298                SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString);
1299                if (result == SegmentedString::DidMatch) {
1300                    advanceStringAndASSERTIgnoringCase(source, "public");
1301                    HTML_SWITCH_TO(AfterDOCTYPEPublicKeywordState);
1302                } else if (result == SegmentedString::NotEnoughCharacters)
1303                    return haveBufferedCharacterToken();
1304            } else if (cc == 'S' || cc == 's') {
1305                SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString);
1306                if (result == SegmentedString::DidMatch) {
1307                    advanceStringAndASSERTIgnoringCase(source, "system");
1308                    HTML_SWITCH_TO(AfterDOCTYPESystemKeywordState);
1309                } else if (result == SegmentedString::NotEnoughCharacters)
1310                    return haveBufferedCharacterToken();
1311            }
1312            parseError();
1313            m_token->setForceQuirks();
1314            HTML_ADVANCE_TO(BogusDOCTYPEState);
1315        }
1316    }
1317    END_STATE()
1318
1319    HTML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState) {
1320        if (isTokenizerWhitespace(cc))
1321            HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1322        else if (cc == '"') {
1323            parseError();
1324            m_token->setPublicIdentifierToEmptyString();
1325            HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1326        } else if (cc == '\'') {
1327            parseError();
1328            m_token->setPublicIdentifierToEmptyString();
1329            HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1330        } else if (cc == '>') {
1331            parseError();
1332            m_token->setForceQuirks();
1333            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1334        } else if (cc == kEndOfFileMarker) {
1335            parseError();
1336            m_token->setForceQuirks();
1337            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1338        } else {
1339            parseError();
1340            m_token->setForceQuirks();
1341            HTML_ADVANCE_TO(BogusDOCTYPEState);
1342        }
1343    }
1344    END_STATE()
1345
1346    HTML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) {
1347        if (isTokenizerWhitespace(cc))
1348            HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1349        else if (cc == '"') {
1350            m_token->setPublicIdentifierToEmptyString();
1351            HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1352        } else if (cc == '\'') {
1353            m_token->setPublicIdentifierToEmptyString();
1354            HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1355        } else if (cc == '>') {
1356            parseError();
1357            m_token->setForceQuirks();
1358            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1359        } else if (cc == kEndOfFileMarker) {
1360            parseError();
1361            m_token->setForceQuirks();
1362            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1363        } else {
1364            parseError();
1365            m_token->setForceQuirks();
1366            HTML_ADVANCE_TO(BogusDOCTYPEState);
1367        }
1368    }
1369    END_STATE()
1370
1371    HTML_BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) {
1372        if (cc == '"')
1373            HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1374        else if (cc == '>') {
1375            parseError();
1376            m_token->setForceQuirks();
1377            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1378        } else if (cc == kEndOfFileMarker) {
1379            parseError();
1380            m_token->setForceQuirks();
1381            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1382        } else {
1383            m_token->appendToPublicIdentifier(cc);
1384            HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1385        }
1386    }
1387    END_STATE()
1388
1389    HTML_BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) {
1390        if (cc == '\'')
1391            HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1392        else if (cc == '>') {
1393            parseError();
1394            m_token->setForceQuirks();
1395            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1396        } else if (cc == kEndOfFileMarker) {
1397            parseError();
1398            m_token->setForceQuirks();
1399            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1400        } else {
1401            m_token->appendToPublicIdentifier(cc);
1402            HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1403        }
1404    }
1405    END_STATE()
1406
1407    HTML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) {
1408        if (isTokenizerWhitespace(cc))
1409            HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1410        else if (cc == '>')
1411            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1412        else if (cc == '"') {
1413            parseError();
1414            m_token->setSystemIdentifierToEmptyString();
1415            HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1416        } else if (cc == '\'') {
1417            parseError();
1418            m_token->setSystemIdentifierToEmptyString();
1419            HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1420        } else if (cc == kEndOfFileMarker) {
1421            parseError();
1422            m_token->setForceQuirks();
1423            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1424        } else {
1425            parseError();
1426            m_token->setForceQuirks();
1427            HTML_ADVANCE_TO(BogusDOCTYPEState);
1428        }
1429    }
1430    END_STATE()
1431
1432    HTML_BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) {
1433        if (isTokenizerWhitespace(cc))
1434            HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1435        else if (cc == '>')
1436            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1437        else if (cc == '"') {
1438            m_token->setSystemIdentifierToEmptyString();
1439            HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1440        } else if (cc == '\'') {
1441            m_token->setSystemIdentifierToEmptyString();
1442            HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1443        } else if (cc == kEndOfFileMarker) {
1444            parseError();
1445            m_token->setForceQuirks();
1446            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1447        } else {
1448            parseError();
1449            m_token->setForceQuirks();
1450            HTML_ADVANCE_TO(BogusDOCTYPEState);
1451        }
1452    }
1453    END_STATE()
1454
1455    HTML_BEGIN_STATE(AfterDOCTYPESystemKeywordState) {
1456        if (isTokenizerWhitespace(cc))
1457            HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1458        else if (cc == '"') {
1459            parseError();
1460            m_token->setSystemIdentifierToEmptyString();
1461            HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1462        } else if (cc == '\'') {
1463            parseError();
1464            m_token->setSystemIdentifierToEmptyString();
1465            HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1466        } else if (cc == '>') {
1467            parseError();
1468            m_token->setForceQuirks();
1469            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1470        } else if (cc == kEndOfFileMarker) {
1471            parseError();
1472            m_token->setForceQuirks();
1473            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1474        } else {
1475            parseError();
1476            m_token->setForceQuirks();
1477            HTML_ADVANCE_TO(BogusDOCTYPEState);
1478        }
1479    }
1480    END_STATE()
1481
1482    HTML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) {
1483        if (isTokenizerWhitespace(cc))
1484            HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1485        if (cc == '"') {
1486            m_token->setSystemIdentifierToEmptyString();
1487            HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1488        } else if (cc == '\'') {
1489            m_token->setSystemIdentifierToEmptyString();
1490            HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1491        } else if (cc == '>') {
1492            parseError();
1493            m_token->setForceQuirks();
1494            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1495        } else if (cc == kEndOfFileMarker) {
1496            parseError();
1497            m_token->setForceQuirks();
1498            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1499        } else {
1500            parseError();
1501            m_token->setForceQuirks();
1502            HTML_ADVANCE_TO(BogusDOCTYPEState);
1503        }
1504    }
1505    END_STATE()
1506
1507    HTML_BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) {
1508        if (cc == '"')
1509            HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1510        else if (cc == '>') {
1511            parseError();
1512            m_token->setForceQuirks();
1513            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1514        } else if (cc == kEndOfFileMarker) {
1515            parseError();
1516            m_token->setForceQuirks();
1517            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1518        } else {
1519            m_token->appendToSystemIdentifier(cc);
1520            HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1521        }
1522    }
1523    END_STATE()
1524
1525    HTML_BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) {
1526        if (cc == '\'')
1527            HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1528        else if (cc == '>') {
1529            parseError();
1530            m_token->setForceQuirks();
1531            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1532        } else if (cc == kEndOfFileMarker) {
1533            parseError();
1534            m_token->setForceQuirks();
1535            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1536        } else {
1537            m_token->appendToSystemIdentifier(cc);
1538            HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1539        }
1540    }
1541    END_STATE()
1542
1543    HTML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState) {
1544        if (isTokenizerWhitespace(cc))
1545            HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1546        else if (cc == '>')
1547            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1548        else if (cc == kEndOfFileMarker) {
1549            parseError();
1550            m_token->setForceQuirks();
1551            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1552        } else {
1553            parseError();
1554            HTML_ADVANCE_TO(BogusDOCTYPEState);
1555        }
1556    }
1557    END_STATE()
1558
1559    HTML_BEGIN_STATE(BogusDOCTYPEState) {
1560        if (cc == '>')
1561            return emitAndResumeIn(source, HTMLTokenizer::DataState);
1562        else if (cc == kEndOfFileMarker)
1563            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1564        HTML_ADVANCE_TO(BogusDOCTYPEState);
1565    }
1566    END_STATE()
1567
1568    HTML_BEGIN_STATE(CDATASectionState) {
1569        if (cc == ']')
1570            HTML_ADVANCE_TO(CDATASectionRightSquareBracketState);
1571        else if (cc == kEndOfFileMarker)
1572            HTML_RECONSUME_IN(DataState);
1573        else {
1574            bufferCharacter(cc);
1575            HTML_ADVANCE_TO(CDATASectionState);
1576        }
1577    }
1578    END_STATE()
1579
1580    HTML_BEGIN_STATE(CDATASectionRightSquareBracketState) {
1581        if (cc == ']')
1582            HTML_ADVANCE_TO(CDATASectionDoubleRightSquareBracketState);
1583        else {
1584            bufferCharacter(']');
1585            HTML_RECONSUME_IN(CDATASectionState);
1586        }
1587    }
1588
1589    HTML_BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) {
1590        if (cc == '>')
1591            HTML_ADVANCE_TO(DataState);
1592        else {
1593            bufferCharacter(']');
1594            bufferCharacter(']');
1595            HTML_RECONSUME_IN(CDATASectionState);
1596        }
1597    }
1598    END_STATE()
1599
1600    }
1601
1602    ASSERT_NOT_REACHED();
1603    return false;
1604}
1605
1606String HTMLTokenizer::bufferedCharacters() const
1607{
1608    // FIXME: Add an assert about m_state.
1609    StringBuilder characters;
1610    characters.reserveCapacity(numberOfBufferedCharacters());
1611    characters.append('<');
1612    characters.append('/');
1613    characters.append(m_temporaryBuffer.data(), m_temporaryBuffer.size());
1614    return characters.toString();
1615}
1616
1617void HTMLTokenizer::updateStateFor(const AtomicString& tagName)
1618{
1619    if (tagName == textareaTag || tagName == titleTag)
1620        setState(HTMLTokenizer::RCDATAState);
1621    else if (tagName == plaintextTag)
1622        setState(HTMLTokenizer::PLAINTEXTState);
1623    else if (tagName == scriptTag)
1624        setState(HTMLTokenizer::ScriptDataState);
1625    else if (tagName == styleTag
1626        || tagName == iframeTag
1627        || tagName == xmpTag
1628        || (tagName == noembedTag && m_options.pluginsEnabled)
1629        || tagName == noframesTag
1630        || (tagName == noscriptTag && m_options.scriptEnabled))
1631        setState(HTMLTokenizer::RAWTEXTState);
1632}
1633
1634inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
1635{
1636    return vectorEqualsString(m_temporaryBuffer, expectedString);
1637}
1638
1639inline void HTMLTokenizer::addToPossibleEndTag(LChar cc)
1640{
1641    ASSERT(isEndTagBufferingState(m_state));
1642    m_bufferedEndTagName.append(cc);
1643}
1644
1645inline bool HTMLTokenizer::isAppropriateEndTag()
1646{
1647    if (m_bufferedEndTagName.size() != m_appropriateEndTagName.size())
1648        return false;
1649
1650    size_t numCharacters = m_bufferedEndTagName.size();
1651
1652    for (size_t i = 0; i < numCharacters; i++) {
1653        if (m_bufferedEndTagName[i] != m_appropriateEndTagName[i])
1654            return false;
1655    }
1656
1657    return true;
1658}
1659
1660inline void HTMLTokenizer::parseError()
1661{
1662    notImplemented();
1663}
1664
1665}
1666