1/* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 */ 27 28#include "config.h" 29#include "HTMLTokenizer.h" 30 31#include "HTMLEntityParser.h" 32#include "HTMLToken.h" 33#include "HTMLTreeBuilder.h" 34#include "HTMLNames.h" 35#include "MarkupTokenizerInlines.h" 36#include "NotImplemented.h" 37#include <wtf/ASCIICType.h> 38#include <wtf/CurrentTime.h> 39#include <wtf/text/AtomicString.h> 40#include <wtf/text/CString.h> 41#include <wtf/unicode/Unicode.h> 42 43using namespace WTF; 44 45namespace WebCore { 46 47using namespace HTMLNames; 48 49// This has to go in a .cpp file, as the linker doesn't like it being included more than once. 50// We don't have an HTMLToken.cpp though, so this is the next best place. 51QualifiedName AtomicHTMLToken::nameForAttribute(const HTMLToken::Attribute& attribute) const 52{ 53 return QualifiedName(nullAtom, AtomicString(attribute.name), nullAtom); 54} 55 56bool AtomicHTMLToken::usesName() const 57{ 58 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE; 59} 60 61bool AtomicHTMLToken::usesAttributes() const 62{ 63 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag; 64} 65 66static inline UChar toLowerCase(UChar cc) 67{ 68 ASSERT(isASCIIUpper(cc)); 69 const int lowerCaseOffset = 0x20; 70 return cc + lowerCaseOffset; 71} 72 73static inline bool vectorEqualsString(const Vector<LChar, 32>& vector, const String& string) 74{ 75 if (vector.size() != string.length()) 76 return false; 77 78 if (!string.length()) 79 return true; 80 81 return equal(string.impl(), vector.data(), vector.size()); 82} 83 84static inline bool isEndTagBufferingState(HTMLTokenizer::State state) 85{ 86 switch (state) { 87 case HTMLTokenizer::RCDATAEndTagOpenState: 88 case HTMLTokenizer::RCDATAEndTagNameState: 89 case HTMLTokenizer::RAWTEXTEndTagOpenState: 90 case HTMLTokenizer::RAWTEXTEndTagNameState: 91 case HTMLTokenizer::ScriptDataEndTagOpenState: 92 case HTMLTokenizer::ScriptDataEndTagNameState: 93 case HTMLTokenizer::ScriptDataEscapedEndTagOpenState: 
94 case HTMLTokenizer::ScriptDataEscapedEndTagNameState: 95 return true; 96 default: 97 return false; 98 } 99} 100 101#define HTML_BEGIN_STATE(stateName) BEGIN_STATE(HTMLTokenizer, stateName) 102#define HTML_RECONSUME_IN(stateName) RECONSUME_IN(HTMLTokenizer, stateName) 103#define HTML_ADVANCE_TO(stateName) ADVANCE_TO(HTMLTokenizer, stateName) 104#define HTML_SWITCH_TO(stateName) SWITCH_TO(HTMLTokenizer, stateName) 105 106HTMLTokenizer::HTMLTokenizer(const HTMLParserOptions& options) 107 : m_inputStreamPreprocessor(this) 108 , m_options(options) 109{ 110 reset(); 111} 112 113HTMLTokenizer::~HTMLTokenizer() 114{ 115} 116 117void HTMLTokenizer::reset() 118{ 119 m_state = HTMLTokenizer::DataState; 120 m_token = 0; 121 m_forceNullCharacterReplacement = false; 122 m_shouldAllowCDATA = false; 123 m_additionalAllowedCharacter = '\0'; 124} 125 126#if ENABLE(THREADED_HTML_PARSER) 127 128bool HTMLTokenizer::canCreateCheckpoint() const 129{ 130 if (!m_appropriateEndTagName.isEmpty()) 131 return false; 132 if (!m_temporaryBuffer.isEmpty()) 133 return false; 134 if (!m_bufferedEndTagName.isEmpty()) 135 return false; 136 return true; 137} 138 139void HTMLTokenizer::createCheckpoint(Checkpoint& result) const 140{ 141 ASSERT(canCreateCheckpoint()); 142 result.options = m_options; 143 result.state = m_state; 144 result.additionalAllowedCharacter = m_additionalAllowedCharacter; 145 result.skipNextNewLine = m_inputStreamPreprocessor.skipNextNewLine(); 146 result.shouldAllowCDATA = m_shouldAllowCDATA; 147} 148 149void HTMLTokenizer::restoreFromCheckpoint(const Checkpoint& checkpoint) 150{ 151 m_token = 0; 152 m_options = checkpoint.options; 153 m_state = checkpoint.state; 154 m_additionalAllowedCharacter = checkpoint.additionalAllowedCharacter; 155 m_inputStreamPreprocessor.reset(checkpoint.skipNextNewLine); 156 m_shouldAllowCDATA = checkpoint.shouldAllowCDATA; 157} 158 159#endif 160 161inline bool HTMLTokenizer::processEntity(SegmentedString& source) 162{ 163 bool notEnoughCharacters 
= false; 164 StringBuilder decodedEntity; 165 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters); 166 if (notEnoughCharacters) 167 return false; 168 if (!success) { 169 ASSERT(decodedEntity.isEmpty()); 170 bufferCharacter('&'); 171 } else { 172 for (unsigned i = 0; i < decodedEntity.length(); ++i) 173 bufferCharacter(decodedEntity[i]); 174 } 175 return true; 176} 177 178bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source) 179{ 180 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized); 181 source.advanceAndUpdateLineNumber(); 182 if (m_token->type() == HTMLToken::Character) 183 return true; 184 m_token->beginEndTag(m_bufferedEndTagName); 185 m_bufferedEndTagName.clear(); 186 m_appropriateEndTagName.clear(); 187 m_temporaryBuffer.clear(); 188 return false; 189} 190 191#define FLUSH_AND_ADVANCE_TO(stateName) \ 192 do { \ 193 m_state = HTMLTokenizer::stateName; \ 194 if (flushBufferedEndTag(source)) \ 195 return true; \ 196 if (source.isEmpty() \ 197 || !m_inputStreamPreprocessor.peek(source)) \ 198 return haveBufferedCharacterToken(); \ 199 cc = m_inputStreamPreprocessor.nextInputCharacter(); \ 200 goto stateName; \ 201 } while (false) 202 203bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state) 204{ 205 m_state = state; 206 flushBufferedEndTag(source); 207 return true; 208} 209 210bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token) 211{ 212 // If we have a token in progress, then we're supposed to be called back 213 // with the same token so we can finish it. 214 ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized); 215 m_token = &token; 216 217 if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) { 218 // FIXME: This should call flushBufferedEndTag(). 219 // We started an end tag during our last iteration. 
220 m_token->beginEndTag(m_bufferedEndTagName); 221 m_bufferedEndTagName.clear(); 222 m_appropriateEndTagName.clear(); 223 m_temporaryBuffer.clear(); 224 if (m_state == HTMLTokenizer::DataState) { 225 // We're back in the data state, so we must be done with the tag. 226 return true; 227 } 228 } 229 230 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source)) 231 return haveBufferedCharacterToken(); 232 UChar cc = m_inputStreamPreprocessor.nextInputCharacter(); 233 234 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0 235 switch (m_state) { 236 HTML_BEGIN_STATE(DataState) { 237 if (cc == '&') 238 HTML_ADVANCE_TO(CharacterReferenceInDataState); 239 else if (cc == '<') { 240 if (m_token->type() == HTMLToken::Character) { 241 // We have a bunch of character tokens queued up that we 242 // are emitting lazily here. 243 return true; 244 } 245 HTML_ADVANCE_TO(TagOpenState); 246 } else if (cc == kEndOfFileMarker) 247 return emitEndOfFile(source); 248 else { 249 bufferCharacter(cc); 250 HTML_ADVANCE_TO(DataState); 251 } 252 } 253 END_STATE() 254 255 HTML_BEGIN_STATE(CharacterReferenceInDataState) { 256 if (!processEntity(source)) 257 return haveBufferedCharacterToken(); 258 HTML_SWITCH_TO(DataState); 259 } 260 END_STATE() 261 262 HTML_BEGIN_STATE(RCDATAState) { 263 if (cc == '&') 264 HTML_ADVANCE_TO(CharacterReferenceInRCDATAState); 265 else if (cc == '<') 266 HTML_ADVANCE_TO(RCDATALessThanSignState); 267 else if (cc == kEndOfFileMarker) 268 return emitEndOfFile(source); 269 else { 270 bufferCharacter(cc); 271 HTML_ADVANCE_TO(RCDATAState); 272 } 273 } 274 END_STATE() 275 276 HTML_BEGIN_STATE(CharacterReferenceInRCDATAState) { 277 if (!processEntity(source)) 278 return haveBufferedCharacterToken(); 279 HTML_SWITCH_TO(RCDATAState); 280 } 281 END_STATE() 282 283 HTML_BEGIN_STATE(RAWTEXTState) { 284 if (cc == '<') 285 HTML_ADVANCE_TO(RAWTEXTLessThanSignState); 286 else if (cc == kEndOfFileMarker) 287 return emitEndOfFile(source); 288 else { 289 
bufferCharacter(cc); 290 HTML_ADVANCE_TO(RAWTEXTState); 291 } 292 } 293 END_STATE() 294 295 HTML_BEGIN_STATE(ScriptDataState) { 296 if (cc == '<') 297 HTML_ADVANCE_TO(ScriptDataLessThanSignState); 298 else if (cc == kEndOfFileMarker) 299 return emitEndOfFile(source); 300 else { 301 bufferCharacter(cc); 302 HTML_ADVANCE_TO(ScriptDataState); 303 } 304 } 305 END_STATE() 306 307 HTML_BEGIN_STATE(PLAINTEXTState) { 308 if (cc == kEndOfFileMarker) 309 return emitEndOfFile(source); 310 bufferCharacter(cc); 311 HTML_ADVANCE_TO(PLAINTEXTState); 312 } 313 END_STATE() 314 315 HTML_BEGIN_STATE(TagOpenState) { 316 if (cc == '!') 317 HTML_ADVANCE_TO(MarkupDeclarationOpenState); 318 else if (cc == '/') 319 HTML_ADVANCE_TO(EndTagOpenState); 320 else if (isASCIIUpper(cc)) { 321 m_token->beginStartTag(toLowerCase(cc)); 322 HTML_ADVANCE_TO(TagNameState); 323 } else if (isASCIILower(cc)) { 324 m_token->beginStartTag(cc); 325 HTML_ADVANCE_TO(TagNameState); 326 } else if (cc == '?') { 327 parseError(); 328 // The spec consumes the current character before switching 329 // to the bogus comment state, but it's easier to implement 330 // if we reconsume the current character. 
331 HTML_RECONSUME_IN(BogusCommentState); 332 } else { 333 parseError(); 334 bufferCharacter('<'); 335 HTML_RECONSUME_IN(DataState); 336 } 337 } 338 END_STATE() 339 340 HTML_BEGIN_STATE(EndTagOpenState) { 341 if (isASCIIUpper(cc)) { 342 m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc))); 343 m_appropriateEndTagName.clear(); 344 HTML_ADVANCE_TO(TagNameState); 345 } else if (isASCIILower(cc)) { 346 m_token->beginEndTag(static_cast<LChar>(cc)); 347 m_appropriateEndTagName.clear(); 348 HTML_ADVANCE_TO(TagNameState); 349 } else if (cc == '>') { 350 parseError(); 351 HTML_ADVANCE_TO(DataState); 352 } else if (cc == kEndOfFileMarker) { 353 parseError(); 354 bufferCharacter('<'); 355 bufferCharacter('/'); 356 HTML_RECONSUME_IN(DataState); 357 } else { 358 parseError(); 359 HTML_RECONSUME_IN(BogusCommentState); 360 } 361 } 362 END_STATE() 363 364 HTML_BEGIN_STATE(TagNameState) { 365 if (isTokenizerWhitespace(cc)) 366 HTML_ADVANCE_TO(BeforeAttributeNameState); 367 else if (cc == '/') 368 HTML_ADVANCE_TO(SelfClosingStartTagState); 369 else if (cc == '>') 370 return emitAndResumeIn(source, HTMLTokenizer::DataState); 371 else if (m_options.usePreHTML5ParserQuirks && cc == '<') 372 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 373 else if (isASCIIUpper(cc)) { 374 m_token->appendToName(toLowerCase(cc)); 375 HTML_ADVANCE_TO(TagNameState); 376 } else if (cc == kEndOfFileMarker) { 377 parseError(); 378 HTML_RECONSUME_IN(DataState); 379 } else { 380 m_token->appendToName(cc); 381 HTML_ADVANCE_TO(TagNameState); 382 } 383 } 384 END_STATE() 385 386 HTML_BEGIN_STATE(RCDATALessThanSignState) { 387 if (cc == '/') { 388 m_temporaryBuffer.clear(); 389 ASSERT(m_bufferedEndTagName.isEmpty()); 390 HTML_ADVANCE_TO(RCDATAEndTagOpenState); 391 } else { 392 bufferCharacter('<'); 393 HTML_RECONSUME_IN(RCDATAState); 394 } 395 } 396 END_STATE() 397 398 HTML_BEGIN_STATE(RCDATAEndTagOpenState) { 399 if (isASCIIUpper(cc)) { 400 m_temporaryBuffer.append(static_cast<LChar>(cc)); 401 
addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 402 HTML_ADVANCE_TO(RCDATAEndTagNameState); 403 } else if (isASCIILower(cc)) { 404 m_temporaryBuffer.append(static_cast<LChar>(cc)); 405 addToPossibleEndTag(static_cast<LChar>(cc)); 406 HTML_ADVANCE_TO(RCDATAEndTagNameState); 407 } else { 408 bufferCharacter('<'); 409 bufferCharacter('/'); 410 HTML_RECONSUME_IN(RCDATAState); 411 } 412 } 413 END_STATE() 414 415 HTML_BEGIN_STATE(RCDATAEndTagNameState) { 416 if (isASCIIUpper(cc)) { 417 m_temporaryBuffer.append(static_cast<LChar>(cc)); 418 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 419 HTML_ADVANCE_TO(RCDATAEndTagNameState); 420 } else if (isASCIILower(cc)) { 421 m_temporaryBuffer.append(static_cast<LChar>(cc)); 422 addToPossibleEndTag(static_cast<LChar>(cc)); 423 HTML_ADVANCE_TO(RCDATAEndTagNameState); 424 } else { 425 if (isTokenizerWhitespace(cc)) { 426 if (isAppropriateEndTag()) { 427 m_temporaryBuffer.append(static_cast<LChar>(cc)); 428 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 429 } 430 } else if (cc == '/') { 431 if (isAppropriateEndTag()) { 432 m_temporaryBuffer.append(static_cast<LChar>(cc)); 433 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 434 } 435 } else if (cc == '>') { 436 if (isAppropriateEndTag()) { 437 m_temporaryBuffer.append(static_cast<LChar>(cc)); 438 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 439 } 440 } 441 bufferCharacter('<'); 442 bufferCharacter('/'); 443 m_token->appendToCharacter(m_temporaryBuffer); 444 m_bufferedEndTagName.clear(); 445 m_temporaryBuffer.clear(); 446 HTML_RECONSUME_IN(RCDATAState); 447 } 448 } 449 END_STATE() 450 451 HTML_BEGIN_STATE(RAWTEXTLessThanSignState) { 452 if (cc == '/') { 453 m_temporaryBuffer.clear(); 454 ASSERT(m_bufferedEndTagName.isEmpty()); 455 HTML_ADVANCE_TO(RAWTEXTEndTagOpenState); 456 } else { 457 bufferCharacter('<'); 458 HTML_RECONSUME_IN(RAWTEXTState); 459 } 460 } 461 END_STATE() 462 463 HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) { 464 if 
(isASCIIUpper(cc)) { 465 m_temporaryBuffer.append(static_cast<LChar>(cc)); 466 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 467 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 468 } else if (isASCIILower(cc)) { 469 m_temporaryBuffer.append(static_cast<LChar>(cc)); 470 addToPossibleEndTag(static_cast<LChar>(cc)); 471 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 472 } else { 473 bufferCharacter('<'); 474 bufferCharacter('/'); 475 HTML_RECONSUME_IN(RAWTEXTState); 476 } 477 } 478 END_STATE() 479 480 HTML_BEGIN_STATE(RAWTEXTEndTagNameState) { 481 if (isASCIIUpper(cc)) { 482 m_temporaryBuffer.append(static_cast<LChar>(cc)); 483 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 484 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 485 } else if (isASCIILower(cc)) { 486 m_temporaryBuffer.append(static_cast<LChar>(cc)); 487 addToPossibleEndTag(static_cast<LChar>(cc)); 488 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 489 } else { 490 if (isTokenizerWhitespace(cc)) { 491 if (isAppropriateEndTag()) { 492 m_temporaryBuffer.append(static_cast<LChar>(cc)); 493 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 494 } 495 } else if (cc == '/') { 496 if (isAppropriateEndTag()) { 497 m_temporaryBuffer.append(static_cast<LChar>(cc)); 498 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 499 } 500 } else if (cc == '>') { 501 if (isAppropriateEndTag()) { 502 m_temporaryBuffer.append(static_cast<LChar>(cc)); 503 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 504 } 505 } 506 bufferCharacter('<'); 507 bufferCharacter('/'); 508 m_token->appendToCharacter(m_temporaryBuffer); 509 m_bufferedEndTagName.clear(); 510 m_temporaryBuffer.clear(); 511 HTML_RECONSUME_IN(RAWTEXTState); 512 } 513 } 514 END_STATE() 515 516 HTML_BEGIN_STATE(ScriptDataLessThanSignState) { 517 if (cc == '/') { 518 m_temporaryBuffer.clear(); 519 ASSERT(m_bufferedEndTagName.isEmpty()); 520 HTML_ADVANCE_TO(ScriptDataEndTagOpenState); 521 } else if (cc == '!') { 522 bufferCharacter('<'); 523 bufferCharacter('!'); 524 
HTML_ADVANCE_TO(ScriptDataEscapeStartState); 525 } else { 526 bufferCharacter('<'); 527 HTML_RECONSUME_IN(ScriptDataState); 528 } 529 } 530 END_STATE() 531 532 HTML_BEGIN_STATE(ScriptDataEndTagOpenState) { 533 if (isASCIIUpper(cc)) { 534 m_temporaryBuffer.append(static_cast<LChar>(cc)); 535 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 536 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 537 } else if (isASCIILower(cc)) { 538 m_temporaryBuffer.append(static_cast<LChar>(cc)); 539 addToPossibleEndTag(static_cast<LChar>(cc)); 540 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 541 } else { 542 bufferCharacter('<'); 543 bufferCharacter('/'); 544 HTML_RECONSUME_IN(ScriptDataState); 545 } 546 } 547 END_STATE() 548 549 HTML_BEGIN_STATE(ScriptDataEndTagNameState) { 550 if (isASCIIUpper(cc)) { 551 m_temporaryBuffer.append(static_cast<LChar>(cc)); 552 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 553 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 554 } else if (isASCIILower(cc)) { 555 m_temporaryBuffer.append(static_cast<LChar>(cc)); 556 addToPossibleEndTag(static_cast<LChar>(cc)); 557 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 558 } else { 559 if (isTokenizerWhitespace(cc)) { 560 if (isAppropriateEndTag()) { 561 m_temporaryBuffer.append(static_cast<LChar>(cc)); 562 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 563 } 564 } else if (cc == '/') { 565 if (isAppropriateEndTag()) { 566 m_temporaryBuffer.append(static_cast<LChar>(cc)); 567 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 568 } 569 } else if (cc == '>') { 570 if (isAppropriateEndTag()) { 571 m_temporaryBuffer.append(static_cast<LChar>(cc)); 572 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 573 } 574 } 575 bufferCharacter('<'); 576 bufferCharacter('/'); 577 m_token->appendToCharacter(m_temporaryBuffer); 578 m_bufferedEndTagName.clear(); 579 m_temporaryBuffer.clear(); 580 HTML_RECONSUME_IN(ScriptDataState); 581 } 582 } 583 END_STATE() 584 585 HTML_BEGIN_STATE(ScriptDataEscapeStartState) 
{ 586 if (cc == '-') { 587 bufferCharacter(cc); 588 HTML_ADVANCE_TO(ScriptDataEscapeStartDashState); 589 } else 590 HTML_RECONSUME_IN(ScriptDataState); 591 } 592 END_STATE() 593 594 HTML_BEGIN_STATE(ScriptDataEscapeStartDashState) { 595 if (cc == '-') { 596 bufferCharacter(cc); 597 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 598 } else 599 HTML_RECONSUME_IN(ScriptDataState); 600 } 601 END_STATE() 602 603 HTML_BEGIN_STATE(ScriptDataEscapedState) { 604 if (cc == '-') { 605 bufferCharacter(cc); 606 HTML_ADVANCE_TO(ScriptDataEscapedDashState); 607 } else if (cc == '<') 608 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 609 else if (cc == kEndOfFileMarker) { 610 parseError(); 611 HTML_RECONSUME_IN(DataState); 612 } else { 613 bufferCharacter(cc); 614 HTML_ADVANCE_TO(ScriptDataEscapedState); 615 } 616 } 617 END_STATE() 618 619 HTML_BEGIN_STATE(ScriptDataEscapedDashState) { 620 if (cc == '-') { 621 bufferCharacter(cc); 622 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 623 } else if (cc == '<') 624 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 625 else if (cc == kEndOfFileMarker) { 626 parseError(); 627 HTML_RECONSUME_IN(DataState); 628 } else { 629 bufferCharacter(cc); 630 HTML_ADVANCE_TO(ScriptDataEscapedState); 631 } 632 } 633 END_STATE() 634 635 HTML_BEGIN_STATE(ScriptDataEscapedDashDashState) { 636 if (cc == '-') { 637 bufferCharacter(cc); 638 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 639 } else if (cc == '<') 640 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 641 else if (cc == '>') { 642 bufferCharacter(cc); 643 HTML_ADVANCE_TO(ScriptDataState); 644 } else if (cc == kEndOfFileMarker) { 645 parseError(); 646 HTML_RECONSUME_IN(DataState); 647 } else { 648 bufferCharacter(cc); 649 HTML_ADVANCE_TO(ScriptDataEscapedState); 650 } 651 } 652 END_STATE() 653 654 HTML_BEGIN_STATE(ScriptDataEscapedLessThanSignState) { 655 if (cc == '/') { 656 m_temporaryBuffer.clear(); 657 ASSERT(m_bufferedEndTagName.isEmpty()); 658 
HTML_ADVANCE_TO(ScriptDataEscapedEndTagOpenState); 659 } else if (isASCIIUpper(cc)) { 660 bufferCharacter('<'); 661 bufferCharacter(cc); 662 m_temporaryBuffer.clear(); 663 m_temporaryBuffer.append(toLowerCase(cc)); 664 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 665 } else if (isASCIILower(cc)) { 666 bufferCharacter('<'); 667 bufferCharacter(cc); 668 m_temporaryBuffer.clear(); 669 m_temporaryBuffer.append(static_cast<LChar>(cc)); 670 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 671 } else { 672 bufferCharacter('<'); 673 HTML_RECONSUME_IN(ScriptDataEscapedState); 674 } 675 } 676 END_STATE() 677 678 HTML_BEGIN_STATE(ScriptDataEscapedEndTagOpenState) { 679 if (isASCIIUpper(cc)) { 680 m_temporaryBuffer.append(static_cast<LChar>(cc)); 681 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 682 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 683 } else if (isASCIILower(cc)) { 684 m_temporaryBuffer.append(static_cast<LChar>(cc)); 685 addToPossibleEndTag(static_cast<LChar>(cc)); 686 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 687 } else { 688 bufferCharacter('<'); 689 bufferCharacter('/'); 690 HTML_RECONSUME_IN(ScriptDataEscapedState); 691 } 692 } 693 END_STATE() 694 695 HTML_BEGIN_STATE(ScriptDataEscapedEndTagNameState) { 696 if (isASCIIUpper(cc)) { 697 m_temporaryBuffer.append(static_cast<LChar>(cc)); 698 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 699 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 700 } else if (isASCIILower(cc)) { 701 m_temporaryBuffer.append(static_cast<LChar>(cc)); 702 addToPossibleEndTag(static_cast<LChar>(cc)); 703 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 704 } else { 705 if (isTokenizerWhitespace(cc)) { 706 if (isAppropriateEndTag()) { 707 m_temporaryBuffer.append(static_cast<LChar>(cc)); 708 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 709 } 710 } else if (cc == '/') { 711 if (isAppropriateEndTag()) { 712 m_temporaryBuffer.append(static_cast<LChar>(cc)); 713 
FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 714 } 715 } else if (cc == '>') { 716 if (isAppropriateEndTag()) { 717 m_temporaryBuffer.append(static_cast<LChar>(cc)); 718 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 719 } 720 } 721 bufferCharacter('<'); 722 bufferCharacter('/'); 723 m_token->appendToCharacter(m_temporaryBuffer); 724 m_bufferedEndTagName.clear(); 725 m_temporaryBuffer.clear(); 726 HTML_RECONSUME_IN(ScriptDataEscapedState); 727 } 728 } 729 END_STATE() 730 731 HTML_BEGIN_STATE(ScriptDataDoubleEscapeStartState) { 732 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { 733 bufferCharacter(cc); 734 if (temporaryBufferIs(scriptTag.localName())) 735 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 736 else 737 HTML_ADVANCE_TO(ScriptDataEscapedState); 738 } else if (isASCIIUpper(cc)) { 739 bufferCharacter(cc); 740 m_temporaryBuffer.append(toLowerCase(cc)); 741 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 742 } else if (isASCIILower(cc)) { 743 bufferCharacter(cc); 744 m_temporaryBuffer.append(static_cast<LChar>(cc)); 745 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 746 } else 747 HTML_RECONSUME_IN(ScriptDataEscapedState); 748 } 749 END_STATE() 750 751 HTML_BEGIN_STATE(ScriptDataDoubleEscapedState) { 752 if (cc == '-') { 753 bufferCharacter(cc); 754 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashState); 755 } else if (cc == '<') { 756 bufferCharacter(cc); 757 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 758 } else if (cc == kEndOfFileMarker) { 759 parseError(); 760 HTML_RECONSUME_IN(DataState); 761 } else { 762 bufferCharacter(cc); 763 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 764 } 765 } 766 END_STATE() 767 768 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashState) { 769 if (cc == '-') { 770 bufferCharacter(cc); 771 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); 772 } else if (cc == '<') { 773 bufferCharacter(cc); 774 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 775 } else if (cc == 
kEndOfFileMarker) { 776 parseError(); 777 HTML_RECONSUME_IN(DataState); 778 } else { 779 bufferCharacter(cc); 780 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 781 } 782 } 783 END_STATE() 784 785 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) { 786 if (cc == '-') { 787 bufferCharacter(cc); 788 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); 789 } else if (cc == '<') { 790 bufferCharacter(cc); 791 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 792 } else if (cc == '>') { 793 bufferCharacter(cc); 794 HTML_ADVANCE_TO(ScriptDataState); 795 } else if (cc == kEndOfFileMarker) { 796 parseError(); 797 HTML_RECONSUME_IN(DataState); 798 } else { 799 bufferCharacter(cc); 800 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 801 } 802 } 803 END_STATE() 804 805 HTML_BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) { 806 if (cc == '/') { 807 bufferCharacter(cc); 808 m_temporaryBuffer.clear(); 809 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 810 } else 811 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState); 812 } 813 END_STATE() 814 815 HTML_BEGIN_STATE(ScriptDataDoubleEscapeEndState) { 816 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { 817 bufferCharacter(cc); 818 if (temporaryBufferIs(scriptTag.localName())) 819 HTML_ADVANCE_TO(ScriptDataEscapedState); 820 else 821 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 822 } else if (isASCIIUpper(cc)) { 823 bufferCharacter(cc); 824 m_temporaryBuffer.append(toLowerCase(cc)); 825 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 826 } else if (isASCIILower(cc)) { 827 bufferCharacter(cc); 828 m_temporaryBuffer.append(static_cast<LChar>(cc)); 829 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 830 } else 831 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState); 832 } 833 END_STATE() 834 835 HTML_BEGIN_STATE(BeforeAttributeNameState) { 836 if (isTokenizerWhitespace(cc)) 837 HTML_ADVANCE_TO(BeforeAttributeNameState); 838 else if (cc == '/') 839 HTML_ADVANCE_TO(SelfClosingStartTagState); 840 else if (cc 
== '>') 841 return emitAndResumeIn(source, HTMLTokenizer::DataState); 842 else if (m_options.usePreHTML5ParserQuirks && cc == '<') 843 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 844 else if (isASCIIUpper(cc)) { 845 m_token->addNewAttribute(); 846 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 847 m_token->appendToAttributeName(toLowerCase(cc)); 848 HTML_ADVANCE_TO(AttributeNameState); 849 } else if (cc == kEndOfFileMarker) { 850 parseError(); 851 HTML_RECONSUME_IN(DataState); 852 } else { 853 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=') 854 parseError(); 855 m_token->addNewAttribute(); 856 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 857 m_token->appendToAttributeName(cc); 858 HTML_ADVANCE_TO(AttributeNameState); 859 } 860 } 861 END_STATE() 862 863 HTML_BEGIN_STATE(AttributeNameState) { 864 if (isTokenizerWhitespace(cc)) { 865 m_token->endAttributeName(source.numberOfCharactersConsumed()); 866 HTML_ADVANCE_TO(AfterAttributeNameState); 867 } else if (cc == '/') { 868 m_token->endAttributeName(source.numberOfCharactersConsumed()); 869 HTML_ADVANCE_TO(SelfClosingStartTagState); 870 } else if (cc == '=') { 871 m_token->endAttributeName(source.numberOfCharactersConsumed()); 872 HTML_ADVANCE_TO(BeforeAttributeValueState); 873 } else if (cc == '>') { 874 m_token->endAttributeName(source.numberOfCharactersConsumed()); 875 return emitAndResumeIn(source, HTMLTokenizer::DataState); 876 } else if (m_options.usePreHTML5ParserQuirks && cc == '<') { 877 m_token->endAttributeName(source.numberOfCharactersConsumed()); 878 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 879 } else if (isASCIIUpper(cc)) { 880 m_token->appendToAttributeName(toLowerCase(cc)); 881 HTML_ADVANCE_TO(AttributeNameState); 882 } else if (cc == kEndOfFileMarker) { 883 parseError(); 884 m_token->endAttributeName(source.numberOfCharactersConsumed()); 885 HTML_RECONSUME_IN(DataState); 886 } else { 887 if (cc == '"' || cc == '\'' || 
cc == '<' || cc == '=') 888 parseError(); 889 m_token->appendToAttributeName(cc); 890 HTML_ADVANCE_TO(AttributeNameState); 891 } 892 } 893 END_STATE() 894 895 HTML_BEGIN_STATE(AfterAttributeNameState) { 896 if (isTokenizerWhitespace(cc)) 897 HTML_ADVANCE_TO(AfterAttributeNameState); 898 else if (cc == '/') 899 HTML_ADVANCE_TO(SelfClosingStartTagState); 900 else if (cc == '=') 901 HTML_ADVANCE_TO(BeforeAttributeValueState); 902 else if (cc == '>') 903 return emitAndResumeIn(source, HTMLTokenizer::DataState); 904 else if (m_options.usePreHTML5ParserQuirks && cc == '<') 905 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 906 else if (isASCIIUpper(cc)) { 907 m_token->addNewAttribute(); 908 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 909 m_token->appendToAttributeName(toLowerCase(cc)); 910 HTML_ADVANCE_TO(AttributeNameState); 911 } else if (cc == kEndOfFileMarker) { 912 parseError(); 913 HTML_RECONSUME_IN(DataState); 914 } else { 915 if (cc == '"' || cc == '\'' || cc == '<') 916 parseError(); 917 m_token->addNewAttribute(); 918 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 919 m_token->appendToAttributeName(cc); 920 HTML_ADVANCE_TO(AttributeNameState); 921 } 922 } 923 END_STATE() 924 925 HTML_BEGIN_STATE(BeforeAttributeValueState) { 926 if (isTokenizerWhitespace(cc)) 927 HTML_ADVANCE_TO(BeforeAttributeValueState); 928 else if (cc == '"') { 929 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); 930 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); 931 } else if (cc == '&') { 932 m_token->beginAttributeValue(source.numberOfCharactersConsumed()); 933 HTML_RECONSUME_IN(AttributeValueUnquotedState); 934 } else if (cc == '\'') { 935 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); 936 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); 937 } else if (cc == '>') { 938 parseError(); 939 return emitAndResumeIn(source, HTMLTokenizer::DataState); 940 } else if (cc == kEndOfFileMarker) { 
941 parseError(); 942 HTML_RECONSUME_IN(DataState); 943 } else { 944 if (cc == '<' || cc == '=' || cc == '`') 945 parseError(); 946 m_token->beginAttributeValue(source.numberOfCharactersConsumed()); 947 m_token->appendToAttributeValue(cc); 948 HTML_ADVANCE_TO(AttributeValueUnquotedState); 949 } 950 } 951 END_STATE() 952 953 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) { 954 if (cc == '"') { 955 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 956 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); 957 } else if (cc == '&') { 958 m_additionalAllowedCharacter = '"'; 959 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 960 } else if (cc == kEndOfFileMarker) { 961 parseError(); 962 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 963 HTML_RECONSUME_IN(DataState); 964 } else { 965 m_token->appendToAttributeValue(cc); 966 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); 967 } 968 } 969 END_STATE() 970 971 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) { 972 if (cc == '\'') { 973 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 974 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); 975 } else if (cc == '&') { 976 m_additionalAllowedCharacter = '\''; 977 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 978 } else if (cc == kEndOfFileMarker) { 979 parseError(); 980 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 981 HTML_RECONSUME_IN(DataState); 982 } else { 983 m_token->appendToAttributeValue(cc); 984 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); 985 } 986 } 987 END_STATE() 988 989 HTML_BEGIN_STATE(AttributeValueUnquotedState) { 990 if (isTokenizerWhitespace(cc)) { 991 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 992 HTML_ADVANCE_TO(BeforeAttributeNameState); 993 } else if (cc == '&') { 994 m_additionalAllowedCharacter = '>'; 995 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 996 } else if (cc == '>') { 997 
m_token->endAttributeValue(source.numberOfCharactersConsumed()); 998 return emitAndResumeIn(source, HTMLTokenizer::DataState); 999 } else if (cc == kEndOfFileMarker) { 1000 parseError(); 1001 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 1002 HTML_RECONSUME_IN(DataState); 1003 } else { 1004 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`') 1005 parseError(); 1006 m_token->appendToAttributeValue(cc); 1007 HTML_ADVANCE_TO(AttributeValueUnquotedState); 1008 } 1009 } 1010 END_STATE() 1011 1012 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) { 1013 bool notEnoughCharacters = false; 1014 StringBuilder decodedEntity; 1015 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter); 1016 if (notEnoughCharacters) 1017 return haveBufferedCharacterToken(); 1018 if (!success) { 1019 ASSERT(decodedEntity.isEmpty()); 1020 m_token->appendToAttributeValue('&'); 1021 } else { 1022 for (unsigned i = 0; i < decodedEntity.length(); ++i) 1023 m_token->appendToAttributeValue(decodedEntity[i]); 1024 } 1025 // We're supposed to switch back to the attribute value state that 1026 // we were in when we were switched into this state. Rather than 1027 // keeping track of this explicitly, we observe that the previous 1028 // state can be determined by m_additionalAllowedCharacter. 
1029 if (m_additionalAllowedCharacter == '"') 1030 HTML_SWITCH_TO(AttributeValueDoubleQuotedState); 1031 else if (m_additionalAllowedCharacter == '\'') 1032 HTML_SWITCH_TO(AttributeValueSingleQuotedState); 1033 else if (m_additionalAllowedCharacter == '>') 1034 HTML_SWITCH_TO(AttributeValueUnquotedState); 1035 else 1036 ASSERT_NOT_REACHED(); 1037 } 1038 END_STATE() 1039 1040 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) { 1041 if (isTokenizerWhitespace(cc)) 1042 HTML_ADVANCE_TO(BeforeAttributeNameState); 1043 else if (cc == '/') 1044 HTML_ADVANCE_TO(SelfClosingStartTagState); 1045 else if (cc == '>') 1046 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1047 else if (m_options.usePreHTML5ParserQuirks && cc == '<') 1048 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1049 else if (cc == kEndOfFileMarker) { 1050 parseError(); 1051 HTML_RECONSUME_IN(DataState); 1052 } else { 1053 parseError(); 1054 HTML_RECONSUME_IN(BeforeAttributeNameState); 1055 } 1056 } 1057 END_STATE() 1058 1059 HTML_BEGIN_STATE(SelfClosingStartTagState) { 1060 if (cc == '>') { 1061 m_token->setSelfClosing(); 1062 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1063 } else if (cc == kEndOfFileMarker) { 1064 parseError(); 1065 HTML_RECONSUME_IN(DataState); 1066 } else { 1067 parseError(); 1068 HTML_RECONSUME_IN(BeforeAttributeNameState); 1069 } 1070 } 1071 END_STATE() 1072 1073 HTML_BEGIN_STATE(BogusCommentState) { 1074 m_token->beginComment(); 1075 HTML_RECONSUME_IN(ContinueBogusCommentState); 1076 } 1077 END_STATE() 1078 1079 HTML_BEGIN_STATE(ContinueBogusCommentState) { 1080 if (cc == '>') 1081 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1082 else if (cc == kEndOfFileMarker) 1083 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1084 else { 1085 m_token->appendToComment(cc); 1086 HTML_ADVANCE_TO(ContinueBogusCommentState); 1087 } 1088 } 1089 END_STATE() 1090 1091 HTML_BEGIN_STATE(MarkupDeclarationOpenState) { 1092 
DEFINE_STATIC_LOCAL(String, dashDashString, (ASCIILiteral("--"))); 1093 DEFINE_STATIC_LOCAL(String, doctypeString, (ASCIILiteral("doctype"))); 1094 DEFINE_STATIC_LOCAL(String, cdataString, (ASCIILiteral("[CDATA["))); 1095 if (cc == '-') { 1096 SegmentedString::LookAheadResult result = source.lookAhead(dashDashString); 1097 if (result == SegmentedString::DidMatch) { 1098 source.advanceAndASSERT('-'); 1099 source.advanceAndASSERT('-'); 1100 m_token->beginComment(); 1101 HTML_SWITCH_TO(CommentStartState); 1102 } else if (result == SegmentedString::NotEnoughCharacters) 1103 return haveBufferedCharacterToken(); 1104 } else if (cc == 'D' || cc == 'd') { 1105 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString); 1106 if (result == SegmentedString::DidMatch) { 1107 advanceStringAndASSERTIgnoringCase(source, "doctype"); 1108 HTML_SWITCH_TO(DOCTYPEState); 1109 } else if (result == SegmentedString::NotEnoughCharacters) 1110 return haveBufferedCharacterToken(); 1111 } else if (cc == '[' && shouldAllowCDATA()) { 1112 SegmentedString::LookAheadResult result = source.lookAhead(cdataString); 1113 if (result == SegmentedString::DidMatch) { 1114 advanceStringAndASSERT(source, "[CDATA["); 1115 HTML_SWITCH_TO(CDATASectionState); 1116 } else if (result == SegmentedString::NotEnoughCharacters) 1117 return haveBufferedCharacterToken(); 1118 } 1119 parseError(); 1120 HTML_RECONSUME_IN(BogusCommentState); 1121 } 1122 END_STATE() 1123 1124 HTML_BEGIN_STATE(CommentStartState) { 1125 if (cc == '-') 1126 HTML_ADVANCE_TO(CommentStartDashState); 1127 else if (cc == '>') { 1128 parseError(); 1129 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1130 } else if (cc == kEndOfFileMarker) { 1131 parseError(); 1132 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1133 } else { 1134 m_token->appendToComment(cc); 1135 HTML_ADVANCE_TO(CommentState); 1136 } 1137 } 1138 END_STATE() 1139 1140 HTML_BEGIN_STATE(CommentStartDashState) { 1141 if (cc == '-') 
1142 HTML_ADVANCE_TO(CommentEndState); 1143 else if (cc == '>') { 1144 parseError(); 1145 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1146 } else if (cc == kEndOfFileMarker) { 1147 parseError(); 1148 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1149 } else { 1150 m_token->appendToComment('-'); 1151 m_token->appendToComment(cc); 1152 HTML_ADVANCE_TO(CommentState); 1153 } 1154 } 1155 END_STATE() 1156 1157 HTML_BEGIN_STATE(CommentState) { 1158 if (cc == '-') 1159 HTML_ADVANCE_TO(CommentEndDashState); 1160 else if (cc == kEndOfFileMarker) { 1161 parseError(); 1162 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1163 } else { 1164 m_token->appendToComment(cc); 1165 HTML_ADVANCE_TO(CommentState); 1166 } 1167 } 1168 END_STATE() 1169 1170 HTML_BEGIN_STATE(CommentEndDashState) { 1171 if (cc == '-') 1172 HTML_ADVANCE_TO(CommentEndState); 1173 else if (cc == kEndOfFileMarker) { 1174 parseError(); 1175 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1176 } else { 1177 m_token->appendToComment('-'); 1178 m_token->appendToComment(cc); 1179 HTML_ADVANCE_TO(CommentState); 1180 } 1181 } 1182 END_STATE() 1183 1184 HTML_BEGIN_STATE(CommentEndState) { 1185 if (cc == '>') 1186 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1187 else if (cc == '!') { 1188 parseError(); 1189 HTML_ADVANCE_TO(CommentEndBangState); 1190 } else if (cc == '-') { 1191 parseError(); 1192 m_token->appendToComment('-'); 1193 HTML_ADVANCE_TO(CommentEndState); 1194 } else if (cc == kEndOfFileMarker) { 1195 parseError(); 1196 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1197 } else { 1198 parseError(); 1199 m_token->appendToComment('-'); 1200 m_token->appendToComment('-'); 1201 m_token->appendToComment(cc); 1202 HTML_ADVANCE_TO(CommentState); 1203 } 1204 } 1205 END_STATE() 1206 1207 HTML_BEGIN_STATE(CommentEndBangState) { 1208 if (cc == '-') { 1209 m_token->appendToComment('-'); 1210 m_token->appendToComment('-'); 1211 
m_token->appendToComment('!'); 1212 HTML_ADVANCE_TO(CommentEndDashState); 1213 } else if (cc == '>') 1214 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1215 else if (cc == kEndOfFileMarker) { 1216 parseError(); 1217 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1218 } else { 1219 m_token->appendToComment('-'); 1220 m_token->appendToComment('-'); 1221 m_token->appendToComment('!'); 1222 m_token->appendToComment(cc); 1223 HTML_ADVANCE_TO(CommentState); 1224 } 1225 } 1226 END_STATE() 1227 1228 HTML_BEGIN_STATE(DOCTYPEState) { 1229 if (isTokenizerWhitespace(cc)) 1230 HTML_ADVANCE_TO(BeforeDOCTYPENameState); 1231 else if (cc == kEndOfFileMarker) { 1232 parseError(); 1233 m_token->beginDOCTYPE(); 1234 m_token->setForceQuirks(); 1235 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1236 } else { 1237 parseError(); 1238 HTML_RECONSUME_IN(BeforeDOCTYPENameState); 1239 } 1240 } 1241 END_STATE() 1242 1243 HTML_BEGIN_STATE(BeforeDOCTYPENameState) { 1244 if (isTokenizerWhitespace(cc)) 1245 HTML_ADVANCE_TO(BeforeDOCTYPENameState); 1246 else if (isASCIIUpper(cc)) { 1247 m_token->beginDOCTYPE(toLowerCase(cc)); 1248 HTML_ADVANCE_TO(DOCTYPENameState); 1249 } else if (cc == '>') { 1250 parseError(); 1251 m_token->beginDOCTYPE(); 1252 m_token->setForceQuirks(); 1253 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1254 } else if (cc == kEndOfFileMarker) { 1255 parseError(); 1256 m_token->beginDOCTYPE(); 1257 m_token->setForceQuirks(); 1258 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1259 } else { 1260 m_token->beginDOCTYPE(cc); 1261 HTML_ADVANCE_TO(DOCTYPENameState); 1262 } 1263 } 1264 END_STATE() 1265 1266 HTML_BEGIN_STATE(DOCTYPENameState) { 1267 if (isTokenizerWhitespace(cc)) 1268 HTML_ADVANCE_TO(AfterDOCTYPENameState); 1269 else if (cc == '>') 1270 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1271 else if (isASCIIUpper(cc)) { 1272 m_token->appendToName(toLowerCase(cc)); 1273 
HTML_ADVANCE_TO(DOCTYPENameState); 1274 } else if (cc == kEndOfFileMarker) { 1275 parseError(); 1276 m_token->setForceQuirks(); 1277 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1278 } else { 1279 m_token->appendToName(cc); 1280 HTML_ADVANCE_TO(DOCTYPENameState); 1281 } 1282 } 1283 END_STATE() 1284 1285 HTML_BEGIN_STATE(AfterDOCTYPENameState) { 1286 if (isTokenizerWhitespace(cc)) 1287 HTML_ADVANCE_TO(AfterDOCTYPENameState); 1288 if (cc == '>') 1289 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1290 else if (cc == kEndOfFileMarker) { 1291 parseError(); 1292 m_token->setForceQuirks(); 1293 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1294 } else { 1295 DEFINE_STATIC_LOCAL(String, publicString, (ASCIILiteral("public"))); 1296 DEFINE_STATIC_LOCAL(String, systemString, (ASCIILiteral("system"))); 1297 if (cc == 'P' || cc == 'p') { 1298 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString); 1299 if (result == SegmentedString::DidMatch) { 1300 advanceStringAndASSERTIgnoringCase(source, "public"); 1301 HTML_SWITCH_TO(AfterDOCTYPEPublicKeywordState); 1302 } else if (result == SegmentedString::NotEnoughCharacters) 1303 return haveBufferedCharacterToken(); 1304 } else if (cc == 'S' || cc == 's') { 1305 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString); 1306 if (result == SegmentedString::DidMatch) { 1307 advanceStringAndASSERTIgnoringCase(source, "system"); 1308 HTML_SWITCH_TO(AfterDOCTYPESystemKeywordState); 1309 } else if (result == SegmentedString::NotEnoughCharacters) 1310 return haveBufferedCharacterToken(); 1311 } 1312 parseError(); 1313 m_token->setForceQuirks(); 1314 HTML_ADVANCE_TO(BogusDOCTYPEState); 1315 } 1316 } 1317 END_STATE() 1318 1319 HTML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState) { 1320 if (isTokenizerWhitespace(cc)) 1321 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState); 1322 else if (cc == '"') { 1323 parseError(); 1324 
m_token->setPublicIdentifierToEmptyString(); 1325 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1326 } else if (cc == '\'') { 1327 parseError(); 1328 m_token->setPublicIdentifierToEmptyString(); 1329 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1330 } else if (cc == '>') { 1331 parseError(); 1332 m_token->setForceQuirks(); 1333 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1334 } else if (cc == kEndOfFileMarker) { 1335 parseError(); 1336 m_token->setForceQuirks(); 1337 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1338 } else { 1339 parseError(); 1340 m_token->setForceQuirks(); 1341 HTML_ADVANCE_TO(BogusDOCTYPEState); 1342 } 1343 } 1344 END_STATE() 1345 1346 HTML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) { 1347 if (isTokenizerWhitespace(cc)) 1348 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState); 1349 else if (cc == '"') { 1350 m_token->setPublicIdentifierToEmptyString(); 1351 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1352 } else if (cc == '\'') { 1353 m_token->setPublicIdentifierToEmptyString(); 1354 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1355 } else if (cc == '>') { 1356 parseError(); 1357 m_token->setForceQuirks(); 1358 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1359 } else if (cc == kEndOfFileMarker) { 1360 parseError(); 1361 m_token->setForceQuirks(); 1362 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1363 } else { 1364 parseError(); 1365 m_token->setForceQuirks(); 1366 HTML_ADVANCE_TO(BogusDOCTYPEState); 1367 } 1368 } 1369 END_STATE() 1370 1371 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) { 1372 if (cc == '"') 1373 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState); 1374 else if (cc == '>') { 1375 parseError(); 1376 m_token->setForceQuirks(); 1377 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1378 } else if (cc == kEndOfFileMarker) { 1379 parseError(); 1380 m_token->setForceQuirks(); 1381 return 
emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1382 } else { 1383 m_token->appendToPublicIdentifier(cc); 1384 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1385 } 1386 } 1387 END_STATE() 1388 1389 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) { 1390 if (cc == '\'') 1391 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState); 1392 else if (cc == '>') { 1393 parseError(); 1394 m_token->setForceQuirks(); 1395 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1396 } else if (cc == kEndOfFileMarker) { 1397 parseError(); 1398 m_token->setForceQuirks(); 1399 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1400 } else { 1401 m_token->appendToPublicIdentifier(cc); 1402 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1403 } 1404 } 1405 END_STATE() 1406 1407 HTML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) { 1408 if (isTokenizerWhitespace(cc)) 1409 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState); 1410 else if (cc == '>') 1411 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1412 else if (cc == '"') { 1413 parseError(); 1414 m_token->setSystemIdentifierToEmptyString(); 1415 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1416 } else if (cc == '\'') { 1417 parseError(); 1418 m_token->setSystemIdentifierToEmptyString(); 1419 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1420 } else if (cc == kEndOfFileMarker) { 1421 parseError(); 1422 m_token->setForceQuirks(); 1423 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1424 } else { 1425 parseError(); 1426 m_token->setForceQuirks(); 1427 HTML_ADVANCE_TO(BogusDOCTYPEState); 1428 } 1429 } 1430 END_STATE() 1431 1432 HTML_BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) { 1433 if (isTokenizerWhitespace(cc)) 1434 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState); 1435 else if (cc == '>') 1436 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1437 else if (cc == '"') { 1438 
m_token->setSystemIdentifierToEmptyString(); 1439 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1440 } else if (cc == '\'') { 1441 m_token->setSystemIdentifierToEmptyString(); 1442 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1443 } else if (cc == kEndOfFileMarker) { 1444 parseError(); 1445 m_token->setForceQuirks(); 1446 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1447 } else { 1448 parseError(); 1449 m_token->setForceQuirks(); 1450 HTML_ADVANCE_TO(BogusDOCTYPEState); 1451 } 1452 } 1453 END_STATE() 1454 1455 HTML_BEGIN_STATE(AfterDOCTYPESystemKeywordState) { 1456 if (isTokenizerWhitespace(cc)) 1457 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState); 1458 else if (cc == '"') { 1459 parseError(); 1460 m_token->setSystemIdentifierToEmptyString(); 1461 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1462 } else if (cc == '\'') { 1463 parseError(); 1464 m_token->setSystemIdentifierToEmptyString(); 1465 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1466 } else if (cc == '>') { 1467 parseError(); 1468 m_token->setForceQuirks(); 1469 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1470 } else if (cc == kEndOfFileMarker) { 1471 parseError(); 1472 m_token->setForceQuirks(); 1473 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1474 } else { 1475 parseError(); 1476 m_token->setForceQuirks(); 1477 HTML_ADVANCE_TO(BogusDOCTYPEState); 1478 } 1479 } 1480 END_STATE() 1481 1482 HTML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) { 1483 if (isTokenizerWhitespace(cc)) 1484 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState); 1485 if (cc == '"') { 1486 m_token->setSystemIdentifierToEmptyString(); 1487 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1488 } else if (cc == '\'') { 1489 m_token->setSystemIdentifierToEmptyString(); 1490 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1491 } else if (cc == '>') { 1492 parseError(); 1493 m_token->setForceQuirks(); 1494 return 
emitAndResumeIn(source, HTMLTokenizer::DataState); 1495 } else if (cc == kEndOfFileMarker) { 1496 parseError(); 1497 m_token->setForceQuirks(); 1498 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1499 } else { 1500 parseError(); 1501 m_token->setForceQuirks(); 1502 HTML_ADVANCE_TO(BogusDOCTYPEState); 1503 } 1504 } 1505 END_STATE() 1506 1507 HTML_BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) { 1508 if (cc == '"') 1509 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1510 else if (cc == '>') { 1511 parseError(); 1512 m_token->setForceQuirks(); 1513 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1514 } else if (cc == kEndOfFileMarker) { 1515 parseError(); 1516 m_token->setForceQuirks(); 1517 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1518 } else { 1519 m_token->appendToSystemIdentifier(cc); 1520 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1521 } 1522 } 1523 END_STATE() 1524 1525 HTML_BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) { 1526 if (cc == '\'') 1527 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1528 else if (cc == '>') { 1529 parseError(); 1530 m_token->setForceQuirks(); 1531 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1532 } else if (cc == kEndOfFileMarker) { 1533 parseError(); 1534 m_token->setForceQuirks(); 1535 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1536 } else { 1537 m_token->appendToSystemIdentifier(cc); 1538 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1539 } 1540 } 1541 END_STATE() 1542 1543 HTML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState) { 1544 if (isTokenizerWhitespace(cc)) 1545 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1546 else if (cc == '>') 1547 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1548 else if (cc == kEndOfFileMarker) { 1549 parseError(); 1550 m_token->setForceQuirks(); 1551 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1552 } else { 1553 parseError(); 1554 
HTML_ADVANCE_TO(BogusDOCTYPEState); 1555 } 1556 } 1557 END_STATE() 1558 1559 HTML_BEGIN_STATE(BogusDOCTYPEState) { 1560 if (cc == '>') 1561 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1562 else if (cc == kEndOfFileMarker) 1563 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1564 HTML_ADVANCE_TO(BogusDOCTYPEState); 1565 } 1566 END_STATE() 1567 1568 HTML_BEGIN_STATE(CDATASectionState) { 1569 if (cc == ']') 1570 HTML_ADVANCE_TO(CDATASectionRightSquareBracketState); 1571 else if (cc == kEndOfFileMarker) 1572 HTML_RECONSUME_IN(DataState); 1573 else { 1574 bufferCharacter(cc); 1575 HTML_ADVANCE_TO(CDATASectionState); 1576 } 1577 } 1578 END_STATE() 1579 1580 HTML_BEGIN_STATE(CDATASectionRightSquareBracketState) { 1581 if (cc == ']') 1582 HTML_ADVANCE_TO(CDATASectionDoubleRightSquareBracketState); 1583 else { 1584 bufferCharacter(']'); 1585 HTML_RECONSUME_IN(CDATASectionState); 1586 } 1587 } 1588 1589 HTML_BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) { 1590 if (cc == '>') 1591 HTML_ADVANCE_TO(DataState); 1592 else { 1593 bufferCharacter(']'); 1594 bufferCharacter(']'); 1595 HTML_RECONSUME_IN(CDATASectionState); 1596 } 1597 } 1598 END_STATE() 1599 1600 } 1601 1602 ASSERT_NOT_REACHED(); 1603 return false; 1604} 1605 1606String HTMLTokenizer::bufferedCharacters() const 1607{ 1608 // FIXME: Add an assert about m_state. 
1609 StringBuilder characters; 1610 characters.reserveCapacity(numberOfBufferedCharacters()); 1611 characters.append('<'); 1612 characters.append('/'); 1613 characters.append(m_temporaryBuffer.data(), m_temporaryBuffer.size()); 1614 return characters.toString(); 1615} 1616 1617void HTMLTokenizer::updateStateFor(const AtomicString& tagName) 1618{ 1619 if (tagName == textareaTag || tagName == titleTag) 1620 setState(HTMLTokenizer::RCDATAState); 1621 else if (tagName == plaintextTag) 1622 setState(HTMLTokenizer::PLAINTEXTState); 1623 else if (tagName == scriptTag) 1624 setState(HTMLTokenizer::ScriptDataState); 1625 else if (tagName == styleTag 1626 || tagName == iframeTag 1627 || tagName == xmpTag 1628 || (tagName == noembedTag && m_options.pluginsEnabled) 1629 || tagName == noframesTag 1630 || (tagName == noscriptTag && m_options.scriptEnabled)) 1631 setState(HTMLTokenizer::RAWTEXTState); 1632} 1633 1634inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString) 1635{ 1636 return vectorEqualsString(m_temporaryBuffer, expectedString); 1637} 1638 1639inline void HTMLTokenizer::addToPossibleEndTag(LChar cc) 1640{ 1641 ASSERT(isEndTagBufferingState(m_state)); 1642 m_bufferedEndTagName.append(cc); 1643} 1644 1645inline bool HTMLTokenizer::isAppropriateEndTag() 1646{ 1647 if (m_bufferedEndTagName.size() != m_appropriateEndTagName.size()) 1648 return false; 1649 1650 size_t numCharacters = m_bufferedEndTagName.size(); 1651 1652 for (size_t i = 0; i < numCharacters; i++) { 1653 if (m_bufferedEndTagName[i] != m_appropriateEndTagName[i]) 1654 return false; 1655 } 1656 1657 return true; 1658} 1659 1660inline void HTMLTokenizer::parseError() 1661{ 1662 notImplemented(); 1663} 1664 1665} 1666